/*-
 * Copyright 1998, 2000 Marshall Kirk McKusick.
 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
 * All rights reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * Further information about soft updates can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 241011 2012-09-27 23:30:49Z mdf $");

#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_ddb.h"

/*
 * For now we want the safety net that the DEBUG flag provides.
 */
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kdb.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>

#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>

#include <ddb/ddb.h>

#ifndef SOFTUPDATES

int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{

	panic("softdep_flushfiles called");
}

int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{

	return (0);
}

void
softdep_initialize()
{

	return;
}

void
softdep_uninitialize()
{

	return;
}

void
softdep_unmount(mp)
	struct mount *mp;
{

}

void
softdep_setup_sbupdate(ump, fs, bp)
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
{
}

void
softdep_setup_inomapdep(bp, ip, newinum, mode)
	struct buf *bp;
	struct inode *ip;
	ino_t newinum;
	int mode;
{

	panic("softdep_setup_inomapdep called");
}

void
softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
	struct buf *bp;
	struct mount *mp;
	ufs2_daddr_t newblkno;
	int frags;
	int oldfrags;
{

	panic("softdep_setup_blkmapdep called");
}

void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocdirect called");
}

void
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocext called");
}

void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;
	ufs_lbn_t lbn;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	struct buf *nbp;
{

	panic("softdep_setup_allocindir_page called");
}

void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;
	struct inode *ip;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
{

	panic("softdep_setup_allocindir_meta called");
}

void
softdep_journal_freeblocks(ip, cred, length, flags)
	struct inode *ip;
	struct ucred *cred;
	off_t length;
	int flags;
{

	panic("softdep_journal_freeblocks called");
}

void
softdep_journal_fsync(ip)
	struct inode *ip;
{

	panic("softdep_journal_fsync called");
}

void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;
	off_t length;
	int flags;
{

	panic("softdep_setup_freeblocks called");
}

void
softdep_freefile(pvp, ino, mode)
		struct vnode *pvp;
		ino_t ino;
		int mode;
{

	panic("softdep_freefile called");
}

int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;
	struct inode *dp;
	off_t diroffset;
	ino_t newinum;
	struct buf *newdirbp;
	int isnewblk;
{

	panic("softdep_setup_directory_add called");
}

void
softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
	struct buf *bp;
	struct inode *dp;
	caddr_t base;
	caddr_t oldloc;
	caddr_t newloc;
	int entrysize;
{

	panic("softdep_change_directoryentry_offset called");
}

void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	int isrmdir;
{

	panic("softdep_setup_remove called");
}

void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	ino_t newinum;
	int isrmdir;
{

	panic("softdep_setup_directory_change called");
}

void
softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
	struct mount *mp;
	struct buf *bp;
	ufs2_daddr_t blkno;
	int frags;
	struct workhead *wkhd;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_inofree(mp, bp, ino, wkhd)
	struct mount *mp;
	struct buf *bp;
	ino_t ino;
	struct workhead *wkhd;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_unlink(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_create(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_create(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_mkdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_mkdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_dotdot_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

int
softdep_prealloc(vp, waitok)
	struct vnode *vp;
	int waitok;
{

	panic("%s called", __FUNCTION__);

	return (0);
}

int
softdep_journal_lookup(mp, vpp)
	struct mount *mp;
	struct vnode **vpp;
{

	return (ENOENT);
}

void
softdep_change_linkcnt(ip)
	struct inode *ip;
{

	panic("softdep_change_linkcnt called");
}

void
softdep_load_inodeblock(ip)
	struct inode *ip;
{

	panic("softdep_load_inodeblock called");
}

void
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;
	struct buf *bp;
	int waitfor;
{

	panic("softdep_update_inodeblock called");
}

int
softdep_fsync(vp)
	struct vnode *vp;	/* the "in_core" copy of the inode */
{

	return (0);
}

void
softdep_fsync_mountdev(vp)
	struct vnode *vp;
{

	return;
}

int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{

	*countp = 0;
	return (0);
}

int
softdep_sync_metadata(struct vnode *vp)
{

	return (0);
}

int
softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
{

	return (0);
}

int
softdep_slowdown(vp)
	struct vnode *vp;
{

	panic("softdep_slowdown called");
}

void
softdep_releasefile(ip)
	struct inode *ip;	/* inode with the zero effective link count */
{

	panic("softdep_releasefile called");
}

int
softdep_request_cleanup(fs, vp, cred, resource)
	struct fs *fs;
	struct vnode *vp;
	struct ucred *cred;
	int resource;
{

	return (0);
}

int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	int error;

	(void) softdep_deps,
	(void) softdep_accdeps;

	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	MNT_ILOCK(mp);
	while (mp->mnt_secondary_writes != 0) {
		BO_UNLOCK(bo);
		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
		    (PUSER - 1) | PDROP, "secwr", 0);
		BO_LOCK(bo);
		MNT_ILOCK(mp);
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	BO_UNLOCK(bo);
	return (error);
}

void
softdep_get_depcounts(struct mount *mp,
		      int *softdepactivep,
		      int *softdepactiveaccp)
{
	(void) mp;
	*softdepactivep = 0;
	*softdepactiveaccp = 0;
}

void
softdep_buf_append(bp, wkhd)
	struct buf *bp;
	struct workhead *wkhd;
{

	panic("softdep_buf_appendwork called");
}

void
softdep_inode_append(ip, cred, wkhd)
	struct inode *ip;
	struct ucred *cred;
	struct workhead *wkhd;
{

	panic("softdep_inode_appendwork called");
}

void
softdep_freework(wkhd)
	struct workhead *wkhd;
{

	panic("softdep_freework called");
}

#else

FEATURE(softupdates, "FFS soft-updates support");

/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */

#define M_SOFTDEP_FLAGS	(M_WAITOK)

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_BMSAFEMAP	2
#define	D_NEWBLK	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define	D_NEWDIRBLK	13
#define	D_FREEWORK	14
#define	D_FREEDEP	15
#define	D_JADDREF	16
#define	D_JREMREF	17
#define	D_JMVREF	18
#define	D_JNEWBLK	19
#define	D_JFREEBLK	20
#define	D_JFREEFRAG	21
#define	D_JSEG		22
#define	D_JSEGDEP	23
#define	D_SBDEP		24
#define	D_JTRUNC	25
#define	D_JFSYNC	26
#define	D_SENTINAL	27
#define	D_LAST		D_SENTINAL

unsigned long dep_current[D_LAST + 1];
unsigned long dep_total[D_LAST + 1];
unsigned long dep_write[D_LAST + 1];


static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
    "soft updates stats");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
    "total dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
    "current dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
    "current dependencies written");

#define	SOFTDEP_TYPE(type, str, long)					\
    static MALLOC_DEFINE(M_ ## type, #str, long);			\
    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
	&dep_total[D_ ## type], 0, "");					\
    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
	&dep_current[D_ ## type], 0, "");				\
    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
	&dep_write[D_ ## type], 0, "");
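/*
 * For example, SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies")
 * below expands to MALLOC_DEFINE(M_PAGEDEP, "pagedep", ...) plus three
 * read-only sysctls exporting dep_total[D_PAGEDEP], dep_current[D_PAGEDEP]
 * and dep_write[D_PAGEDEP] under debug.softdep.{total,current,write}.
 */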

SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
    "Block or frag allocated from cyl group map");
SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");

static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_BMSAFEMAP,
	M_NEWBLK,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM,
	M_NEWDIRBLK,
	M_FREEWORK,
	M_FREEDEP,
	M_JADDREF,
	M_JREMREF,
	M_JMVREF,
	M_JNEWBLK,
	M_JFREEBLK,
	M_JFREEFRAG,
	M_JSEG,
	M_JSEGDEP,
	M_SBDEP,
	M_JTRUNC,
	M_JFSYNC
};

static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;

#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 */
#define TYPENAME(type)  \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
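/*
 * Since each malloc type's short description above matches its workitem
 * name, TYPENAME(D_PAGEDEP) evaluates to "pagedep"; out-of-range type
 * values yield "???".
 */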
/*
 * End system adaptation definitions.
 */

#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)

/*
 * Forward declarations.
 */
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;
struct bmsafemap_hashhead;

/*
 * Internal function prototypes.
 */
static	void softdep_error(char *, int);
static	void drain_output(struct vnode *);
static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static	void clear_remove(void);
static	void clear_inodedeps(void);
static	void unlinked_inodedep(struct mount *, struct inodedep *);
static	void clear_unlinked_inodedep(struct inodedep *);
static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static	int flush_pagedep_deps(struct vnode *, struct mount *,
	    struct diraddhd *);
static	int free_pagedep(struct pagedep *);
static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
static	int flush_deplist(struct allocdirectlst *, int, int *);
static	int sync_cgs(struct mount *, int);
static	int handle_written_filepage(struct pagedep *, struct buf *);
static	int handle_written_sbdep(struct sbdep *, struct buf *);
static	void initiate_write_sbdep(struct sbdep *);
static  void diradd_inode_written(struct diradd *, struct inodedep *);
static	int handle_written_indirdep(struct indirdep *, struct buf *,
	    struct buf**);
static	int handle_written_inodeblock(struct inodedep *, struct buf *);
static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
	    uint8_t *);
static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
static	void handle_written_jaddref(struct jaddref *);
static	void handle_written_jremref(struct jremref *);
static	void handle_written_jseg(struct jseg *, struct buf *);
static	void handle_written_jnewblk(struct jnewblk *);
static	void handle_written_jblkdep(struct jblkdep *);
static	void handle_written_jfreefrag(struct jfreefrag *);
static	void complete_jseg(struct jseg *);
static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
static	inline void inoref_write(struct inoref *, struct jseg *,
	    struct jrefrec *);
static	void handle_allocdirect_partdone(struct allocdirect *,
	    struct workhead *);
static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
	    struct workhead *);
static	void indirdep_complete(struct indirdep *);
static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
static	void indirblk_insert(struct freework *);
static	void indirblk_remove(struct freework *);
static	void handle_allocindir_partdone(struct allocindir *);
static	void initiate_write_filepage(struct pagedep *, struct buf *);
static	void initiate_write_indirdep(struct indirdep*, struct buf *);
static	void handle_written_mkdir(struct mkdir *, int);
static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
	    uint8_t *);
static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static	void handle_workitem_freefile(struct freefile *);
static	int handle_workitem_remove(struct dirrem *, int);
static	struct dirrem *newdirrem(struct buf *, struct inode *,
	    struct inode *, int, struct dirrem **);
static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
	    struct buf *);
static	void cancel_indirdep(struct indirdep *, struct buf *,
	    struct freeblks *);
static	void free_indirdep(struct indirdep *);
static	void free_diradd(struct diradd *, struct workhead *);
static	void merge_diradd(struct inodedep *, struct diradd *);
static	void complete_diradd(struct diradd *);
static	struct diradd *diradd_lookup(struct pagedep *, int);
static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
	    struct jremref *);
static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
	    struct jremref *);
static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
	    struct jremref *, struct jremref *);
static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
	    struct jremref *);
static	void cancel_allocindir(struct allocindir *, struct buf *bp,
	    struct freeblks *, int);
static	int setup_trunc_indir(struct freeblks *, struct inode *,
	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
static	void complete_trunc_indir(struct freework *);
static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
	    int);
static	void complete_mkdir(struct mkdir *);
static	void free_newdirblk(struct newdirblk *);
static	void free_jremref(struct jremref *);
static	void free_jaddref(struct jaddref *);
static	void free_jsegdep(struct jsegdep *);
static	void free_jsegs(struct jblocks *);
static	void rele_jseg(struct jseg *);
static	void free_jseg(struct jseg *, struct jblocks *);
static	void free_jnewblk(struct jnewblk *);
static	void free_jblkdep(struct jblkdep *);
static	void free_jfreefrag(struct jfreefrag *);
static	void free_freedep(struct freedep *);
static	void journal_jremref(struct dirrem *, struct jremref *,
	    struct inodedep *);
static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
static	int cancel_jaddref(struct jaddref *, struct inodedep *,
	    struct workhead *);
static	void cancel_jfreefrag(struct jfreefrag *);
static	inline void setup_freedirect(struct freeblks *, struct inode *,
	    int, int);
static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
	    ufs_lbn_t, int);
static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
	    int, int);
static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
static	void newblk_freefrag(struct newblk*);
static	void free_newblk(struct newblk *);
static	void cancel_allocdirect(struct allocdirectlst *,
	    struct allocdirect *, struct freeblks *);
static	int check_inode_unwritten(struct inodedep *);
static	int free_inodedep(struct inodedep *);
static	void freework_freeblock(struct freework *);
static	void freework_enqueue(struct freework *);
static	int handle_workitem_freeblocks(struct freeblks *, int);
static	int handle_complete_freeblocks(struct freeblks *, int);
static	void handle_workitem_indirblk(struct freework *);
static	void handle_written_freework(struct freework *);
static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
	    struct workhead *);
static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
	    struct inodedep *, struct allocindir *, ufs_lbn_t);
static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
	    ufs2_daddr_t, ufs_lbn_t);
static	void handle_workitem_freefrag(struct freefrag *);
static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
	    ufs_lbn_t);
static	void allocdirect_merge(struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *);
static	struct freefrag *allocindir_merge(struct allocindir *,
	    struct allocindir *);
static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
	    struct bmsafemap **);
static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
	    int cg, struct bmsafemap *);
static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
	    int, struct newblk **);
static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
	    struct inodedep **);
static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
	    int, struct pagedep **);
static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
	    struct mount *mp, int, struct pagedep **);
static	void pause_timer(void *);
static	int request_cleanup(struct mount *, int);
static	int process_worklist_item(struct mount *, int, int);
static	void process_removes(struct vnode *);
static	void process_truncates(struct vnode *);
static	void jwork_move(struct workhead *, struct workhead *);
static	void jwork_insert(struct workhead *, struct jsegdep *);
static	void add_to_worklist(struct worklist *, int);
static	void wake_worklist(struct worklist *);
static	void wait_worklist(struct worklist *, char *);
static	void remove_from_worklist(struct worklist *);
static	void softdep_flush(void);
static	void softdep_flushjournal(struct mount *);
static	int softdep_speedup(void);
static	void worklist_speedup(void);
static	int journal_mount(struct mount *, struct fs *, struct ucred *);
static	void journal_unmount(struct mount *);
static	int journal_space(struct ufsmount *, int);
static	void journal_suspend(struct ufsmount *);
static	int journal_unsuspend(struct ufsmount *ump);
static	void softdep_prelink(struct vnode *, struct vnode *);
static	void add_to_journal(struct worklist *);
static	void remove_from_journal(struct worklist *);
static	void softdep_process_journal(struct mount *, struct worklist *, int);
static	struct jremref *newjremref(struct dirrem *, struct inode *,
	    struct inode *ip, off_t, nlink_t);
static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
	    uint16_t);
static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
	    uint16_t);
static	inline struct jsegdep *inoref_jseg(struct inoref *);
static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
	    ufs2_daddr_t, int);
static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
static	void move_newblock_dep(struct jaddref *, struct inodedep *);
static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
	    ufs2_daddr_t, long, ufs_lbn_t);
static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
static	int jwait(struct worklist *, int);
static	struct inodedep *inodedep_lookup_ip(struct inode *);
static	int bmsafemap_rollbacks(struct bmsafemap *);
static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
static	void handle_jwork(struct workhead *);
static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
	    struct mkdir **);
static	struct jblocks *jblocks_create(void);
static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
static	void jblocks_free(struct jblocks *, struct mount *, int);
static	void jblocks_destroy(struct jblocks *);
static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);

/*
 * Exported softdep operations.
 */
static	void softdep_disk_io_initiation(struct buf *);
static	void softdep_disk_write_complete(struct buf *);
static	void softdep_deallocate_dependencies(struct buf *);
static	int softdep_count_dependencies(struct buf *bp, int);

static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);

#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
#define FREE_LOCK(lk)			mtx_unlock(lk)
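/*
 * Dependency manipulation below is bracketed as
 * ACQUIRE_LOCK(&lk); ... FREE_LOCK(&lk); on the single global softdep
 * mutex declared above (see workitem_alloc() for a typical use);
 * TRY_ACQUIRE_LOCK() is the non-blocking variant.
 */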

#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE

#else /* DEBUG */
static	void worklist_insert(struct workhead *, struct worklist *, int);
static	void worklist_remove(struct worklist *, int);

#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)

static void
worklist_insert(head, item, locked)
	struct workhead *head;
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: %p %s(0x%X) already on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item, locked)
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: %p %s(0x%X) not on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */

/*
 * Merge two jsegdeps keeping only the oldest one as newer references
 * can't be discarded until after older references.
 */
static inline struct jsegdep *
jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
{
	struct jsegdep *swp;

	if (two == NULL)
		return (one);

	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
		swp = one;
		one = two;
		two = swp;
	}
	WORKLIST_REMOVE(&two->jd_list);
	free_jsegdep(two);

	return (one);
}

/*
 * If two freedeps are compatible free one to reduce list size.
 */
static inline struct freedep *
freedep_merge(struct freedep *one, struct freedep *two)
{
	if (two == NULL)
		return (one);

	if (one->fd_freework == two->fd_freework) {
		WORKLIST_REMOVE(&two->fd_list);
		free_freedep(two);
	}
	return (one);
}

/*
 * Move journal work from one list to another.  Duplicate freedeps and
 * jsegdeps are coalesced to keep the lists as small as possible.
 */
static void
jwork_move(dst, src)
	struct workhead *dst;
	struct workhead *src;
{
	struct freedep *freedep;
	struct jsegdep *jsegdep;
	struct worklist *wkn;
	struct worklist *wk;

	KASSERT(dst != src,
	    ("jwork_move: dst == src"));
	freedep = NULL;
	jsegdep = NULL;
	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
		if (wk->wk_type == D_JSEGDEP)
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}

	mtx_assert(&lk, MA_OWNED);
	while ((wk = LIST_FIRST(src)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(dst, wk);
		if (wk->wk_type == D_JSEGDEP) {
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
			continue;
		}
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}
}

static void
jwork_insert(dst, jsegdep)
	struct workhead *dst;
	struct jsegdep *jsegdep;
{
	struct jsegdep *jsegdepn;
	struct worklist *wk;

	LIST_FOREACH(wk, dst, wk_list)
		if (wk->wk_type == D_JSEGDEP)
			break;
	if (wk == NULL) {
		WORKLIST_INSERT(dst, &jsegdep->jd_list);
		return;
	}
	jsegdepn = WK_JSEGDEP(wk);
	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
		WORKLIST_REMOVE(wk);
		free_jsegdep(jsegdepn);
		WORKLIST_INSERT(dst, &jsegdep->jd_list);
	} else
		free_jsegdep(jsegdep);
}

/*
 * Routines for tracking and managing workitems.
 */
static	void workitem_free(struct worklist *, int);
static	void workitem_alloc(struct worklist *, int, struct mount *);

#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
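/*
 * The cast in WORKITEM_FREE() relies on each dependency structure
 * (presumably declared in softdep.h) beginning with an embedded
 * struct worklist, so any of them may be passed here.
 */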
1171221167Sgnn
1172221167Sgnnstatic void
1173221167Sgnnworkitem_free(item, type)
1174221167Sgnn	struct worklist *item;
1175221167Sgnn	int type;
1176221167Sgnn{
1177221167Sgnn	struct ufsmount *ump;
1178221167Sgnn	mtx_assert(&lk, MA_OWNED);
1179221167Sgnn
1180221167Sgnn#ifdef DEBUG
1181221167Sgnn	if (item->wk_state & ONWORKLIST)
1182221167Sgnn		panic("workitem_free: %s(0x%X) still on list",
1183221167Sgnn		    TYPENAME(item->wk_type), item->wk_state);
1184221167Sgnn	if (item->wk_type != type)
1185221167Sgnn		panic("workitem_free: type mismatch %s != %s",
1186221167Sgnn		    TYPENAME(item->wk_type), TYPENAME(type));
1187221167Sgnn#endif
1188221167Sgnn	if (item->wk_state & IOWAITING)
1189221167Sgnn		wakeup(item);
1190221167Sgnn	ump = VFSTOUFS(item->wk_mp);
1191221167Sgnn	if (--ump->softdep_deps == 0 && ump->softdep_req)
1192221167Sgnn		wakeup(&ump->softdep_deps);
1193221167Sgnn	dep_current[type]--;
1194221167Sgnn	free(item, DtoM(type));
1195221167Sgnn}
1196221167Sgnn
1197221167Sgnnstatic void
1198221167Sgnnworkitem_alloc(item, type, mp)
1199221167Sgnn	struct worklist *item;
1200221167Sgnn	int type;
1201221167Sgnn	struct mount *mp;
1202221167Sgnn{
1203221167Sgnn	struct ufsmount *ump;
1204221167Sgnn
1205221167Sgnn	item->wk_type = type;
1206221167Sgnn	item->wk_mp = mp;
1207221167Sgnn	item->wk_state = 0;
1208221167Sgnn
1209221167Sgnn	ump = VFSTOUFS(mp);
1210221167Sgnn	ACQUIRE_LOCK(&lk);
1211221167Sgnn	dep_current[type]++;
1212221167Sgnn	dep_total[type]++;
1213221167Sgnn	ump->softdep_deps++;
1214221167Sgnn	ump->softdep_accdeps++;
1215221167Sgnn	FREE_LOCK(&lk);
1216221167Sgnn}
1217221167Sgnn
1218221167Sgnn/*
1219221167Sgnn * Workitem queue management
1220221167Sgnn */
1221221167Sgnnstatic int max_softdeps;	/* maximum number of structs before slowdown */
1222221167Sgnnstatic int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1223221167Sgnnstatic int tickdelay = 2;	/* number of ticks to pause during slowdown */
1224221167Sgnnstatic int proc_waiting;	/* tracks whether we have a timeout posted */
1225221167Sgnnstatic int *stat_countp;	/* statistic to count in proc_waiting timeout */
1226221167Sgnnstatic struct callout softdep_callout;
1227221167Sgnnstatic int req_pending;
1228221167Sgnnstatic int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1229221167Sgnnstatic int req_clear_remove;	/* syncer process flush some freeblks */
1230221167Sgnn
1231221167Sgnn/*
1232221167Sgnn * runtime statistics
1233221167Sgnn */
1234221167Sgnnstatic int stat_worklist_push;	/* number of worklist cleanups */
1235221167Sgnnstatic int stat_blk_limit_push;	/* number of times block limit neared */
1236221167Sgnnstatic int stat_ino_limit_push;	/* number of times inode limit neared */
1237221167Sgnnstatic int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1238221167Sgnnstatic int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1239221167Sgnnstatic int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1240221167Sgnnstatic int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1241221167Sgnnstatic int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1242221167Sgnnstatic int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1243221167Sgnnstatic int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1244221167Sgnnstatic int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1245221167Sgnnstatic int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1246221167Sgnnstatic int stat_journal_min;	/* Times hit journal min threshold */
1247221167Sgnnstatic int stat_journal_low;	/* Times hit journal low threshold */
1248221167Sgnnstatic int stat_journal_wait;	/* Times blocked in jwait(). */
1249221167Sgnnstatic int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1250221167Sgnnstatic int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1251221167Sgnnstatic int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1252221167Sgnnstatic int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1253221167Sgnnstatic int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1254221167Sgnnstatic int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1255221167Sgnnstatic int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1256221167Sgnnstatic int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1257221167Sgnnstatic int stat_cleanup_failures; /* Number of cleanup requests that failed */
1258221167Sgnn
1259221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1260221167Sgnn    &max_softdeps, 0, "");
1261221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1262221167Sgnn    &tickdelay, 0, "");
1263221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1264221167Sgnn    &maxindirdeps, 0, "");
1265221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1266221167Sgnn    &stat_worklist_push, 0,"");
1267221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1268221167Sgnn    &stat_blk_limit_push, 0,"");
1269221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1270221167Sgnn    &stat_ino_limit_push, 0,"");
1271221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1272221167Sgnn    &stat_blk_limit_hit, 0, "");
1273221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1274221167Sgnn    &stat_ino_limit_hit, 0, "");
1275221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1276221167Sgnn    &stat_sync_limit_hit, 0, "");
1277221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1278221167Sgnn    &stat_indir_blk_ptrs, 0, "");
1279221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1280221167Sgnn    &stat_inode_bitmap, 0, "");
1281221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1282221167Sgnn    &stat_direct_blk_ptrs, 0, "");
1283221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1284221167Sgnn    &stat_dir_entry, 0, "");
1285221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1286221167Sgnn    &stat_jaddref, 0, "");
1287221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1288221167Sgnn    &stat_jnewblk, 0, "");
1289221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1290221167Sgnn    &stat_journal_low, 0, "");
1291221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1292221167Sgnn    &stat_journal_min, 0, "");
1293221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1294221167Sgnn    &stat_journal_wait, 0, "");
1295221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1296221167Sgnn    &stat_jwait_filepage, 0, "");
1297221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1298221167Sgnn    &stat_jwait_freeblks, 0, "");
1299221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1300221167Sgnn    &stat_jwait_inode, 0, "");
1301221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1302221167Sgnn    &stat_jwait_newblk, 0, "");
1303221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1304221167Sgnn    &stat_cleanup_blkrequests, 0, "");
1305221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1306221167Sgnn    &stat_cleanup_inorequests, 0, "");
1307221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1308221167Sgnn    &stat_cleanup_high_delay, 0, "");
1309221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1310221167Sgnn    &stat_cleanup_retries, 0, "");
1311221167SgnnSYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1312221167Sgnn    &stat_cleanup_failures, 0, "");
1313221167Sgnn
1314221167SgnnSYSCTL_DECL(_vfs_ffs);
1315221167Sgnn
1316221167SgnnLIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1317221167Sgnnstatic u_long	bmsafemap_hash;	/* size of hash table - 1 */
1318221167Sgnn
1319221167Sgnnstatic int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1320221167SgnnSYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1321221167Sgnn	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1322221167Sgnn
1323221167Sgnnstatic struct proc *softdepproc;
1324221167Sgnnstatic struct kproc_desc softdep_kp = {
1325221167Sgnn	"softdepflush",
1326221167Sgnn	softdep_flush,
1327221167Sgnn	&softdepproc
1328221167Sgnn};
1329221167SgnnSYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1330221167Sgnn    &softdep_kp);
1331221167Sgnn
1332221167Sgnnstatic void
1333221167Sgnnsoftdep_flush(void)
1334221167Sgnn{
1335221167Sgnn	struct mount *nmp;
1336221167Sgnn	struct mount *mp;
1337221167Sgnn	struct ufsmount *ump;
1338221167Sgnn	struct thread *td;
1339221167Sgnn	int remaining;
1340221167Sgnn	int progress;
1341221167Sgnn	int vfslocked;
1342221167Sgnn
1343221167Sgnn	td = curthread;
1344221167Sgnn	td->td_pflags |= TDP_NORUNNINGBUF;
1345221167Sgnn
1346221167Sgnn	for (;;) {
1347221167Sgnn		kproc_suspend_check(softdepproc);
1348221167Sgnn		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
1349221167Sgnn		ACQUIRE_LOCK(&lk);
1350221167Sgnn		/*
1351221167Sgnn		 * If requested, try removing inode or removal dependencies.
1352221167Sgnn		 */
1353221167Sgnn		if (req_clear_inodedeps) {
1354221167Sgnn			clear_inodedeps();
1355221167Sgnn			req_clear_inodedeps -= 1;
1356221167Sgnn			wakeup_one(&proc_waiting);
1357221167Sgnn		}
1358221167Sgnn		if (req_clear_remove) {
1359221167Sgnn			clear_remove();
1360221167Sgnn			req_clear_remove -= 1;
1361221167Sgnn			wakeup_one(&proc_waiting);
1362221167Sgnn		}
1363221167Sgnn		FREE_LOCK(&lk);
1364221167Sgnn		VFS_UNLOCK_GIANT(vfslocked);
1365221167Sgnn		remaining = progress = 0;
1366221167Sgnn		mtx_lock(&mountlist_mtx);
1367221167Sgnn		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1368221167Sgnn			nmp = TAILQ_NEXT(mp, mnt_list);
1369221167Sgnn			if (MOUNTEDSOFTDEP(mp) == 0)
1370221167Sgnn				continue;
1371221167Sgnn			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1372221167Sgnn				continue;
1373221167Sgnn			vfslocked = VFS_LOCK_GIANT(mp);
1374221167Sgnn			progress += softdep_process_worklist(mp, 0);
1375221167Sgnn			ump = VFSTOUFS(mp);
1376221167Sgnn			remaining += ump->softdep_on_worklist;
1377221167Sgnn			VFS_UNLOCK_GIANT(vfslocked);
1378221167Sgnn			mtx_lock(&mountlist_mtx);
1379221167Sgnn			nmp = TAILQ_NEXT(mp, mnt_list);
1380221167Sgnn			vfs_unbusy(mp);
1381221167Sgnn		}
1382221167Sgnn		mtx_unlock(&mountlist_mtx);
1383221167Sgnn		if (remaining && progress)
1384221167Sgnn			continue;
1385221167Sgnn		ACQUIRE_LOCK(&lk);
1386221167Sgnn		if (!req_pending)
1387221167Sgnn			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1388221167Sgnn		req_pending = 0;
1389221167Sgnn		FREE_LOCK(&lk);
1390221167Sgnn	}
1391221167Sgnn}
1392221167Sgnn
1393221167Sgnnstatic void
1394221167Sgnnworklist_speedup(void)
1395221167Sgnn{
1396221167Sgnn	mtx_assert(&lk, MA_OWNED);
1397221167Sgnn	if (req_pending == 0) {
1398221167Sgnn		req_pending = 1;
1399221167Sgnn		wakeup(&req_pending);
1400221167Sgnn	}
1401221167Sgnn}
1402221167Sgnn
1403221167Sgnnstatic int
1404221167Sgnnsoftdep_speedup(void)
1405221167Sgnn{
1406221167Sgnn
1407221167Sgnn	worklist_speedup();
1408221167Sgnn	bd_speedup();
1409221167Sgnn	return speedup_syncer();
1410221167Sgnn}
1411221167Sgnn
1412221167Sgnn/*
1413221167Sgnn * Add an item to the end of the work queue.
1414221167Sgnn * This routine requires that the lock be held.
1415221167Sgnn * This is the only routine that adds items to the list.
1416221167Sgnn * The following routine is the only one that removes items
1417221167Sgnn * and does so in order from first to last.
1418221167Sgnn */
1419221167Sgnn
1420221167Sgnn#define	WK_HEAD		0x0001	/* Add to HEAD. */
1421221167Sgnn#define	WK_NODELAY	0x0002	/* Process immediately. */
1422221167Sgnn
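/*
 * Example (illustrative sketch): a caller that has completed a dependency
 * queues it while holding the softdep lock.  WK_NODELAY also pokes the
 * flush thread so the item is handled promptly; WK_HEAD puts an item at
 * the front, as done when an item must be retried (see
 * process_worklist_item()).  The dirrem below stands in for any worklist
 * item type.
 *
 *	ACQUIRE_LOCK(&lk);
 *	add_to_worklist(&dirrem->dm_list, WK_NODELAY);
 *	FREE_LOCK(&lk);
 */
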
1423221167Sgnnstatic void
1424221167Sgnnadd_to_worklist(wk, flags)
1425221167Sgnn	struct worklist *wk;
1426221167Sgnn	int flags;
1427221167Sgnn{
1428221167Sgnn	struct ufsmount *ump;
1429221167Sgnn
1430221167Sgnn	mtx_assert(&lk, MA_OWNED);
1431221167Sgnn	ump = VFSTOUFS(wk->wk_mp);
1432221167Sgnn	if (wk->wk_state & ONWORKLIST)
1433221167Sgnn		panic("add_to_worklist: %s(0x%X) already on list",
1434221167Sgnn		    TYPENAME(wk->wk_type), wk->wk_state);
1435221167Sgnn	wk->wk_state |= ONWORKLIST;
1436221167Sgnn	if (ump->softdep_on_worklist == 0) {
1437221167Sgnn		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1438221167Sgnn		ump->softdep_worklist_tail = wk;
1439221167Sgnn	} else if (flags & WK_HEAD) {
1440221167Sgnn		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1441221167Sgnn	} else {
1442221167Sgnn		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1443221167Sgnn		ump->softdep_worklist_tail = wk;
1444221167Sgnn	}
1445221167Sgnn	ump->softdep_on_worklist += 1;
1446221167Sgnn	if (flags & WK_NODELAY)
1447221167Sgnn		worklist_speedup();
1448221167Sgnn}
1449221167Sgnn
1450221167Sgnn/*
1451221167Sgnn * Remove the item to be processed. If we are removing the last
1452221167Sgnn * item on the list, we need to recalculate the tail pointer.
1453221167Sgnn */
1454221167Sgnnstatic void
1455221167Sgnnremove_from_worklist(wk)
1456221167Sgnn	struct worklist *wk;
1457221167Sgnn{
1458221167Sgnn	struct ufsmount *ump;
1459221167Sgnn
1460221167Sgnn	ump = VFSTOUFS(wk->wk_mp);
1461221167Sgnn	WORKLIST_REMOVE(wk);
1462221167Sgnn	if (ump->softdep_worklist_tail == wk)
1463221167Sgnn		ump->softdep_worklist_tail =
1464221167Sgnn		    (struct worklist *)wk->wk_list.le_prev;
1465221167Sgnn	ump->softdep_on_worklist -= 1;
1466221167Sgnn}
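
/*
 * A note on the cast above (sketch of the layout assumption): le_prev in
 * a LIST points at the previous element's le_next field, and the worklist
 * linkage is required to be the first member of each dependency structure,
 * so that address is also the address of the previous item and can serve
 * as the new tail.  If the list becomes empty the computed tail is
 * meaningless, but add_to_worklist() reinitializes it when
 * softdep_on_worklist is zero.  Illustrative layout (dirrem chosen only
 * as an example):
 *
 *	struct dirrem {
 *		struct worklist dm_list;	(linkage must come first)
 *		...
 *	};
 */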
1467221167Sgnn
1468221167Sgnnstatic void
1469221167Sgnnwake_worklist(wk)
1470221167Sgnn	struct worklist *wk;
1471221167Sgnn{
1472221167Sgnn	if (wk->wk_state & IOWAITING) {
1473221167Sgnn		wk->wk_state &= ~IOWAITING;
1474221167Sgnn		wakeup(wk);
1475221167Sgnn	}
1476221167Sgnn}
1477221167Sgnn
1478221167Sgnnstatic void
1479221167Sgnnwait_worklist(wk, wmesg)
1480221167Sgnn	struct worklist *wk;
1481221167Sgnn	char *wmesg;
1482221167Sgnn{
1483221167Sgnn
1484221167Sgnn	wk->wk_state |= IOWAITING;
1485221167Sgnn	msleep(wk, &lk, PVM, wmesg, 0);
1486221167Sgnn}
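
/*
 * Example (illustrative sketch) of the IOWAITING handshake these helpers
 * implement.  A thread that finds an item busy sleeps on it, and the
 * worker wakes any waiter once the item is done; &lk is held on both
 * sides.  The "sdwait" wmesg is only a placeholder; callers in this file
 * use strings such as "pwrwait" and "ptrwait".
 *
 *	if (wk->wk_state & INPROGRESS)		(waiting side)
 *		wait_worklist(wk, "sdwait");
 *
 *	wk->wk_state &= ~INPROGRESS;		(worker side, when done)
 *	wake_worklist(wk);
 */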
1487221167Sgnn
1488221167Sgnn/*
1489221167Sgnn * Process that runs once per second to handle items in the background queue.
1490221167Sgnn *
1491221167Sgnn * Note that we ensure that items are handled in the order in which they
1492221167Sgnn * appear in the queue. The code below depends on this property to ensure
1493221167Sgnn * that blocks of a file are freed before the inode itself is freed. This
1494221167Sgnn * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1495221167Sgnn * until all the old ones have been purged from the dependency lists.
1496221167Sgnn */
1497221167Sgnnint
1498221167Sgnnsoftdep_process_worklist(mp, full)
1499221167Sgnn	struct mount *mp;
1500221167Sgnn	int full;
1501221167Sgnn{
1502221167Sgnn	int cnt, matchcnt;
1503221167Sgnn	struct ufsmount *ump;
1504221167Sgnn	long starttime;
1505221167Sgnn
1506221167Sgnn	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1507221167Sgnn	/*
1508221167Sgnn	 * Record the process identifier of our caller so that we can give
1509221167Sgnn	 * this process preferential treatment in request_cleanup below.
1510221167Sgnn	 */
1511221167Sgnn	matchcnt = 0;
1512221167Sgnn	ump = VFSTOUFS(mp);
1513221167Sgnn	ACQUIRE_LOCK(&lk);
1514221167Sgnn	starttime = time_second;
1515221167Sgnn	softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
1516221167Sgnn	while (ump->softdep_on_worklist > 0) {
1517221167Sgnn		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1518221167Sgnn			break;
1519221167Sgnn		else
1520221167Sgnn			matchcnt += cnt;
1521221167Sgnn		/*
1522221167Sgnn		 * If requested, try removing inode or removal dependencies.
1523221167Sgnn		 */
1524221167Sgnn		if (req_clear_inodedeps) {
1525221167Sgnn			clear_inodedeps();
1526221167Sgnn			req_clear_inodedeps -= 1;
1527221167Sgnn			wakeup_one(&proc_waiting);
1528221167Sgnn		}
1529221167Sgnn		if (req_clear_remove) {
1530221167Sgnn			clear_remove();
1531221167Sgnn			req_clear_remove -= 1;
1532221167Sgnn			wakeup_one(&proc_waiting);
1533221167Sgnn		}
1534221167Sgnn		/*
1535221167Sgnn		 * We do not generally want to stop for buffer space, but if
1536221167Sgnn		 * we are really being a buffer hog, we will stop and wait.
1537221167Sgnn		 */
1538221167Sgnn		if (should_yield()) {
1539221167Sgnn			FREE_LOCK(&lk);
1540221167Sgnn			kern_yield(PRI_UNCHANGED);
1541221167Sgnn			bwillwrite();
1542221167Sgnn			ACQUIRE_LOCK(&lk);
1543221167Sgnn		}
1544221167Sgnn		/*
1545221167Sgnn		 * Never allow processing to run for more than one
1546221167Sgnn		 * second. Otherwise the other mountpoints may get
1547221167Sgnn		 * excessively backlogged.
1548221167Sgnn		 */
1549221167Sgnn		if (!full && starttime != time_second)
1550221167Sgnn			break;
1551221167Sgnn	}
1552221167Sgnn	if (full == 0)
1553221167Sgnn		journal_unsuspend(ump);
1554221167Sgnn	FREE_LOCK(&lk);
1555221167Sgnn	return (matchcnt);
1556221167Sgnn}
1557221167Sgnn
1558221167Sgnn/*
1559221167Sgnn * Process all removes associated with a vnode if we are running out of
1560221167Sgnn * journal space.  Any other process which attempts to flush these will
1561221167Sgnn * be unable to do so because we have the vnodes locked.
1562221167Sgnn */
1563221167Sgnnstatic void
1564221167Sgnnprocess_removes(vp)
1565221167Sgnn	struct vnode *vp;
1566221167Sgnn{
1567221167Sgnn	struct inodedep *inodedep;
1568221167Sgnn	struct dirrem *dirrem;
1569221167Sgnn	struct mount *mp;
1570221167Sgnn	ino_t inum;
1571221167Sgnn
1572221167Sgnn	mtx_assert(&lk, MA_OWNED);
1573221167Sgnn
1574221167Sgnn	mp = vp->v_mount;
1575221167Sgnn	inum = VTOI(vp)->i_number;
1576221167Sgnn	for (;;) {
1577221167Sgnntop:
1578221167Sgnn		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1579221167Sgnn			return;
1580221167Sgnn		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1581221167Sgnn			/*
1582221167Sgnn			 * If another thread is trying to lock this vnode
1583221167Sgnn			 * it will fail but we must wait for it to do so
1584221167Sgnn			 * before we can proceed.
1585221167Sgnn			 */
1586221167Sgnn			if (dirrem->dm_state & INPROGRESS) {
1587221167Sgnn				wait_worklist(&dirrem->dm_list, "pwrwait");
1588221167Sgnn				goto top;
1589221167Sgnn			}
1590221167Sgnn			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1591221167Sgnn			    (COMPLETE | ONWORKLIST))
1592221167Sgnn				break;
1593221167Sgnn		}
1594221167Sgnn		if (dirrem == NULL)
1595221167Sgnn			return;
1596221167Sgnn		remove_from_worklist(&dirrem->dm_list);
1597221167Sgnn		FREE_LOCK(&lk);
1598221167Sgnn		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1599221167Sgnn			panic("process_removes: suspended filesystem");
1600221167Sgnn		handle_workitem_remove(dirrem, 0);
1601221167Sgnn		vn_finished_secondary_write(mp);
1602221167Sgnn		ACQUIRE_LOCK(&lk);
1603221167Sgnn	}
1604221167Sgnn}
1605221167Sgnn
1606221167Sgnn/*
1607221167Sgnn * Process all truncations associated with a vnode if we are running out
1608221167Sgnn * of journal space.  This is called when the vnode lock is already held
1609221167Sgnn * and no other process can clear the truncation.
1611221167Sgnn */
1612221167Sgnnstatic void
1613221167Sgnnprocess_truncates(vp)
1614221167Sgnn	struct vnode *vp;
1615221167Sgnn{
1616221167Sgnn	struct inodedep *inodedep;
1617221167Sgnn	struct freeblks *freeblks;
1618221167Sgnn	struct mount *mp;
1619221167Sgnn	ino_t inum;
1620221167Sgnn	int cgwait;
1621221167Sgnn
1622221167Sgnn	mtx_assert(&lk, MA_OWNED);
1623221167Sgnn
1624221167Sgnn	mp = vp->v_mount;
1625221167Sgnn	inum = VTOI(vp)->i_number;
1626221167Sgnn	for (;;) {
1627221167Sgnn		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1628221167Sgnn			return;
1629221167Sgnn		cgwait = 0;
1630221167Sgnn		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1631221167Sgnn			/* Journal entries not yet written.  */
1632221167Sgnn			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1633221167Sgnn				jwait(&LIST_FIRST(
1634221167Sgnn				    &freeblks->fb_jblkdephd)->jb_list,
1635221167Sgnn				    MNT_WAIT);
1636221167Sgnn				break;
1637221167Sgnn			}
1638221167Sgnn			/* Another thread is executing this item. */
1639221167Sgnn			if (freeblks->fb_state & INPROGRESS) {
1640221167Sgnn				wait_worklist(&freeblks->fb_list, "ptrwait");
1641221167Sgnn				break;
1642221167Sgnn			}
1643221167Sgnn			/* Freeblks is waiting on an inode write. */
1644221167Sgnn			if ((freeblks->fb_state & COMPLETE) == 0) {
1645221167Sgnn				FREE_LOCK(&lk);
1646221167Sgnn				ffs_update(vp, 1);
1647221167Sgnn				ACQUIRE_LOCK(&lk);
1648221167Sgnn				break;
1649221167Sgnn			}
1650221167Sgnn			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1651221167Sgnn			    (ALLCOMPLETE | ONWORKLIST)) {
1652221167Sgnn				remove_from_worklist(&freeblks->fb_list);
1653221167Sgnn				freeblks->fb_state |= INPROGRESS;
1654221167Sgnn				FREE_LOCK(&lk);
1655221167Sgnn				if (vn_start_secondary_write(NULL, &mp,
1656221167Sgnn				    V_NOWAIT))
1657221167Sgnn					panic("process_truncates: "
1658221167Sgnn					    "suspended filesystem");
1659221167Sgnn				handle_workitem_freeblocks(freeblks, 0);
1660221167Sgnn				vn_finished_secondary_write(mp);
1661221167Sgnn				ACQUIRE_LOCK(&lk);
1662221167Sgnn				break;
1663221167Sgnn			}
1664221167Sgnn			if (freeblks->fb_cgwait)
1665221167Sgnn				cgwait++;
1666221167Sgnn		}
1667221167Sgnn		if (cgwait) {
1668221167Sgnn			FREE_LOCK(&lk);
1669221167Sgnn			sync_cgs(mp, MNT_WAIT);
1670221167Sgnn			ffs_sync_snap(mp, MNT_WAIT);
1671221167Sgnn			ACQUIRE_LOCK(&lk);
1672221167Sgnn			continue;
1673221167Sgnn		}
1674221167Sgnn		if (freeblks == NULL)
1675221167Sgnn			break;
1676221167Sgnn	}
1677221167Sgnn	return;
1678221167Sgnn}
1679221167Sgnn
1680221167Sgnn/*
1681221167Sgnn * Process one item on the worklist.
1682221167Sgnn */
1683221167Sgnnstatic int
1684221167Sgnnprocess_worklist_item(mp, target, flags)
1685221167Sgnn	struct mount *mp;
1686221167Sgnn	int target;
1687221167Sgnn	int flags;
1688221167Sgnn{
1689221167Sgnn	struct worklist sentinel;
1690221167Sgnn	struct worklist *wk;
1691221167Sgnn	struct ufsmount *ump;
1692221167Sgnn	int matchcnt;
1693221167Sgnn	int error;
1694221167Sgnn
1695221167Sgnn	mtx_assert(&lk, MA_OWNED);
1696221167Sgnn	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1697221167Sgnn	/*
1698221167Sgnn	 * If we are being called because of a process doing a
1699221167Sgnn	 * copy-on-write, then it is not safe to write as we may
1700221167Sgnn	 * recurse into the copy-on-write routine.
1701221167Sgnn	 */
1702221167Sgnn	if (curthread->td_pflags & TDP_COWINPROGRESS)
1703221167Sgnn		return (-1);
1704221167Sgnn	PHOLD(curproc);	/* Don't let the stack go away. */
1705221167Sgnn	ump = VFSTOUFS(mp);
1706221167Sgnn	matchcnt = 0;
1707221167Sgnn	sentinel.wk_mp = NULL;
1708221167Sgnn	sentinel.wk_type = D_SENTINAL;
1709221167Sgnn	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1710221167Sgnn	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1711221167Sgnn	    wk = LIST_NEXT(&sentinel, wk_list)) {
1712221167Sgnn		if (wk->wk_type == D_SENTINAL) {
1713221167Sgnn			LIST_REMOVE(&sentinel, wk_list);
1714221167Sgnn			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1715221167Sgnn			continue;
1716221167Sgnn		}
1717221167Sgnn		if (wk->wk_state & INPROGRESS)
1718221167Sgnn			panic("process_worklist_item: %p already in progress.",
1719221167Sgnn			    wk);
1720221167Sgnn		wk->wk_state |= INPROGRESS;
1721221167Sgnn		remove_from_worklist(wk);
1722221167Sgnn		FREE_LOCK(&lk);
1723221167Sgnn		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1724221167Sgnn			panic("process_worklist_item: suspended filesystem");
1725221167Sgnn		switch (wk->wk_type) {
1726221167Sgnn		case D_DIRREM:
1727221167Sgnn			/* removal of a directory entry */
1728221167Sgnn			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1729221167Sgnn			break;
1730221167Sgnn
1731221167Sgnn		case D_FREEBLKS:
1732221167Sgnn			/* releasing blocks and/or fragments from a file */
1733221167Sgnn			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1734221167Sgnn			    flags);
1735221167Sgnn			break;
1736221167Sgnn
1737221167Sgnn		case D_FREEFRAG:
1738221167Sgnn			/* releasing a fragment when replaced as a file grows */
1739221167Sgnn			handle_workitem_freefrag(WK_FREEFRAG(wk));
1740221167Sgnn			error = 0;
1741221167Sgnn			break;
1742221167Sgnn
1743221167Sgnn		case D_FREEFILE:
1744221167Sgnn			/* releasing an inode when its link count drops to 0 */
1745221167Sgnn			handle_workitem_freefile(WK_FREEFILE(wk));
1746221167Sgnn			error = 0;
1747221167Sgnn			break;
1748221167Sgnn
1749221167Sgnn		default:
1750221167Sgnn			panic("%s_process_worklist: Unknown type %s",
1751221167Sgnn			    "softdep", TYPENAME(wk->wk_type));
1752221167Sgnn			/* NOTREACHED */
1753221167Sgnn		}
1754221167Sgnn		vn_finished_secondary_write(mp);
1755221167Sgnn		ACQUIRE_LOCK(&lk);
1756221167Sgnn		if (error == 0) {
1757221167Sgnn			if (++matchcnt == target)
1758221167Sgnn				break;
1759221167Sgnn			continue;
1760221167Sgnn		}
1761221167Sgnn		/*
1762221167Sgnn		 * We have to retry the worklist item later.  Wake up any
1763221167Sgnn		 * waiters who may be able to complete it immediately and
1764221167Sgnn		 * add the item back to the head so we don't try to execute
1765221167Sgnn		 * it again.
1766221167Sgnn		 */
1767221167Sgnn		wk->wk_state &= ~INPROGRESS;
1768221167Sgnn		wake_worklist(wk);
1769221167Sgnn		add_to_worklist(wk, WK_HEAD);
1770221167Sgnn	}
1771221167Sgnn	LIST_REMOVE(&sentinel, wk_list);
1772221167Sgnn	/* Sentinel could've become the tail from remove_from_worklist. */
1773221167Sgnn	if (ump->softdep_worklist_tail == &sentinel)
1774221167Sgnn		ump->softdep_worklist_tail =
1775221167Sgnn		    (struct worklist *)sentinel.wk_list.le_prev;
1776221167Sgnn	PRELE(curproc);
1777221167Sgnn	return (matchcnt);
1778221167Sgnn}
1779221167Sgnn
1780221167Sgnn/*
1781221167Sgnn * Move dependencies from one buffer to another.
1782221167Sgnn */
1783221167Sgnnint
1784221167Sgnnsoftdep_move_dependencies(oldbp, newbp)
1785221167Sgnn	struct buf *oldbp;
1786221167Sgnn	struct buf *newbp;
1787221167Sgnn{
1788221167Sgnn	struct worklist *wk, *wktail;
1789221167Sgnn	int dirty;
1790221167Sgnn
1791221167Sgnn	dirty = 0;
1792221167Sgnn	wktail = NULL;
1793221167Sgnn	ACQUIRE_LOCK(&lk);
1794221167Sgnn	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1795221167Sgnn		LIST_REMOVE(wk, wk_list);
1796221167Sgnn		if (wk->wk_type == D_BMSAFEMAP &&
1797221167Sgnn		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
1798221167Sgnn			dirty = 1;
1799221167Sgnn		if (wktail == NULL)
1800221167Sgnn			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1801221167Sgnn		else
1802221167Sgnn			LIST_INSERT_AFTER(wktail, wk, wk_list);
1803221167Sgnn		wktail = wk;
1804221167Sgnn	}
1805221167Sgnn	FREE_LOCK(&lk);
1806221167Sgnn
1807221167Sgnn	return (dirty);
1808221167Sgnn}
1809221167Sgnn
1810221167Sgnn/*
1811221167Sgnn * Purge the work list of all items associated with a particular mount point.
1812221167Sgnn */
1813221167Sgnnint
1814221167Sgnnsoftdep_flushworklist(oldmnt, countp, td)
1815221167Sgnn	struct mount *oldmnt;
1816221167Sgnn	int *countp;
1817221167Sgnn	struct thread *td;
1818221167Sgnn{
1819221167Sgnn	struct vnode *devvp;
1820221167Sgnn	int count, error = 0;
1821221167Sgnn	struct ufsmount *ump;
1822221167Sgnn
1823221167Sgnn	/*
1824221167Sgnn	 * Alternately flush the block device associated with the mount
1825221167Sgnn	 * point and process any dependencies that the flushing
1826221167Sgnn	 * creates. We continue until no more worklist dependencies
1827221167Sgnn	 * are found.
1828221167Sgnn	 */
1829221167Sgnn	*countp = 0;
1830221167Sgnn	ump = VFSTOUFS(oldmnt);
1831221167Sgnn	devvp = ump->um_devvp;
1832221167Sgnn	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1833221167Sgnn		*countp += count;
1834221167Sgnn		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1835221167Sgnn		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1836221167Sgnn		VOP_UNLOCK(devvp, 0);
1837221167Sgnn		if (error)
1838221167Sgnn			break;
1839221167Sgnn	}
1840221167Sgnn	return (error);
1841221167Sgnn}
1842221167Sgnn
1843221167Sgnnint
1844221167Sgnnsoftdep_waitidle(struct mount *mp)
1845221167Sgnn{
1846221167Sgnn	struct ufsmount *ump;
1847221167Sgnn	int error;
1848221167Sgnn	int i;
1849221167Sgnn
1850221167Sgnn	ump = VFSTOUFS(mp);
1851221167Sgnn	ACQUIRE_LOCK(&lk);
1852221167Sgnn	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1853221167Sgnn		ump->softdep_req = 1;
1854221167Sgnn		if (ump->softdep_on_worklist)
1855221167Sgnn			panic("softdep_waitidle: work added after flush.");
1856221167Sgnn		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1857221167Sgnn	}
1858221167Sgnn	ump->softdep_req = 0;
1859221167Sgnn	FREE_LOCK(&lk);
1860221167Sgnn	error = 0;
1861221167Sgnn	if (i == 10) {
1862221167Sgnn		error = EBUSY;
1863221167Sgnn		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1864221167Sgnn		    mp);
1865221167Sgnn	}
1866221167Sgnn
1867221167Sgnn	return (error);
1868221167Sgnn}
1869221167Sgnn
1870221167Sgnn/*
1871221167Sgnn * Flush all vnodes and worklist items associated with a specified mount point.
1872221167Sgnn */
1873221167Sgnnint
1874221167Sgnnsoftdep_flushfiles(oldmnt, flags, td)
1875221167Sgnn	struct mount *oldmnt;
1876221167Sgnn	int flags;
1877221167Sgnn	struct thread *td;
1878221167Sgnn{
1879221167Sgnn	int error, depcount, loopcnt, retry_flush_count, retry;
1880221167Sgnn
1881221167Sgnn	loopcnt = 10;
1882221167Sgnn	retry_flush_count = 3;
1883221167Sgnnretry_flush:
1884221167Sgnn	error = 0;
1885221167Sgnn
1886221167Sgnn	/*
1887221167Sgnn	 * Alternately flush the vnodes associated with the mount
1888221167Sgnn	 * point and process any dependencies that the flushing
1889221167Sgnn	 * creates. In theory, this loop can happen at most twice,
1890221167Sgnn	 * but we give it a few extra just to be sure.
1891221167Sgnn	 */
1892221167Sgnn	for (; loopcnt > 0; loopcnt--) {
1893221167Sgnn		/*
1894221167Sgnn		 * Do another flush in case any vnodes were brought in
1895221167Sgnn		 * as part of the cleanup operations.
1896221167Sgnn		 */
1897221167Sgnn		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1898221167Sgnn			break;
1899221167Sgnn		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1900221167Sgnn		    depcount == 0)
1901221167Sgnn			break;
1902221167Sgnn	}
1903221167Sgnn	/*
1904221167Sgnn	 * If we are unmounting then it is an error to fail. If we
1905221167Sgnn	 * are simply trying to downgrade to read-only, then filesystem
1906221167Sgnn	 * activity can keep us busy forever, so we just fail with EBUSY.
1907221167Sgnn	 */
1908221167Sgnn	if (loopcnt == 0) {
1909221167Sgnn		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1910221167Sgnn			panic("softdep_flushfiles: looping");
1911221167Sgnn		error = EBUSY;
1912221167Sgnn	}
1913221167Sgnn	if (!error)
1914221167Sgnn		error = softdep_waitidle(oldmnt);
1915221167Sgnn	if (!error) {
1916221167Sgnn		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1917221167Sgnn			retry = 0;
1918221167Sgnn			MNT_ILOCK(oldmnt);
1919221167Sgnn			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1920221167Sgnn			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1921221167Sgnn			if (oldmnt->mnt_nvnodelistsize > 0) {
1922221167Sgnn				if (--retry_flush_count > 0) {
1923221167Sgnn					retry = 1;
1924221167Sgnn					loopcnt = 3;
1925221167Sgnn				} else
1926221167Sgnn					error = EBUSY;
1927221167Sgnn			}
1928221167Sgnn			MNT_IUNLOCK(oldmnt);
1929221167Sgnn			if (retry)
1930221167Sgnn				goto retry_flush;
1931221167Sgnn		}
1932221167Sgnn	}
1933221167Sgnn	return (error);
1934221167Sgnn}
1935221167Sgnn
1936221167Sgnn/*
1937221167Sgnn * Structure hashing.
1938221167Sgnn *
1939221167Sgnn * There are three types of structures that can be looked up:
1940221167Sgnn *	1) pagedep structures identified by mount point, inode number,
1941221167Sgnn *	   and logical block.
1942221167Sgnn *	2) inodedep structures identified by mount point and inode number.
1943221167Sgnn *	3) newblk structures identified by mount point and
1944221167Sgnn *	   physical block number.
1945221167Sgnn *
1946221167Sgnn * The "pagedep" and "inodedep" dependency structures are hashed
1947221167Sgnn * separately from the file blocks and inodes to which they correspond.
1948221167Sgnn * This separation helps when the in-memory copy of an inode or
1949221167Sgnn * file block must be replaced. It also obviates the need to access
1950221167Sgnn * an inode or file page when simply updating (or de-allocating)
1951221167Sgnn * dependency structures. Lookup of newblk structures is needed to
1952221167Sgnn * find newly allocated blocks when trying to associate them with
1953221167Sgnn * their allocdirect or allocindir structure.
1954221167Sgnn *
1955221167Sgnn * The lookup routines optionally create and hash a new instance when
1956221167Sgnn * an existing entry is not found.
1957221167Sgnn */
1958221167Sgnn#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
1959221167Sgnn#define NODELAY		0x0002	/* cannot do background work */
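
/*
 * Example (illustrative sketch): the usual lookup-or-allocate pattern
 * built on DEPALLOC.  A return of 1 means an existing entry was found;
 * 0 means a new one was created and hashed.  Note that the lookup
 * routines may drop and reacquire the softdep lock around the allocation.
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(mp, inum, DEPALLOC, &inodedep) == 0) {
 *		... initialize state that only a fresh inodedep needs ...
 *	}
 *	FREE_LOCK(&lk);
 */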
1960221167Sgnn
1961221167Sgnn/*
1962221167Sgnn * Structures and routines associated with pagedep caching.
1963221167Sgnn */
1964221167SgnnLIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1965221167Sgnnu_long	pagedep_hash;		/* size of hash table - 1 */
1966221167Sgnn#define	PAGEDEP_HASH(mp, inum, lbn) \
1967221167Sgnn	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1968221167Sgnn	    pagedep_hash])
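
/*
 * Example (illustrative): with a table of 512 buckets, pagedep_hash is
 * 511 and a lookup for inode 1234, lbn 7 selects bucket
 * (((register_t)mp >> 13) + 1234 + 7) & 511.  The shift discards the
 * low-order bits of the mount pointer, which vary little between
 * allocations; the mask works because hashinit() sizes the table to a
 * power of two.  The 512-bucket size is only an assumed figure for the
 * example.
 */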
1969221167Sgnn
1970221167Sgnnstatic int
1971221167Sgnnpagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1972221167Sgnn	struct pagedep_hashhead *pagedephd;
1973221167Sgnn	ino_t ino;
1974221167Sgnn	ufs_lbn_t lbn;
1975221167Sgnn	struct mount *mp;
1976221167Sgnn	int flags;
1977221167Sgnn	struct pagedep **pagedeppp;
1978221167Sgnn{
1979221167Sgnn	struct pagedep *pagedep;
1980221167Sgnn
1981221167Sgnn	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
1982221167Sgnn		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn &&
1983221167Sgnn		    mp == pagedep->pd_list.wk_mp) {
1984221167Sgnn			*pagedeppp = pagedep;
1985221167Sgnn			return (1);
1986221167Sgnn		}
1987221167Sgnn	}
1988221167Sgnn	*pagedeppp = NULL;
1989221167Sgnn	return (0);
1990221167Sgnn}
1991221167Sgnn/*
1992221167Sgnn * Look up a pagedep. Return 1 if found, 0 otherwise.
1993221167Sgnn * If not found, allocate if DEPALLOC flag is passed.
1994221167Sgnn * Found or allocated entry is returned in pagedeppp.
1995221167Sgnn * This routine must be called with splbio interrupts blocked.
1996221167Sgnn */
1997221167Sgnnstatic int
1998221167Sgnnpagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
1999221167Sgnn	struct mount *mp;
2000221167Sgnn	struct buf *bp;
2001221167Sgnn	ino_t ino;
2002221167Sgnn	ufs_lbn_t lbn;
2003221167Sgnn	int flags;
2004221167Sgnn	struct pagedep **pagedeppp;
2005221167Sgnn{
2006221167Sgnn	struct pagedep *pagedep;
2007221167Sgnn	struct pagedep_hashhead *pagedephd;
2008221167Sgnn	struct worklist *wk;
2009221167Sgnn	int ret;
2010221167Sgnn	int i;
2011221167Sgnn
2012221167Sgnn	mtx_assert(&lk, MA_OWNED);
2013221167Sgnn	if (bp) {
2014221167Sgnn		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2015221167Sgnn			if (wk->wk_type == D_PAGEDEP) {
2016221167Sgnn				*pagedeppp = WK_PAGEDEP(wk);
2017221167Sgnn				return (1);
2018221167Sgnn			}
2019221167Sgnn		}
2020221167Sgnn	}
2021221167Sgnn	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
2022221167Sgnn	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2023221167Sgnn	if (ret) {
2024221167Sgnn		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2025221167Sgnn			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2026221167Sgnn		return (1);
2027221167Sgnn	}
2028221167Sgnn	if ((flags & DEPALLOC) == 0)
2029221167Sgnn		return (0);
2030221167Sgnn	FREE_LOCK(&lk);
2031221167Sgnn	pagedep = malloc(sizeof(struct pagedep),
2032221167Sgnn	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2033221167Sgnn	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2034221167Sgnn	ACQUIRE_LOCK(&lk);
2035221167Sgnn	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2036221167Sgnn	if (*pagedeppp) {
2037221167Sgnn		/*
2038221167Sgnn		 * This should never happen since we only create pagedeps
2039221167Sgnn		 * with the vnode lock held.  Could be an assert.
2040221167Sgnn		 */
2041221167Sgnn		WORKITEM_FREE(pagedep, D_PAGEDEP);
2042221167Sgnn		return (ret);
2043221167Sgnn	}
2044221167Sgnn	pagedep->pd_ino = ino;
2045221167Sgnn	pagedep->pd_lbn = lbn;
2046221167Sgnn	LIST_INIT(&pagedep->pd_dirremhd);
2047221167Sgnn	LIST_INIT(&pagedep->pd_pendinghd);
2048221167Sgnn	for (i = 0; i < DAHASHSZ; i++)
2049221167Sgnn		LIST_INIT(&pagedep->pd_diraddhd[i]);
2050221167Sgnn	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2051221167Sgnn	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2052221167Sgnn	*pagedeppp = pagedep;
2053221167Sgnn	return (0);
2054221167Sgnn}
2055221167Sgnn
2056221167Sgnn/*
2057221167Sgnn * Structures and routines associated with inodedep caching.
2058221167Sgnn */
2059221167SgnnLIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
2060221167Sgnnstatic u_long	inodedep_hash;	/* size of hash table - 1 */
2061221167Sgnn#define	INODEDEP_HASH(fs, inum) \
2062221167Sgnn      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
2063221167Sgnn
2064221167Sgnnstatic int
2065221167Sgnninodedep_find(inodedephd, fs, inum, inodedeppp)
2066221167Sgnn	struct inodedep_hashhead *inodedephd;
2067221167Sgnn	struct fs *fs;
2068221167Sgnn	ino_t inum;
2069221167Sgnn	struct inodedep **inodedeppp;
2070221167Sgnn{
2071221167Sgnn	struct inodedep *inodedep;
2072221167Sgnn
2073221167Sgnn	LIST_FOREACH(inodedep, inodedephd, id_hash)
2074221167Sgnn		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
2075221167Sgnn			break;
2076221167Sgnn	if (inodedep) {
2077221167Sgnn		*inodedeppp = inodedep;
2078221167Sgnn		return (1);
2079221167Sgnn	}
2080221167Sgnn	*inodedeppp = NULL;
2081221167Sgnn
2082221167Sgnn	return (0);
2083221167Sgnn}
2084221167Sgnn/*
2085221167Sgnn * Look up an inodedep. Return 1 if found, 0 if not found.
2086221167Sgnn * If not found, allocate if DEPALLOC flag is passed.
2087221167Sgnn * Found or allocated entry is returned in inodedeppp.
2088221167Sgnn * This routine must be called with splbio interrupts blocked.
2089221167Sgnn */
2090221167Sgnnstatic int
2091221167Sgnninodedep_lookup(mp, inum, flags, inodedeppp)
2092221167Sgnn	struct mount *mp;
2093221167Sgnn	ino_t inum;
2094221167Sgnn	int flags;
2095221167Sgnn	struct inodedep **inodedeppp;
2096221167Sgnn{
2097221167Sgnn	struct inodedep *inodedep;
2098221167Sgnn	struct inodedep_hashhead *inodedephd;
2099221167Sgnn	struct fs *fs;
2100221167Sgnn
2101221167Sgnn	mtx_assert(&lk, MA_OWNED);
2102221167Sgnn	fs = VFSTOUFS(mp)->um_fs;
2103221167Sgnn	inodedephd = INODEDEP_HASH(fs, inum);
2104221167Sgnn
2105221167Sgnn	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
2106221167Sgnn		return (1);
2107221167Sgnn	if ((flags & DEPALLOC) == 0)
2108221167Sgnn		return (0);
2109221167Sgnn	/*
2110221167Sgnn	 * If we are over our limit, try to improve the situation.
2111221167Sgnn	 */
2112	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
2113		request_cleanup(mp, FLUSH_INODES);
2114	FREE_LOCK(&lk);
2115	inodedep = malloc(sizeof(struct inodedep),
2116		M_INODEDEP, M_SOFTDEP_FLAGS);
2117	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2118	ACQUIRE_LOCK(&lk);
2119	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
2120		WORKITEM_FREE(inodedep, D_INODEDEP);
2121		return (1);
2122	}
2123	inodedep->id_fs = fs;
2124	inodedep->id_ino = inum;
2125	inodedep->id_state = ALLCOMPLETE;
2126	inodedep->id_nlinkdelta = 0;
2127	inodedep->id_savedino1 = NULL;
2128	inodedep->id_savedsize = -1;
2129	inodedep->id_savedextsize = -1;
2130	inodedep->id_savednlink = -1;
2131	inodedep->id_bmsafemap = NULL;
2132	inodedep->id_mkdiradd = NULL;
2133	LIST_INIT(&inodedep->id_dirremhd);
2134	LIST_INIT(&inodedep->id_pendinghd);
2135	LIST_INIT(&inodedep->id_inowait);
2136	LIST_INIT(&inodedep->id_bufwait);
2137	TAILQ_INIT(&inodedep->id_inoreflst);
2138	TAILQ_INIT(&inodedep->id_inoupdt);
2139	TAILQ_INIT(&inodedep->id_newinoupdt);
2140	TAILQ_INIT(&inodedep->id_extupdt);
2141	TAILQ_INIT(&inodedep->id_newextupdt);
2142	TAILQ_INIT(&inodedep->id_freeblklst);
2143	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2144	*inodedeppp = inodedep;
2145	return (0);
2146}
2147
2148/*
2149 * Structures and routines associated with newblk caching.
2150 */
2151LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
2152u_long	newblk_hash;		/* size of hash table - 1 */
2153#define	NEWBLK_HASH(fs, inum) \
2154	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
2155
2156static int
2157newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
2158	struct newblk_hashhead *newblkhd;
2159	struct mount *mp;
2160	ufs2_daddr_t newblkno;
2161	int flags;
2162	struct newblk **newblkpp;
2163{
2164	struct newblk *newblk;
2165
2166	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2167		if (newblkno != newblk->nb_newblkno)
2168			continue;
2169		if (mp != newblk->nb_list.wk_mp)
2170			continue;
2171		/*
2172		 * If we're creating a new dependency don't match those that
2173		 * have already been converted to allocdirects.  This is for
2174		 * a frag extend.
2175		 */
2176		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2177			continue;
2178		break;
2179	}
2180	if (newblk) {
2181		*newblkpp = newblk;
2182		return (1);
2183	}
2184	*newblkpp = NULL;
2185	return (0);
2186}
2187
2188/*
2189 * Look up a newblk. Return 1 if found, 0 if not found.
2190 * If not found, allocate if DEPALLOC flag is passed.
2191 * Found or allocated entry is returned in newblkpp.
2192 */
2193static int
2194newblk_lookup(mp, newblkno, flags, newblkpp)
2195	struct mount *mp;
2196	ufs2_daddr_t newblkno;
2197	int flags;
2198	struct newblk **newblkpp;
2199{
2200	struct newblk *newblk;
2201	struct newblk_hashhead *newblkhd;
2202
2203	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
2204	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
2205		return (1);
2206	if ((flags & DEPALLOC) == 0)
2207		return (0);
2208	FREE_LOCK(&lk);
2209	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2210	    M_SOFTDEP_FLAGS | M_ZERO);
2211	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2212	ACQUIRE_LOCK(&lk);
2213	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
2214		WORKITEM_FREE(newblk, D_NEWBLK);
2215		return (1);
2216	}
2217	newblk->nb_freefrag = NULL;
2218	LIST_INIT(&newblk->nb_indirdeps);
2219	LIST_INIT(&newblk->nb_newdirblk);
2220	LIST_INIT(&newblk->nb_jwork);
2221	newblk->nb_state = ATTACHED;
2222	newblk->nb_newblkno = newblkno;
2223	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2224	*newblkpp = newblk;
2225	return (0);
2226}
2227
2228/*
2229 * Structures and routines associated with freed indirect block caching.
2230 */
2231struct freeworklst *indir_hashtbl;
2232u_long	indir_hash;		/* size of hash table - 1 */
2233#define	INDIR_HASH(mp, blkno) \
2234	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
2235
2236/*
2237 * Lookup an indirect block in the indir hash table.  The freework is
2238 * removed and potentially freed.  The caller must do a blocking journal
2239 * write before writing to the blkno.
2240 */
2241static int
2242indirblk_lookup(mp, blkno)
2243	struct mount *mp;
2244	ufs2_daddr_t blkno;
2245{
2246	struct freework *freework;
2247	struct freeworklst *wkhd;
2248
2249	wkhd = INDIR_HASH(mp, blkno);
2250	TAILQ_FOREACH(freework, wkhd, fw_next) {
2251		if (freework->fw_blkno != blkno)
2252			continue;
2253		if (freework->fw_list.wk_mp != mp)
2254			continue;
2255		indirblk_remove(freework);
2256		return (1);
2257	}
2258	return (0);
2259}
2260
2261/*
2262 * Insert an indirect block represented by freework into the indirblk
2263 * hash table so that it may prevent the block from being re-used prior
2264 * to the journal being written.
2265 */
2266static void
2267indirblk_insert(freework)
2268	struct freework *freework;
2269{
2270	struct freeblks *freeblks;
2271	struct jsegdep *jsegdep;
2272	struct worklist *wk;
2273
2274	freeblks = freework->fw_freeblks;
2275	LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list)
2276		if (wk->wk_type == D_JSEGDEP)
2277			break;
2278	if (wk == NULL)
2279		return;
2280
2281	jsegdep = WK_JSEGDEP(wk);
2282	LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs);
2283	TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp,
2284	    freework->fw_blkno), freework, fw_next);
2285	freework->fw_state &= ~DEPCOMPLETE;
2286}
2287
2288static void
2289indirblk_remove(freework)
2290	struct freework *freework;
2291{
2292
2293	LIST_REMOVE(freework, fw_segs);
2294	TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp,
2295	    freework->fw_blkno), freework, fw_next);
2296	freework->fw_state |= DEPCOMPLETE;
2297	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2298		WORKITEM_FREE(freework, D_FREEWORK);
2299}
2300
2301/*
2302 * Executed during filesystem initialization before
2303 * mounting any filesystems.
2304 */
2305void
2306softdep_initialize()
2307{
2308	int i;
2309
2310	LIST_INIT(&mkdirlisthd);
2311	max_softdeps = desiredvnodes * 4;
2312	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
2313	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
2314	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
2315	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
2316	i = 1 << (ffs(desiredvnodes / 10) - 1);
2317	indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK,
2318	    M_WAITOK);
2319	indir_hash = i - 1;
2320	for (i = 0; i <= indir_hash; i++)
2321		TAILQ_INIT(&indir_hashtbl[i]);
2322
2323	/* Initialize the bioops hack. */
2324	bioops.io_start = softdep_disk_io_initiation;
2325	bioops.io_complete = softdep_disk_write_complete;
2326	bioops.io_deallocate = softdep_deallocate_dependencies;
2327	bioops.io_countdeps = softdep_count_dependencies;
2328
2329	/* Initialize the callout with an mtx. */
2330	callout_init_mtx(&softdep_callout, &lk, 0);
2331}
2332
2333/*
2334 * Executed after all filesystems have been unmounted during
2335 * filesystem module unload.
2336 */
2337void
2338softdep_uninitialize()
2339{
2340
2341	callout_drain(&softdep_callout);
2342	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2343	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2344	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2345	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2346	free(indir_hashtbl, M_FREEWORK);
2347}
2348
2349/*
2350 * Called at mount time to notify the dependency code that a
2351 * filesystem wishes to use it.
2352 */
2353int
2354softdep_mount(devvp, mp, fs, cred)
2355	struct vnode *devvp;
2356	struct mount *mp;
2357	struct fs *fs;
2358	struct ucred *cred;
2359{
2360	struct csum_total cstotal;
2361	struct ufsmount *ump;
2362	struct cg *cgp;
2363	struct buf *bp;
2364	int error, cyl;
2365
2366	MNT_ILOCK(mp);
2367	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2368	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2369		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2370			MNTK_SOFTDEP | MNTK_NOASYNC;
2371	}
2372	MNT_IUNLOCK(mp);
2373	ump = VFSTOUFS(mp);
2374	LIST_INIT(&ump->softdep_workitem_pending);
2375	LIST_INIT(&ump->softdep_journal_pending);
2376	TAILQ_INIT(&ump->softdep_unlinked);
2377	LIST_INIT(&ump->softdep_dirtycg);
2378	ump->softdep_worklist_tail = NULL;
2379	ump->softdep_on_worklist = 0;
2380	ump->softdep_deps = 0;
2381	if ((fs->fs_flags & FS_SUJ) &&
2382	    (error = journal_mount(mp, fs, cred)) != 0) {
2383		printf("Failed to start journal: %d\n", error);
2384		return (error);
2385	}
2386	/*
2387	 * When doing soft updates, the counters in the
2388	 * superblock may have gotten out of sync. Recomputation
2389	 * can take a long time and can be deferred for background
2390	 * fsck.  However, the old behavior of scanning the cylinder
2391	 * groups and recalculating them at mount time is available
2392	 * by setting vfs.ffs.compute_summary_at_mount to one.
2393	 */
2394	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2395		return (0);
2396	bzero(&cstotal, sizeof cstotal);
2397	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2398		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2399		    fs->fs_cgsize, cred, &bp)) != 0) {
2400			brelse(bp);
2401			return (error);
2402		}
2403		cgp = (struct cg *)bp->b_data;
2404		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2405		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2406		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2407		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2408		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2409		brelse(bp);
2410	}
2411#ifdef DEBUG
2412	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2413		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2414#endif
2415	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2416	return (0);
2417}
2418
2419void
2420softdep_unmount(mp)
2421	struct mount *mp;
2422{
2423
2424	MNT_ILOCK(mp);
2425	mp->mnt_flag &= ~MNT_SOFTDEP;
2426	if (MOUNTEDSUJ(mp) == 0) {
2427		MNT_IUNLOCK(mp);
2428		return;
2429	}
2430	mp->mnt_flag &= ~MNT_SUJ;
2431	MNT_IUNLOCK(mp);
2432	journal_unmount(mp);
2433}
2434
2435struct jblocks {
2436	struct jseglst	jb_segs;	/* TAILQ of current segments. */
2437	struct jseg	*jb_writeseg;	/* Next write to complete. */
2438	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
2439	struct jextent	*jb_extent;	/* Extent array. */
2440	uint64_t	jb_nextseq;	/* Next sequence number. */
2441	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
2442	uint8_t		jb_needseg;	/* Need a forced segment. */
2443	uint8_t		jb_suspended;	/* Did journal suspend writes? */
2444	int		jb_avail;	/* Available extents. */
2445	int		jb_used;	/* Last used extent. */
2446	int		jb_head;	/* Allocator head. */
2447	int		jb_off;		/* Allocator extent offset. */
2448	int		jb_blocks;	/* Total disk blocks covered. */
2449	int		jb_free;	/* Total disk blocks free. */
2450	int		jb_min;		/* Minimum free space. */
2451	int		jb_low;		/* Low on space. */
2452	int		jb_age;		/* Insertion time of oldest rec. */
2453};
2454
2455struct jextent {
2456	ufs2_daddr_t	je_daddr;	/* Disk block address. */
2457	int		je_blocks;	/* Disk block count. */
2458};
2459
2460static struct jblocks *
2461jblocks_create(void)
2462{
2463	struct jblocks *jblocks;
2464
2465	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2466	TAILQ_INIT(&jblocks->jb_segs);
2467	jblocks->jb_avail = 10;
2468	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2469	    M_JBLOCKS, M_WAITOK | M_ZERO);
2470
2471	return (jblocks);
2472}
2473
2474static ufs2_daddr_t
2475jblocks_alloc(jblocks, bytes, actual)
2476	struct jblocks *jblocks;
2477	int bytes;
2478	int *actual;
2479{
2480	ufs2_daddr_t daddr;
2481	struct jextent *jext;
2482	int freecnt;
2483	int blocks;
2484
2485	blocks = bytes / DEV_BSIZE;
2486	jext = &jblocks->jb_extent[jblocks->jb_head];
2487	freecnt = jext->je_blocks - jblocks->jb_off;
2488	if (freecnt == 0) {
2489		jblocks->jb_off = 0;
2490		if (++jblocks->jb_head > jblocks->jb_used)
2491			jblocks->jb_head = 0;
2492		jext = &jblocks->jb_extent[jblocks->jb_head];
2493		freecnt = jext->je_blocks;
2494	}
2495	if (freecnt > blocks)
2496		freecnt = blocks;
2497	*actual = freecnt * DEV_BSIZE;
2498	daddr = jext->je_daddr + jblocks->jb_off;
2499	jblocks->jb_off += freecnt;
2500	jblocks->jb_free -= freecnt;
2501
2502	return (daddr);
2503}
2504
2505static void
2506jblocks_free(jblocks, mp, bytes)
2507	struct jblocks *jblocks;
2508	struct mount *mp;
2509	int bytes;
2510{
2511
2512	jblocks->jb_free += bytes / DEV_BSIZE;
2513	if (jblocks->jb_suspended)
2514		worklist_speedup();
2515	wakeup(jblocks);
2516}
2517
2518static void
2519jblocks_destroy(jblocks)
2520	struct jblocks *jblocks;
2521{
2522
2523	if (jblocks->jb_extent)
2524		free(jblocks->jb_extent, M_JBLOCKS);
2525	free(jblocks, M_JBLOCKS);
2526}
2527
2528static void
2529jblocks_add(jblocks, daddr, blocks)
2530	struct jblocks *jblocks;
2531	ufs2_daddr_t daddr;
2532	int blocks;
2533{
2534	struct jextent *jext;
2535
2536	jblocks->jb_blocks += blocks;
2537	jblocks->jb_free += blocks;
2538	jext = &jblocks->jb_extent[jblocks->jb_used];
2539	/* Adding the first block. */
2540	if (jext->je_daddr == 0) {
2541		jext->je_daddr = daddr;
2542		jext->je_blocks = blocks;
2543		return;
2544	}
2545	/* Extending the last extent. */
2546	if (jext->je_daddr + jext->je_blocks == daddr) {
2547		jext->je_blocks += blocks;
2548		return;
2549	}
2550	/* Adding a new extent. */
2551	if (++jblocks->jb_used == jblocks->jb_avail) {
2552		jblocks->jb_avail *= 2;
2553		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2554		    M_JBLOCKS, M_WAITOK | M_ZERO);
2555		memcpy(jext, jblocks->jb_extent,
2556		    sizeof(struct jextent) * jblocks->jb_used);
2557		free(jblocks->jb_extent, M_JBLOCKS);
2558		jblocks->jb_extent = jext;
2559	}
2560	jext = &jblocks->jb_extent[jblocks->jb_used];
2561	jext->je_daddr = daddr;
2562	jext->je_blocks = blocks;
2563	return;
2564}
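
/*
 * Example (illustrative): journal_mount() adds the journal file's blocks
 * one filesystem block at a time, so consecutive calls such as
 *
 *	jblocks_add(jblocks, 1000, 8);
 *	jblocks_add(jblocks, 1008, 8);
 *	jblocks_add(jblocks, 2000, 8);
 *
 * leave two extents: {1000, 16} (the first two runs are contiguous and
 * are merged) and {2000, 8}.  The daddr values are made up for the
 * example.
 */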
2565
2566int
2567softdep_journal_lookup(mp, vpp)
2568	struct mount *mp;
2569	struct vnode **vpp;
2570{
2571	struct componentname cnp;
2572	struct vnode *dvp;
2573	ino_t sujournal;
2574	int error;
2575
2576	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2577	if (error)
2578		return (error);
2579	bzero(&cnp, sizeof(cnp));
2580	cnp.cn_nameiop = LOOKUP;
2581	cnp.cn_flags = ISLASTCN;
2582	cnp.cn_thread = curthread;
2583	cnp.cn_cred = curthread->td_ucred;
2584	cnp.cn_pnbuf = SUJ_FILE;
2585	cnp.cn_nameptr = SUJ_FILE;
2586	cnp.cn_namelen = strlen(SUJ_FILE);
2587	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2588	vput(dvp);
2589	if (error != 0)
2590		return (error);
2591	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2592	return (error);
2593}
2594
2595/*
2596 * Open and verify the journal file.
2597 */
2598static int
2599journal_mount(mp, fs, cred)
2600	struct mount *mp;
2601	struct fs *fs;
2602	struct ucred *cred;
2603{
2604	struct jblocks *jblocks;
2605	struct vnode *vp;
2606	struct inode *ip;
2607	ufs2_daddr_t blkno;
2608	int bcount;
2609	int error;
2610	int i;
2611
2612	error = softdep_journal_lookup(mp, &vp);
2613	if (error != 0) {
2614		printf("Failed to find journal.  Use tunefs to create one\n");
2615		return (error);
2616	}
2617	ip = VTOI(vp);
2618	if (ip->i_size < SUJ_MIN) {
2619		error = ENOSPC;
2620		goto out;
2621	}
2622	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2623	jblocks = jblocks_create();
2624	for (i = 0; i < bcount; i++) {
2625		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2626		if (error)
2627			break;
2628		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2629	}
2630	if (error) {
2631		jblocks_destroy(jblocks);
2632		goto out;
2633	}
2634	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2635	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2636	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2637out:
2638	if (error == 0) {
2639		MNT_ILOCK(mp);
2640		mp->mnt_flag |= MNT_SUJ;
2641		mp->mnt_flag &= ~MNT_SOFTDEP;
2642		MNT_IUNLOCK(mp);
2643		/*
2644		 * Only validate the journal contents if the
2645		 * filesystem is clean, otherwise we write the logs
2646		 * but they'll never be used.  If the filesystem was
2647		 * still dirty when we mounted it the journal is
2648		 * invalid and a new journal can only be valid if it
2649		 * starts from a clean mount.
2650		 */
2651		if (fs->fs_clean) {
2652			DIP_SET(ip, i_modrev, fs->fs_mtime);
2653			ip->i_flags |= IN_MODIFIED;
2654			ffs_update(vp, 1);
2655		}
2656	}
2657	vput(vp);
2658	return (error);
2659}
2660
2661static void
2662journal_unmount(mp)
2663	struct mount *mp;
2664{
2665	struct ufsmount *ump;
2666
2667	ump = VFSTOUFS(mp);
2668	if (ump->softdep_jblocks)
2669		jblocks_destroy(ump->softdep_jblocks);
2670	ump->softdep_jblocks = NULL;
2671}
2672
2673/*
2674 * Called when a journal record is ready to be written.  Space is allocated
2675 * and the journal entry is created when the journal is flushed to stable
2676 * store.
2677 */
2678static void
2679add_to_journal(wk)
2680	struct worklist *wk;
2681{
2682	struct ufsmount *ump;
2683
2684	mtx_assert(&lk, MA_OWNED);
2685	ump = VFSTOUFS(wk->wk_mp);
2686	if (wk->wk_state & ONWORKLIST)
2687		panic("add_to_journal: %s(0x%X) already on list",
2688		    TYPENAME(wk->wk_type), wk->wk_state);
2689	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2690	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2691		ump->softdep_jblocks->jb_age = ticks;
2692		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2693	} else
2694		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2695	ump->softdep_journal_tail = wk;
2696	ump->softdep_on_journal += 1;
2697}
2698
2699/*
2700 * Remove an arbitrary item from the journal worklist, maintaining the tail
2701 * pointer.  This happens when a new operation obviates the need to
2702 * journal an old operation.
2703 */
2704static void
2705remove_from_journal(wk)
2706	struct worklist *wk;
2707{
2708	struct ufsmount *ump;
2709
2710	mtx_assert(&lk, MA_OWNED);
2711	ump = VFSTOUFS(wk->wk_mp);
2712#ifdef SUJ_DEBUG
2713	{
2714		struct worklist *wkn;
2715
2716		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2717			if (wkn == wk)
2718				break;
2719		if (wkn == NULL)
2720			panic("remove_from_journal: %p is not in journal", wk);
2721	}
2722#endif
2723	/*
2724	 * We emulate a TAILQ to save space in most structures which do not
2725	 * require TAILQ semantics.  Here we must update the tail pointer
2726	 * when the entry being removed is the current tail.  This works
2727	 * only if the worklist linkage is at the beginning of the structure.
2728	 */
2729	if (ump->softdep_journal_tail == wk)
2730		ump->softdep_journal_tail =
2731		    (struct worklist *)wk->wk_list.le_prev;
2732
2733	WORKLIST_REMOVE(wk);
2734	ump->softdep_on_journal -= 1;
2735}
2736
2737/*
2738 * Check for journal space as well as dependency limits so the prelink
2739 * code can throttle both journaled and non-journaled filesystems.
2740 * Threshold is 0 for low and 1 for min.
2741 */
2742static int
2743journal_space(ump, thresh)
2744	struct ufsmount *ump;
2745	int thresh;
2746{
2747	struct jblocks *jblocks;
2748	int avail;
2749
2750	jblocks = ump->softdep_jblocks;
2751	if (jblocks == NULL)
2752		return (1);
2753	/*
2754	 * We use a tighter restriction here to prevent request_cleanup(),
2755	 * when run from other threads, from blocking on locks we currently hold.
2756	 */
2757	if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9)
2758		return (0);
2759	if (thresh)
2760		thresh = jblocks->jb_min;
2761	else
2762		thresh = jblocks->jb_low;
2763	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2764	avail = jblocks->jb_free - avail;
2765
2766	return (avail > thresh);
2767}
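
/*
 * Example (worked, illustrative numbers, assuming a 32-byte JREC_SIZE):
 * with 200 pending journal records and 512-byte device blocks, the
 * pending records pin 200 * 32 / 512 = 12 blocks.  If jb_free is 100
 * blocks and jb_low is 80, journal_space(ump, 0) evaluates
 * (100 - 12) > 80 and reports that space is still available.
 */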
2768
2769static void
2770journal_suspend(ump)
2771	struct ufsmount *ump;
2772{
2773	struct jblocks *jblocks;
2774	struct mount *mp;
2775
2776	mp = UFSTOVFS(ump);
2777	jblocks = ump->softdep_jblocks;
2778	MNT_ILOCK(mp);
2779	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2780		stat_journal_min++;
2781		mp->mnt_kern_flag |= MNTK_SUSPEND;
2782		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2783	}
2784	jblocks->jb_suspended = 1;
2785	MNT_IUNLOCK(mp);
2786}
2787
2788static int
2789journal_unsuspend(struct ufsmount *ump)
2790{
2791	struct jblocks *jblocks;
2792	struct mount *mp;
2793
2794	mp = UFSTOVFS(ump);
2795	jblocks = ump->softdep_jblocks;
2796
2797	if (jblocks != NULL && jblocks->jb_suspended &&
2798	    journal_space(ump, jblocks->jb_min)) {
2799		jblocks->jb_suspended = 0;
2800		FREE_LOCK(&lk);
2801		mp->mnt_susp_owner = curthread;
2802		vfs_write_resume(mp);
2803		ACQUIRE_LOCK(&lk);
2804		return (1);
2805	}
2806	return (0);
2807}
2808
2809/*
2810 * Called before any allocation function to be certain that there is
2811 * sufficient space in the journal prior to creating any new records.
2812 * Since in the case of block allocation we may have multiple locked
2813 * buffers at the time of the actual allocation, we cannot block
2814 * when the journal records are created.  Doing so would create a deadlock
2815 * if any of these buffers needed to be flushed to reclaim space.  Instead
2816 * we require a sufficiently large amount of available space such that
2817 * each thread in the system could have passed this allocation check and
2818 * still have sufficient free space.  With 20% of a minimum journal size
2819 * of 1MB we have 6553 records available.
2820 */
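/*
 * Example (worked, assuming a 32-byte JREC_SIZE): 20% of a 1MB journal
 * is 0.20 * 1048576 = 209715 bytes, and 209715 / 32 =~ 6553 records,
 * which is where the figure above comes from.
 */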
2821int
2822softdep_prealloc(vp, waitok)
2823	struct vnode *vp;
2824	int waitok;
2825{
2826	struct ufsmount *ump;
2827
2828	/*
2829	 * Nothing to do if we are not running journaled soft updates.
2830	 * If we currently hold the snapshot lock, we must avoid handling
2831	 * other resources that could cause deadlock.
2832	 */
2833	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2834		return (0);
2835	ump = VFSTOUFS(vp->v_mount);
2836	ACQUIRE_LOCK(&lk);
2837	if (journal_space(ump, 0)) {
2838		FREE_LOCK(&lk);
2839		return (0);
2840	}
2841	stat_journal_low++;
2842	FREE_LOCK(&lk);
2843	if (waitok == MNT_NOWAIT)
2844		return (ENOSPC);
2845	/*
2846	 * Attempt to sync this vnode once to flush any journal
2847	 * work attached to it.
2848	 */
2849	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2850		ffs_syncvnode(vp, waitok, 0);
2851	ACQUIRE_LOCK(&lk);
2852	process_removes(vp);
2853	process_truncates(vp);
2854	if (journal_space(ump, 0) == 0) {
2855		softdep_speedup();
2856		if (journal_space(ump, 1) == 0)
2857			journal_suspend(ump);
2858	}
2859	FREE_LOCK(&lk);
2860
2861	return (0);
2862}
2863
2864/*
2865 * Before adjusting a link count on a vnode verify that we have sufficient
2866 * journal space.  If not, process operations that depend on the currently
2867 * locked pair of vnodes to try to flush space, since the syncer, buf daemon,
2868 * and softdep flush threads cannot acquire these locks to reclaim space.
2869 */
2870static void
2871softdep_prelink(dvp, vp)
2872	struct vnode *dvp;
2873	struct vnode *vp;
2874{
2875	struct ufsmount *ump;
2876
2877	ump = VFSTOUFS(dvp->v_mount);
2878	mtx_assert(&lk, MA_OWNED);
2879	/*
2880	 * Nothing to do if we have sufficient journal space.
2881	 * If we currently hold the snapshot lock, we must avoid
2882	 * handling other resources that could cause deadlock.
2883	 */
2884	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
2885		return;
2886	stat_journal_low++;
2887	FREE_LOCK(&lk);
2888	if (vp)
2889		ffs_syncvnode(vp, MNT_NOWAIT, 0);
2890	ffs_syncvnode(dvp, MNT_WAIT, 0);
2891	ACQUIRE_LOCK(&lk);
2892	/* Process vp before dvp as it may create .. removes. */
2893	if (vp) {
2894		process_removes(vp);
2895		process_truncates(vp);
2896	}
2897	process_removes(dvp);
2898	process_truncates(dvp);
2899	softdep_speedup();
2900	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
2901	if (journal_space(ump, 0) == 0) {
2902		softdep_speedup();
2903		if (journal_space(ump, 1) == 0)
2904			journal_suspend(ump);
2905	}
2906}
2907
2908static void
2909jseg_write(ump, jseg, data)
2910	struct ufsmount *ump;
2911	struct jseg *jseg;
2912	uint8_t *data;
2913{
2914	struct jsegrec *rec;
2915
2916	rec = (struct jsegrec *)data;
2917	rec->jsr_seq = jseg->js_seq;
2918	rec->jsr_oldest = jseg->js_oldseq;
2919	rec->jsr_cnt = jseg->js_cnt;
2920	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2921	rec->jsr_crc = 0;
2922	rec->jsr_time = ump->um_fs->fs_mtime;
2923}
2924
2925static inline void
2926inoref_write(inoref, jseg, rec)
2927	struct inoref *inoref;
2928	struct jseg *jseg;
2929	struct jrefrec *rec;
2930{
2931
2932	inoref->if_jsegdep->jd_seg = jseg;
2933	rec->jr_ino = inoref->if_ino;
2934	rec->jr_parent = inoref->if_parent;
2935	rec->jr_nlink = inoref->if_nlink;
2936	rec->jr_mode = inoref->if_mode;
2937	rec->jr_diroff = inoref->if_diroff;
2938}
2939
2940static void
2941jaddref_write(jaddref, jseg, data)
2942	struct jaddref *jaddref;
2943	struct jseg *jseg;
2944	uint8_t *data;
2945{
2946	struct jrefrec *rec;
2947
2948	rec = (struct jrefrec *)data;
2949	rec->jr_op = JOP_ADDREF;
2950	inoref_write(&jaddref->ja_ref, jseg, rec);
2951}
2952
2953static void
2954jremref_write(jremref, jseg, data)
2955	struct jremref *jremref;
2956	struct jseg *jseg;
2957	uint8_t *data;
2958{
2959	struct jrefrec *rec;
2960
2961	rec = (struct jrefrec *)data;
2962	rec->jr_op = JOP_REMREF;
2963	inoref_write(&jremref->jr_ref, jseg, rec);
2964}
2965
2966static void
2967jmvref_write(jmvref, jseg, data)
2968	struct jmvref *jmvref;
2969	struct jseg *jseg;
2970	uint8_t *data;
2971{
2972	struct jmvrec *rec;
2973
2974	rec = (struct jmvrec *)data;
2975	rec->jm_op = JOP_MVREF;
2976	rec->jm_ino = jmvref->jm_ino;
2977	rec->jm_parent = jmvref->jm_parent;
2978	rec->jm_oldoff = jmvref->jm_oldoff;
2979	rec->jm_newoff = jmvref->jm_newoff;
2980}
2981
2982static void
2983jnewblk_write(jnewblk, jseg, data)
2984	struct jnewblk *jnewblk;
2985	struct jseg *jseg;
2986	uint8_t *data;
2987{
2988	struct jblkrec *rec;
2989
2990	jnewblk->jn_jsegdep->jd_seg = jseg;
2991	rec = (struct jblkrec *)data;
2992	rec->jb_op = JOP_NEWBLK;
2993	rec->jb_ino = jnewblk->jn_ino;
2994	rec->jb_blkno = jnewblk->jn_blkno;
2995	rec->jb_lbn = jnewblk->jn_lbn;
2996	rec->jb_frags = jnewblk->jn_frags;
2997	rec->jb_oldfrags = jnewblk->jn_oldfrags;
2998}
2999
3000static void
3001jfreeblk_write(jfreeblk, jseg, data)
3002	struct jfreeblk *jfreeblk;
3003	struct jseg *jseg;
3004	uint8_t *data;
3005{
3006	struct jblkrec *rec;
3007
3008	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3009	rec = (struct jblkrec *)data;
3010	rec->jb_op = JOP_FREEBLK;
3011	rec->jb_ino = jfreeblk->jf_ino;
3012	rec->jb_blkno = jfreeblk->jf_blkno;
3013	rec->jb_lbn = jfreeblk->jf_lbn;
3014	rec->jb_frags = jfreeblk->jf_frags;
3015	rec->jb_oldfrags = 0;
3016}
3017
3018static void
3019jfreefrag_write(jfreefrag, jseg, data)
3020	struct jfreefrag *jfreefrag;
3021	struct jseg *jseg;
3022	uint8_t *data;
3023{
3024	struct jblkrec *rec;
3025
3026	jfreefrag->fr_jsegdep->jd_seg = jseg;
3027	rec = (struct jblkrec *)data;
3028	rec->jb_op = JOP_FREEBLK;
3029	rec->jb_ino = jfreefrag->fr_ino;
3030	rec->jb_blkno = jfreefrag->fr_blkno;
3031	rec->jb_lbn = jfreefrag->fr_lbn;
3032	rec->jb_frags = jfreefrag->fr_frags;
3033	rec->jb_oldfrags = 0;
3034}
3035
3036static void
3037jtrunc_write(jtrunc, jseg, data)
3038	struct jtrunc *jtrunc;
3039	struct jseg *jseg;
3040	uint8_t *data;
3041{
3042	struct jtrncrec *rec;
3043
3044	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3045	rec = (struct jtrncrec *)data;
3046	rec->jt_op = JOP_TRUNC;
3047	rec->jt_ino = jtrunc->jt_ino;
3048	rec->jt_size = jtrunc->jt_size;
3049	rec->jt_extsize = jtrunc->jt_extsize;
3050}
3051
3052static void
3053jfsync_write(jfsync, jseg, data)
3054	struct jfsync *jfsync;
3055	struct jseg *jseg;
3056	uint8_t *data;
3057{
3058	struct jtrncrec *rec;
3059
3060	rec = (struct jtrncrec *)data;
3061	rec->jt_op = JOP_SYNC;
3062	rec->jt_ino = jfsync->jfs_ino;
3063	rec->jt_size = jfsync->jfs_size;
3064	rec->jt_extsize = jfsync->jfs_extsize;
3065}
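
/*
 * Each of the *_write() helpers above fills in a single fixed-size journal
 * record; softdep_process_journal() below walks the segment buffer in
 * JREC_SIZE strides as it dispatches to them.
 */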
3066
3067static void
3068softdep_flushjournal(mp)
3069	struct mount *mp;
3070{
3071	struct jblocks *jblocks;
3072	struct ufsmount *ump;
3073
3074	if (MOUNTEDSUJ(mp) == 0)
3075		return;
3076	ump = VFSTOUFS(mp);
3077	jblocks = ump->softdep_jblocks;
3078	ACQUIRE_LOCK(&lk);
3079	while (ump->softdep_on_journal) {
3080		jblocks->jb_needseg = 1;
3081		softdep_process_journal(mp, NULL, MNT_WAIT);
3082	}
3083	FREE_LOCK(&lk);
3084}
3085
3086/*
3087 * Flush some journal records to disk.
3088 */
3089static void
3090softdep_process_journal(mp, needwk, flags)
3091	struct mount *mp;
3092	struct worklist *needwk;
3093	int flags;
3094{
3095	struct jblocks *jblocks;
3096	struct ufsmount *ump;
3097	struct worklist *wk;
3098	struct jseg *jseg;
3099	struct buf *bp;
3100	uint8_t *data;
3101	struct fs *fs;
3102	int segwritten;
3103	int jrecmin;	/* Minimum records per block. */
3104	int jrecmax;	/* Maximum records per block. */
3105	int size;
3106	int cnt;
3107	int off;
3108	int devbsize;
3109
3110	if (MOUNTEDSUJ(mp) == 0)
3111		return;
3112	ump = VFSTOUFS(mp);
3113	fs = ump->um_fs;
3114	jblocks = ump->softdep_jblocks;
3115	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3116	/*
3117	 * We write anywhere between a disk block and an fs block.  The upper
3118	 * bound is picked to prevent buffer cache fragmentation and limit
3119	 * processing time per I/O.
3120	 */
3121	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3122	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
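	/*
	 * Worked example (illustrative numbers only): with a 512-byte device
	 * block and 32-byte journal records, jrecmin = (512 / 32) - 1 = 15
	 * records fit after the per-block segment header, and with a 16K
	 * filesystem block jrecmax = (16384 / 512) * 15 = 480.
	 */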
3123	segwritten = 0;
3124	for (;;) {
3125		cnt = ump->softdep_on_journal;
3126		/*
3127		 * Criteria for writing a segment:
3128		 * 1) We have a full block.
3129		 * 2) We're called from jwait() and haven't found the
3130		 *    journal item yet.
3131		 * 3) Always write if needseg is set.
3132		 * 4) If we are called from process_worklist and have
3133		 *    not yet written anything we write a partial block
3134		 *    to enforce a 1 second maximum latency on journal
3135		 *    entries.
3136		 */
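		/*
		 * The test below is the negation of those criteria: bail out
		 * only when we have less than a full block of records, no
		 * specific item is being waited for, needseg is clear, and
		 * we have either already written a segment on this pass or
		 * have nothing pending at all.
		 */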
3137		if (cnt < (jrecmax - 1) && needwk == NULL &&
3138		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3139			break;
3140		cnt++;
3141		/*
3142		 * Verify some free journal space.  softdep_prealloc() should
3143		 * guarantee that we don't run out, so this is indicative of
3144		 * a problem with the flow control.  Try to recover
3145		 * gracefully in any event.
3146		 */
3147		while (jblocks->jb_free == 0) {
3148			if (flags != MNT_WAIT)
3149				break;
3150			printf("softdep: Out of journal space!\n");
3151			softdep_speedup();
3152			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
3153		}
3154		FREE_LOCK(&lk);
3155		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3156		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3157		LIST_INIT(&jseg->js_entries);
3158		LIST_INIT(&jseg->js_indirs);
3159		jseg->js_state = ATTACHED;
3160		jseg->js_jblocks = jblocks;
3161		bp = geteblk(fs->fs_bsize, 0);
3162		ACQUIRE_LOCK(&lk);
3163		/*
3164		 * If there was a race while we were allocating the block
3165		 * and jseg, the entry we care about was likely written.
3166		 * We bail out in both the WAIT and NOWAIT case and assume
3167		 * the caller will loop if the entry it cares about is
3168		 * not written.
3169		 */
3170		cnt = ump->softdep_on_journal;
3171		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3172			bp->b_flags |= B_INVAL | B_NOCACHE;
3173			WORKITEM_FREE(jseg, D_JSEG);
3174			FREE_LOCK(&lk);
3175			brelse(bp);
3176			ACQUIRE_LOCK(&lk);
3177			break;
3178		}
3179		/*
3180		 * Calculate the disk block size required for the available
3181		 * records rounded to the min size.
3182		 */
3183		if (cnt == 0)
3184			size = devbsize;
3185		else if (cnt < jrecmax)
3186			size = howmany(cnt, jrecmin) * devbsize;
3187		else
3188			size = fs->fs_bsize;
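		/*
		 * E.g. (sketch): with jrecmin = 15 and 20 pending records,
		 * howmany(20, 15) = 2 device blocks are requested; a full
		 * filesystem block is used only once cnt reaches jrecmax.
		 */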
3189		/*
3190		 * Allocate a disk block for this journal data and account
3191		 * for truncation of the requested size if enough contiguous
3192		 * space was not available.
3193		 */
3194		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3195		bp->b_lblkno = bp->b_blkno;
3196		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3197		bp->b_bcount = size;
3198		bp->b_bufobj = &ump->um_devvp->v_bufobj;
3199		bp->b_flags &= ~B_INVAL;
3200		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3201		/*
3202		 * Initialize our jseg with cnt records.  Assign the next
3203		 * sequence number to it and link it in-order.
3204		 */
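		/*
		 * jblocks_alloc() may have truncated the allocation, so clamp
		 * cnt to the number of records that fit in the space we got.
		 */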
3205		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3206		jseg->js_buf = bp;
3207		jseg->js_cnt = cnt;
3208		jseg->js_refs = cnt + 1;	/* Self ref. */
3209		jseg->js_size = size;
3210		jseg->js_seq = jblocks->jb_nextseq++;
3211		if (jblocks->jb_oldestseg == NULL)
3212			jblocks->jb_oldestseg = jseg;
3213		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3214		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3215		if (jblocks->jb_writeseg == NULL)
3216			jblocks->jb_writeseg = jseg;
3217		/*
3218		 * Start filling in records from the pending list.
3219		 */
3220		data = bp->b_data;
3221		off = 0;
3222		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3223		    != NULL) {
3224			if (cnt == 0)
3225				break;
3226			/* Place a segment header on every device block. */
3227			if ((off % devbsize) == 0) {
3228				jseg_write(ump, jseg, data);
3229				off += JREC_SIZE;
3230				data = bp->b_data + off;
3231			}
3232			if (wk == needwk)
3233				needwk = NULL;
3234			remove_from_journal(wk);
3235			wk->wk_state |= INPROGRESS;
3236			WORKLIST_INSERT(&jseg->js_entries, wk);
3237			switch (wk->wk_type) {
3238			case D_JADDREF:
3239				jaddref_write(WK_JADDREF(wk), jseg, data);
3240				break;
3241			case D_JREMREF:
3242				jremref_write(WK_JREMREF(wk), jseg, data);
3243				break;
3244			case D_JMVREF:
3245				jmvref_write(WK_JMVREF(wk), jseg, data);
3246				break;
3247			case D_JNEWBLK:
3248				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3249				break;
3250			case D_JFREEBLK:
3251				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3252				break;
3253			case D_JFREEFRAG:
3254				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3255				break;
3256			case D_JTRUNC:
3257				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3258				break;
3259			case D_JFSYNC:
3260				jfsync_write(WK_JFSYNC(wk), jseg, data);
3261				break;
3262			default:
3263				panic("process_journal: Unknown type %s",
3264				    TYPENAME(wk->wk_type));
3265				/* NOTREACHED */
3266			}
3267			off += JREC_SIZE;
3268			data = bp->b_data + off;
3269			cnt--;
3270		}
3271		/*
3272		 * Write this one buffer and continue.
3273		 */
3274		segwritten = 1;
3275		jblocks->jb_needseg = 0;
3276		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3277		FREE_LOCK(&lk);
3278		BO_LOCK(bp->b_bufobj);
3279		bgetvp(ump->um_devvp, bp);
3280		BO_UNLOCK(bp->b_bufobj);
3281		/*
3282		 * We only do the blocking wait once we find the journal
3283		 * entry we're looking for.
3284		 */
3285		if (needwk == NULL && flags == MNT_WAIT)
3286			bwrite(bp);
3287		else
3288			bawrite(bp);
3289		ACQUIRE_LOCK(&lk);
3290	}
3291	/*
3292	 * If we've suspended the filesystem because we ran out of journal
3293	 * space, either try to sync it here to make some progress or
3294	 * unsuspend it if we already have.
3295	 */
3296	if (flags == 0 && jblocks->jb_suspended) {
3297		if (journal_unsuspend(ump))
3298			return;
3299		FREE_LOCK(&lk);
3300		VFS_SYNC(mp, MNT_NOWAIT);
3301		ffs_sbupdate(ump, MNT_WAIT, 0);
3302		ACQUIRE_LOCK(&lk);
3303	}
3304}
3305
3306/*
3307 * Complete a jseg, allowing all dependencies awaiting journal writes
3308 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3309 * structures so that the journal segment can be freed to reclaim space.
3310 */
3311static void
3312complete_jseg(jseg)
3313	struct jseg *jseg;
3314{
3315	struct worklist *wk;
3316	struct jmvref *jmvref;
3317	int waiting;
3318#ifdef INVARIANTS
3319	int i = 0;
3320#endif
3321
3322	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3323		WORKLIST_REMOVE(wk);
3324		waiting = wk->wk_state & IOWAITING;
3325		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3326		wk->wk_state |= COMPLETE;
3327		KASSERT(i++ < jseg->js_cnt,
3328		    ("handle_written_jseg: overflow %d >= %d",
3329		    i - 1, jseg->js_cnt));
3330		switch (wk->wk_type) {
3331		case D_JADDREF:
3332			handle_written_jaddref(WK_JADDREF(wk));
3333			break;
3334		case D_JREMREF:
3335			handle_written_jremref(WK_JREMREF(wk));
3336			break;
3337		case D_JMVREF:
3338			rele_jseg(jseg);	/* No jsegdep. */
3339			jmvref = WK_JMVREF(wk);
3340			LIST_REMOVE(jmvref, jm_deps);
3341			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3342				free_pagedep(jmvref->jm_pagedep);
3343			WORKITEM_FREE(jmvref, D_JMVREF);
3344			break;
3345		case D_JNEWBLK:
3346			handle_written_jnewblk(WK_JNEWBLK(wk));
3347			break;
3348		case D_JFREEBLK:
3349			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3350			break;
3351		case D_JTRUNC:
3352			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3353			break;
3354		case D_JFSYNC:
3355			rele_jseg(jseg);	/* No jsegdep. */
3356			WORKITEM_FREE(wk, D_JFSYNC);
3357			break;
3358		case D_JFREEFRAG:
3359			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3360			break;
3361		default:
3362			panic("handle_written_jseg: Unknown type %s",
3363			    TYPENAME(wk->wk_type));
3364			/* NOTREACHED */
3365		}
3366		if (waiting)
3367			wakeup(wk);
3368	}
3369	/* Release the self reference so the structure may be freed. */
3370	rele_jseg(jseg);
3371}
3372
3373/*
3374 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
3375 * completions in order only.
3376 */
3377static void
3378handle_written_jseg(jseg, bp)
3379	struct jseg *jseg;
3380	struct buf *bp;
3381{
3382	struct jblocks *jblocks;
3383	struct jseg *jsegn;
3384
3385	if (jseg->js_refs == 0)
3386		panic("handle_written_jseg: No self-reference on %p", jseg);
3387	jseg->js_state |= DEPCOMPLETE;
3388	/*
3389	 * We'll never need this buffer again; set flags so it will be
3390	 * discarded.
3391	 */
3392	bp->b_flags |= B_INVAL | B_NOCACHE;
3393	jblocks = jseg->js_jblocks;
3394	/*
3395	 * Don't allow out-of-order completions.  If this isn't the first
3396	 * block, wait for it to write before we're done.
3397	 */
3398	if (jseg != jblocks->jb_writeseg)
3399		return;
3400	/* Iterate through available jsegs processing their entries. */
3401	do {
3402		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3403		jsegn = TAILQ_NEXT(jseg, js_next);
3404		complete_jseg(jseg);
3405		jseg = jsegn;
3406	} while (jseg && jseg->js_state & DEPCOMPLETE);
3407	jblocks->jb_writeseg = jseg;
3408	/*
3409	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3410	 */
3411	free_jsegs(jblocks);
3412}
3413
3414static inline struct jsegdep *
3415inoref_jseg(inoref)
3416	struct inoref *inoref;
3417{
3418	struct jsegdep *jsegdep;
3419
3420	jsegdep = inoref->if_jsegdep;
3421	inoref->if_jsegdep = NULL;
3422
3423	return (jsegdep);
3424}
3425
3426/*
3427 * Called once a jremref has made it to stable store.  The jremref is marked
3428 * complete and we attempt to free it.  Any pagedep writes sleeping while waiting
3429 * for the jremref to complete will be awoken by free_jremref.
3430 */
3431static void
3432handle_written_jremref(jremref)
3433	struct jremref *jremref;
3434{
3435	struct inodedep *inodedep;
3436	struct jsegdep *jsegdep;
3437	struct dirrem *dirrem;
3438
3439	/* Grab the jsegdep. */
3440	jsegdep = inoref_jseg(&jremref->jr_ref);
3441	/*
3442	 * Remove us from the inoref list.
3443	 */
3444	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3445	    0, &inodedep) == 0)
3446		panic("handle_written_jremref: Lost inodedep");
3447	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3448	/*
3449	 * Complete the dirrem.
3450	 */
3451	dirrem = jremref->jr_dirrem;
3452	jremref->jr_dirrem = NULL;
3453	LIST_REMOVE(jremref, jr_deps);
3454	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3455	jwork_insert(&dirrem->dm_jwork, jsegdep);
3456	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3457	    (dirrem->dm_state & COMPLETE) != 0)
3458		add_to_worklist(&dirrem->dm_list, 0);
3459	free_jremref(jremref);
3460}
3461
3462/*
3463 * Called once a jaddref has made it to stable store.  The dependency is
3464 * marked complete and any dependent structures are added to the inode
3465 * bufwait list to be completed as soon as it is written.  If a bitmap write
3466 * depends on this entry we move the inode into the inodedephd of the
3467 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3468 */
3469static void
3470handle_written_jaddref(jaddref)
3471	struct jaddref *jaddref;
3472{
3473	struct jsegdep *jsegdep;
3474	struct inodedep *inodedep;
3475	struct diradd *diradd;
3476	struct mkdir *mkdir;
3477
3478	/* Grab the jsegdep. */
3479	jsegdep = inoref_jseg(&jaddref->ja_ref);
3480	mkdir = NULL;
3481	diradd = NULL;
3482	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3483	    0, &inodedep) == 0)
3484		panic("handle_written_jaddref: Lost inodedep.");
3485	if (jaddref->ja_diradd == NULL)
3486		panic("handle_written_jaddref: No dependency");
3487	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3488		diradd = jaddref->ja_diradd;
3489		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3490	} else if (jaddref->ja_state & MKDIR_PARENT) {
3491		mkdir = jaddref->ja_mkdir;
3492		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3493	} else if (jaddref->ja_state & MKDIR_BODY)
3494		mkdir = jaddref->ja_mkdir;
3495	else
3496		panic("handle_written_jaddref: Unknown dependency %p",
3497		    jaddref->ja_diradd);
3498	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3499	/*
3500	 * Remove us from the inode list.
3501	 */
3502	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3503	/*
3504	 * The mkdir may be waiting on the jaddref to clear before freeing.
3505	 */
3506	if (mkdir) {
3507		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3508		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3509		    TYPENAME(mkdir->md_list.wk_type)));
3510		mkdir->md_jaddref = NULL;
3511		diradd = mkdir->md_diradd;
3512		mkdir->md_state |= DEPCOMPLETE;
3513		complete_mkdir(mkdir);
3514	}
3515	jwork_insert(&diradd->da_jwork, jsegdep);
3516	if (jaddref->ja_state & NEWBLOCK) {
3517		inodedep->id_state |= ONDEPLIST;
3518		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3519		    inodedep, id_deps);
3520	}
3521	free_jaddref(jaddref);
3522}
3523
3524/*
3525 * Called once a jnewblk journal record is written.  The allocdirect or allocindir
3526 * is placed in the bmsafemap to await notification of a written bitmap.  If
3527 * the operation was canceled we add the segdep to the appropriate
3528 * dependency to free the journal space once the canceling operation
3529 * completes.
3530 */
3531static void
3532handle_written_jnewblk(jnewblk)
3533	struct jnewblk *jnewblk;
3534{
3535	struct bmsafemap *bmsafemap;
3536	struct freefrag *freefrag;
3537	struct freework *freework;
3538	struct jsegdep *jsegdep;
3539	struct newblk *newblk;
3540
3541	/* Grab the jsegdep. */
3542	jsegdep = jnewblk->jn_jsegdep;
3543	jnewblk->jn_jsegdep = NULL;
3544	if (jnewblk->jn_dep == NULL)
3545		panic("handle_written_jnewblk: No dependency for the segdep.");
3546	switch (jnewblk->jn_dep->wk_type) {
3547	case D_NEWBLK:
3548	case D_ALLOCDIRECT:
3549	case D_ALLOCINDIR:
3550		/*
3551		 * Add the written block to the bmsafemap so it can
3552		 * be notified when the bitmap is on disk.
3553		 */
3554		newblk = WK_NEWBLK(jnewblk->jn_dep);
3555		newblk->nb_jnewblk = NULL;
3556		if ((newblk->nb_state & GOINGAWAY) == 0) {
3557			bmsafemap = newblk->nb_bmsafemap;
3558			newblk->nb_state |= ONDEPLIST;
3559			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3560			    nb_deps);
3561		}
3562		jwork_insert(&newblk->nb_jwork, jsegdep);
3563		break;
3564	case D_FREEFRAG:
3565		/*
3566		 * A newblock being removed by a freefrag when replaced by
3567		 * frag extension.
3568		 */
3569		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3570		freefrag->ff_jdep = NULL;
3571		WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3572		break;
3573	case D_FREEWORK:
3574		/*
3575		 * A direct block was removed by truncate.
3576		 */
3577		freework = WK_FREEWORK(jnewblk->jn_dep);
3578		freework->fw_jnewblk = NULL;
3579		WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork,
3580		    &jsegdep->jd_list);
3581		break;
3582	default:
3583		panic("handle_written_jnewblk: Unknown type %d.",
3584		    jnewblk->jn_dep->wk_type);
3585	}
3586	jnewblk->jn_dep = NULL;
3587	free_jnewblk(jnewblk);
3588}
3589
3590/*
3591 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3592 * an in-flight allocation that has not yet been committed.  Divorce us
3593 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3594 * to the worklist.
3595 */
3596static void
3597cancel_jfreefrag(jfreefrag)
3598	struct jfreefrag *jfreefrag;
3599{
3600	struct freefrag *freefrag;
3601
3602	if (jfreefrag->fr_jsegdep) {
3603		free_jsegdep(jfreefrag->fr_jsegdep);
3604		jfreefrag->fr_jsegdep = NULL;
3605	}
3606	freefrag = jfreefrag->fr_freefrag;
3607	jfreefrag->fr_freefrag = NULL;
3608	free_jfreefrag(jfreefrag);
3609	freefrag->ff_state |= DEPCOMPLETE;
3610}
3611
3612/*
3613 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3614 */
3615static void
3616free_jfreefrag(jfreefrag)
3617	struct jfreefrag *jfreefrag;
3618{
3619
3620	if (jfreefrag->fr_state & INPROGRESS)
3621		WORKLIST_REMOVE(&jfreefrag->fr_list);
3622	else if (jfreefrag->fr_state & ONWORKLIST)
3623		remove_from_journal(&jfreefrag->fr_list);
3624	if (jfreefrag->fr_freefrag != NULL)
3625		panic("free_jfreefrag: Still attached to a freefrag.");
3626	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3627}
3628
3629/*
3630 * Called when the journal write for a jfreefrag completes.  The parent
3631 * freefrag is added to the worklist if this completes its dependencies.
3632 */
3633static void
3634handle_written_jfreefrag(jfreefrag)
3635	struct jfreefrag *jfreefrag;
3636{
3637	struct jsegdep *jsegdep;
3638	struct freefrag *freefrag;
3639
3640	/* Grab the jsegdep. */
3641	jsegdep = jfreefrag->fr_jsegdep;
3642	jfreefrag->fr_jsegdep = NULL;
3643	freefrag = jfreefrag->fr_freefrag;
3644	if (freefrag == NULL)
3645		panic("handle_written_jfreefrag: No freefrag.");
3646	freefrag->ff_state |= DEPCOMPLETE;
3647	freefrag->ff_jdep = NULL;
3648	jwork_insert(&freefrag->ff_jwork, jsegdep);
3649	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3650		add_to_worklist(&freefrag->ff_list, 0);
3651	jfreefrag->fr_freefrag = NULL;
3652	free_jfreefrag(jfreefrag);
3653}
3654
3655/*
3656 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3657 * is removed from the freeblks list of pending journal writes and the
3658 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3659 * have been reclaimed.
3660 */
3661static void
3662handle_written_jblkdep(jblkdep)
3663	struct jblkdep *jblkdep;
3664{
3665	struct freeblks *freeblks;
3666	struct jsegdep *jsegdep;
3667
3668	/* Grab the jsegdep. */
3669	jsegdep = jblkdep->jb_jsegdep;
3670	jblkdep->jb_jsegdep = NULL;
3671	freeblks = jblkdep->jb_freeblks;
3672	LIST_REMOVE(jblkdep, jb_deps);
3673	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
3674	/*
3675	 * If the freeblks is all journaled, we can add it to the worklist.
3676	 */
3677	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3678	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3679		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3680
3681	free_jblkdep(jblkdep);
3682}
3683
3684static struct jsegdep *
3685newjsegdep(struct worklist *wk)
3686{
3687	struct jsegdep *jsegdep;
3688
3689	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3690	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3691	jsegdep->jd_seg = NULL;
3692
3693	return (jsegdep);
3694}
3695
3696static struct jmvref *
3697newjmvref(dp, ino, oldoff, newoff)
3698	struct inode *dp;
3699	ino_t ino;
3700	off_t oldoff;
3701	off_t newoff;
3702{
3703	struct jmvref *jmvref;
3704
3705	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3706	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3707	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3708	jmvref->jm_parent = dp->i_number;
3709	jmvref->jm_ino = ino;
3710	jmvref->jm_oldoff = oldoff;
3711	jmvref->jm_newoff = newoff;
3712
3713	return (jmvref);
3714}
3715
3716/*
3717 * Allocate a new jremref that tracks the removal of ip from dp with the
3718 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3719 * DEPCOMPLETE as we have all the information required for the journal write
3720 * and the directory entry has already been removed from the buffer.  The caller
3721 * is responsible for linking the jremref into the pagedep and adding it
3722 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3723 * a DOTDOT addition so handle_workitem_remove() can properly assign
3724 * the jsegdep when we're done.
3725 */
3726static struct jremref *
3727newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3728    off_t diroff, nlink_t nlink)
3729{
3730	struct jremref *jremref;
3731
3732	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3733	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3734	jremref->jr_state = ATTACHED;
3735	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3736	   nlink, ip->i_mode);
3737	jremref->jr_dirrem = dirrem;
3738
3739	return (jremref);
3740}
3741
3742static inline void
3743newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3744    nlink_t nlink, uint16_t mode)
3745{
3746
3747	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3748	inoref->if_diroff = diroff;
3749	inoref->if_ino = ino;
3750	inoref->if_parent = parent;
3751	inoref->if_nlink = nlink;
3752	inoref->if_mode = mode;
3753}
3754
3755/*
3756 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3757 * directory offset may not be known until later.  The caller is responsible
3758 * for adding the entry to the journal when this information is available.  nlink
3759 * should be the link count prior to the addition and mode is only required
3760 * to have the correct FMT.
3761 */
3762static struct jaddref *
3763newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3764    uint16_t mode)
3765{
3766	struct jaddref *jaddref;
3767
3768	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3769	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3770	jaddref->ja_state = ATTACHED;
3771	jaddref->ja_mkdir = NULL;
3772	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3773
3774	return (jaddref);
3775}
3776
3777/*
3778 * Create a new free dependency for a freework.  The caller is responsible
3779 * for adjusting the reference count when it has the lock held.  The freedep
3780 * will track an outstanding bitmap write that will ultimately clear the
3781 * freework to continue.
3782 */
3783static struct freedep *
3784newfreedep(struct freework *freework)
3785{
3786	struct freedep *freedep;
3787
3788	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3789	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3790	freedep->fd_freework = freework;
3791
3792	return (freedep);
3793}
3794
3795/*
3796 * Free a freedep structure once the buffer it is linked to is written.  If
3797 * this is the last reference to the freework schedule it for completion.
3798 */
3799static void
3800free_freedep(freedep)
3801	struct freedep *freedep;
3802{
3803	struct freework *freework;
3804
3805	freework = freedep->fd_freework;
3806	freework->fw_freeblks->fb_cgwait--;
3807	if (--freework->fw_ref == 0)
3808		freework_enqueue(freework);
3809	WORKITEM_FREE(freedep, D_FREEDEP);
3810}
3811
3812/*
3813 * Allocate a new freework structure that may be a level in an indirect
3814 * when parent is not NULL or a top level block when it is.  The top level
3815 * freework structures are allocated without lk held and before the freeblks
3816 * is visible outside of softdep_setup_freeblocks().
3817 */
3818static struct freework *
3819newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
3820	struct ufsmount *ump;
3821	struct freeblks *freeblks;
3822	struct freework *parent;
3823	ufs_lbn_t lbn;
3824	ufs2_daddr_t nb;
3825	int frags;
3826	int off;
3827	int journal;
3828{
3829	struct freework *freework;
3830
3831	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3832	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3833	freework->fw_state = ATTACHED;
3834	freework->fw_jnewblk = NULL;
3835	freework->fw_freeblks = freeblks;
3836	freework->fw_parent = parent;
3837	freework->fw_lbn = lbn;
3838	freework->fw_blkno = nb;
3839	freework->fw_frags = frags;
3840	freework->fw_indir = NULL;
3841	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
3842		? 0 : NINDIR(ump->um_fs) + 1;
3843	freework->fw_start = freework->fw_off = off;
3844	if (journal)
3845		newjfreeblk(freeblks, lbn, nb, frags);
3846	if (parent == NULL) {
3847		ACQUIRE_LOCK(&lk);
3848		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
3849		freeblks->fb_ref++;
3850		FREE_LOCK(&lk);
3851	}
3852
3853	return (freework);
3854}
3855
3856/*
3857 * Eliminate a jfreeblk for a block that does not need journaling.
3858 */
3859static void
3860cancel_jfreeblk(freeblks, blkno)
3861	struct freeblks *freeblks;
3862	ufs2_daddr_t blkno;
3863{
3864	struct jfreeblk *jfreeblk;
3865	struct jblkdep *jblkdep;
3866
3867	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
3868		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
3869			continue;
3870		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
3871		if (jfreeblk->jf_blkno == blkno)
3872			break;
3873	}
3874	if (jblkdep == NULL)
3875		return;
3876	free_jsegdep(jblkdep->jb_jsegdep);
3877	LIST_REMOVE(jblkdep, jb_deps);
3878	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3879}
3880
3881/*
3882 * Allocate a new jfreeblk to journal a top level block pointer when truncating
3883 * a file.  The caller must add this to the worklist when lk is held.
3884 */
3885static struct jfreeblk *
3886newjfreeblk(freeblks, lbn, blkno, frags)
3887	struct freeblks *freeblks;
3888	ufs_lbn_t lbn;
3889	ufs2_daddr_t blkno;
3890	int frags;
3891{
3892	struct jfreeblk *jfreeblk;
3893
3894	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
3895	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
3896	    freeblks->fb_list.wk_mp);
3897	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
3898	jfreeblk->jf_dep.jb_freeblks = freeblks;
3899	jfreeblk->jf_ino = freeblks->fb_inum;
3900	jfreeblk->jf_lbn = lbn;
3901	jfreeblk->jf_blkno = blkno;
3902	jfreeblk->jf_frags = frags;
3903	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
3904
3905	return (jfreeblk);
3906}
3907
3908/*
3909 * Allocate a new jtrunc to track a partial truncation.
3910 */
3911static struct jtrunc *
3912newjtrunc(freeblks, size, extsize)
3913	struct freeblks *freeblks;
3914	off_t size;
3915	int extsize;
3916{
3917	struct jtrunc *jtrunc;
3918
3919	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
3920	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
3921	    freeblks->fb_list.wk_mp);
3922	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
3923	jtrunc->jt_dep.jb_freeblks = freeblks;
3924	jtrunc->jt_ino = freeblks->fb_inum;
3925	jtrunc->jt_size = size;
3926	jtrunc->jt_extsize = extsize;
3927	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
3928
3929	return (jtrunc);
3930}
3931
3932/*
3933 * If we're canceling a new bitmap we have to search for another ref
3934 * to move into the bmsafemap dep.  This might be better expressed
3935 * with another structure.
3936 */
3937static void
3938move_newblock_dep(jaddref, inodedep)
3939	struct jaddref *jaddref;
3940	struct inodedep *inodedep;
3941{
3942	struct inoref *inoref;
3943	struct jaddref *jaddrefn;
3944
3945	jaddrefn = NULL;
3946	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3947	    inoref = TAILQ_NEXT(inoref, if_deps)) {
3948		if ((jaddref->ja_state & NEWBLOCK) &&
3949		    inoref->if_list.wk_type == D_JADDREF) {
3950			jaddrefn = (struct jaddref *)inoref;
3951			break;
3952		}
3953	}
3954	if (jaddrefn == NULL)
3955		return;
3956	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
3957	jaddrefn->ja_state |= jaddref->ja_state &
3958	    (ATTACHED | UNDONE | NEWBLOCK);
3959	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
3960	jaddref->ja_state |= ATTACHED;
3961	LIST_REMOVE(jaddref, ja_bmdeps);
3962	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
3963	    ja_bmdeps);
3964}
3965
3966/*
3967 * Cancel a jaddref either before it has been written or while it is being
3968 * written.  This happens when a link is removed before the add reaches
3969 * the disk.  The jaddref dependency is kept linked into the bmsafemap
3970 * and inode to prevent the link count or bitmap from reaching the disk
3971 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
3972 * required.
3973 *
3974 * Returns 1 if the canceled addref requires journaling of the remove and
3975 * 0 otherwise.
3976 */
3977static int
3978cancel_jaddref(jaddref, inodedep, wkhd)
3979	struct jaddref *jaddref;
3980	struct inodedep *inodedep;
3981	struct workhead *wkhd;
3982{
3983	struct inoref *inoref;
3984	struct jsegdep *jsegdep;
3985	int needsj;
3986
3987	KASSERT((jaddref->ja_state & COMPLETE) == 0,
3988	    ("cancel_jaddref: Canceling complete jaddref"));
3989	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
3990		needsj = 1;
3991	else
3992		needsj = 0;
3993	if (inodedep == NULL)
3994		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3995		    0, &inodedep) == 0)
3996			panic("cancel_jaddref: Lost inodedep");
3997	/*
3998	 * We must adjust the nlink of any reference operation that follows
3999	 * us so that it is consistent with the in-memory reference.  This
4000	 * ensures that inode nlink rollbacks always have the correct link.
4001	 */
4002	if (needsj == 0) {
4003		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4004		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4005			if (inoref->if_state & GOINGAWAY)
4006				break;
4007			inoref->if_nlink--;
4008		}
4009	}
4010	jsegdep = inoref_jseg(&jaddref->ja_ref);
4011	if (jaddref->ja_state & NEWBLOCK)
4012		move_newblock_dep(jaddref, inodedep);
4013	wake_worklist(&jaddref->ja_list);
4014	jaddref->ja_mkdir = NULL;
4015	if (jaddref->ja_state & INPROGRESS) {
4016		jaddref->ja_state &= ~INPROGRESS;
4017		WORKLIST_REMOVE(&jaddref->ja_list);
4018		jwork_insert(wkhd, jsegdep);
4019	} else {
4020		free_jsegdep(jsegdep);
4021		if (jaddref->ja_state & DEPCOMPLETE)
4022			remove_from_journal(&jaddref->ja_list);
4023	}
4024	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4025	/*
4026	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4027	 * can arrange for them to be freed with the bitmap.  Otherwise we
4028	 * no longer need this addref attached to the inoreflst and it
4029	 * will incorrectly adjust nlink if we leave it.
4030	 */
4031	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4032		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4033		    if_deps);
4034		jaddref->ja_state |= COMPLETE;
4035		free_jaddref(jaddref);
4036		return (needsj);
4037	}
4038	/*
4039	 * Leave the head of the list for jsegdeps for fast merging.
4040	 */
4041	if (LIST_FIRST(wkhd) != NULL) {
4042		jaddref->ja_state |= ONWORKLIST;
4043		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4044	} else
4045		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4046
4047	return (needsj);
4048}
4049
4050/*
4051 * Attempt to free a jaddref structure when some work completes.  This
4052 * should only succeed once the entry is written and all dependencies have
4053 * been notified.
4054 */
4055static void
4056free_jaddref(jaddref)
4057	struct jaddref *jaddref;
4058{
4059
4060	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4061		return;
4062	if (jaddref->ja_ref.if_jsegdep)
4063		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4064		    jaddref, jaddref->ja_state);
4065	if (jaddref->ja_state & NEWBLOCK)
4066		LIST_REMOVE(jaddref, ja_bmdeps);
4067	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4068		panic("free_jaddref: Bad state %p(0x%X)",
4069		    jaddref, jaddref->ja_state);
4070	if (jaddref->ja_mkdir != NULL)
4071		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4072	WORKITEM_FREE(jaddref, D_JADDREF);
4073}
4074
4075/*
4076 * Free a jremref structure once it has been written or discarded.
4077 */
4078static void
4079free_jremref(jremref)
4080	struct jremref *jremref;
4081{
4082
4083	if (jremref->jr_ref.if_jsegdep)
4084		free_jsegdep(jremref->jr_ref.if_jsegdep);
4085	if (jremref->jr_state & INPROGRESS)
4086		panic("free_jremref: IO still pending");
4087	WORKITEM_FREE(jremref, D_JREMREF);
4088}
4089
4090/*
4091 * Free a jnewblk structure.
4092 */
4093static void
4094free_jnewblk(jnewblk)
4095	struct jnewblk *jnewblk;
4096{
4097
4098	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4099		return;
4100	LIST_REMOVE(jnewblk, jn_deps);
4101	if (jnewblk->jn_dep != NULL)
4102		panic("free_jnewblk: Dependency still attached.");
4103	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4104}
4105
4106/*
4107 * Cancel a jnewblk which has been made redundant by frag extension.
4108 */
4109static void
4110cancel_jnewblk(jnewblk, wkhd)
4111	struct jnewblk *jnewblk;
4112	struct workhead *wkhd;
4113{
4114	struct jsegdep *jsegdep;
4115
4116	jsegdep = jnewblk->jn_jsegdep;
4117	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4118		panic("cancel_jnewblk: Invalid state");
4119	jnewblk->jn_jsegdep  = NULL;
4120	jnewblk->jn_dep = NULL;
4121	jnewblk->jn_state |= GOINGAWAY;
4122	if (jnewblk->jn_state & INPROGRESS) {
4123		jnewblk->jn_state &= ~INPROGRESS;
4124		WORKLIST_REMOVE(&jnewblk->jn_list);
4125		jwork_insert(wkhd, jsegdep);
4126	} else {
4127		free_jsegdep(jsegdep);
4128		remove_from_journal(&jnewblk->jn_list);
4129	}
4130	wake_worklist(&jnewblk->jn_list);
4131	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4132}
4133
4134static void
4135free_jblkdep(jblkdep)
4136	struct jblkdep *jblkdep;
4137{
4138
4139	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4140		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4141	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4142		WORKITEM_FREE(jblkdep, D_JTRUNC);
4143	else
4144		panic("free_jblkdep: Unexpected type %s",
4145		    TYPENAME(jblkdep->jb_list.wk_type));
4146}
4147
4148/*
4149 * Free a single jseg once it is no longer referenced in memory or on
4150 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4151 * to disappear.
4152 */
4153static void
4154free_jseg(jseg, jblocks)
4155	struct jseg *jseg;
4156	struct jblocks *jblocks;
4157{
4158	struct freework *freework;
4159
4160	/*
4161	 * Free freework structures that were lingering to indicate freed
4162	 * indirect blocks that forced journal write ordering on reallocate.
4163	 */
4164	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4165		indirblk_remove(freework);
4166	if (jblocks->jb_oldestseg == jseg)
4167		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4168	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4169	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4170	KASSERT(LIST_EMPTY(&jseg->js_entries),
4171	    ("free_jseg: Freed jseg has valid entries."));
4172	WORKITEM_FREE(jseg, D_JSEG);
4173}
4174
4175/*
4176 * Free all jsegs that meet the criteria for being reclaimed and update
4177 * oldestseg.
4178 */
4179static void
4180free_jsegs(jblocks)
4181	struct jblocks *jblocks;
4182{
4183	struct jseg *jseg;
4184
4185	/*
4186	 * Free only those jsegs which have none allocated before them to
4187	 * preserve the journal space ordering.
4188	 */
4189	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4190		/*
4191		 * Only reclaim space when nothing depends on this journal
4192		 * set and another set has written that it is no longer
4193		 * valid.
4194		 */
4195		if (jseg->js_refs != 0) {
4196			jblocks->jb_oldestseg = jseg;
4197			return;
4198		}
4199		if (!LIST_EMPTY(&jseg->js_indirs) &&
4200		    jseg->js_seq >= jblocks->jb_oldestwrseq)
4201			break;
4202		free_jseg(jseg, jblocks);
4203	}
4204	/*
4205	 * If we exited the loop above we still must discover the
4206	 * If we exited the loop above, we still must discover the
4207	 */
4208	if (jseg)
4209		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4210		     jseg = TAILQ_NEXT(jseg, js_next))
4211			if (jseg->js_refs != 0)
4212				break;
4213	jblocks->jb_oldestseg = jseg;
4214	/*
4215	 * The journal has no valid records but some jsegs may still be
4216	 * waiting on oldestwrseq to advance.  We force a small record
4217	 * out to permit these lingering records to be reclaimed.
4218	 */
4219	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4220		jblocks->jb_needseg = 1;
4221}
4222
4223/*
4224 * Release one reference to a jseg and free it if the count reaches 0.  This
4225 * should eventually reclaim journal space as well.
4226 */
4227static void
4228rele_jseg(jseg)
4229	struct jseg *jseg;
4230{
4231
4232	KASSERT(jseg->js_refs > 0,
4233	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4234	if (--jseg->js_refs != 0)
4235		return;
4236	free_jsegs(jseg->js_jblocks);
4237}
4238
4239/*
4240 * Release a jsegdep and decrement the jseg count.
4241 */
4242static void
4243free_jsegdep(jsegdep)
4244	struct jsegdep *jsegdep;
4245{
4246
4247	if (jsegdep->jd_seg)
4248		rele_jseg(jsegdep->jd_seg);
4249	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4250}
4251
4252/*
4253 * Wait for a journal item to make it to disk.  Initiate journal processing
4254 * if required.
4255 */
4256static int
4257jwait(wk, waitfor)
4258	struct worklist *wk;
4259	int waitfor;
4260{
4261
4262	/*
4263	 * Blocking journal waits cause slow synchronous behavior.  Record
4264	 * stats on the frequency of these blocking operations.
4265	 */
4266	if (waitfor == MNT_WAIT) {
4267		stat_journal_wait++;
4268		switch (wk->wk_type) {
4269		case D_JREMREF:
4270		case D_JMVREF:
4271			stat_jwait_filepage++;
4272			break;
4273		case D_JTRUNC:
4274		case D_JFREEBLK:
4275			stat_jwait_freeblks++;
4276			break;
4277		case D_JNEWBLK:
4278			stat_jwait_newblk++;
4279			break;
4280		case D_JADDREF:
4281			stat_jwait_inode++;
4282			break;
4283		default:
4284			break;
4285		}
4286	}
4287	/*
4288	 * If IO has not started we process the journal.  We can't mark the
4289	 * worklist item as IOWAITING because we drop the lock while
4290	 * processing the journal and the worklist entry may be freed after
4291	 * this point.  The caller may call back in and re-issue the request.
4292	 */
4293	if ((wk->wk_state & INPROGRESS) == 0) {
4294		softdep_process_journal(wk->wk_mp, wk, waitfor);
4295		if (waitfor != MNT_WAIT)
4296			return (EBUSY);
4297		return (0);
4298	}
4299	if (waitfor != MNT_WAIT)
4300		return (EBUSY);
4301	wait_worklist(wk, "jwait");
4302	return (0);
4303}
4304
4305/*
4306 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4307 * appropriate.  This is a convenience function to reduce duplicate code
4308 * for the setup and revert functions below.
4309 */
4310static struct inodedep *
4311inodedep_lookup_ip(ip)
4312	struct inode *ip;
4313{
4314	struct inodedep *inodedep;
4315	int dflags;
4316
4317	KASSERT(ip->i_nlink >= ip->i_effnlink,
4318	    ("inodedep_lookup_ip: bad delta"));
4319	dflags = DEPALLOC;
4320	if (IS_SNAPSHOT(ip))
4321		dflags |= NODELAY;
4322	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4323	    &inodedep);
4324	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4325	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4326
4327	return (inodedep);
4328}
4329
4330/*
4331 * Called prior to creating a new inode and linking it to a directory.  The
4332 * jaddref structure must already be allocated by softdep_setup_inomapdep
4333 * and it is discovered here so we can initialize the mode and update
4334 * nlinkdelta.
4335 */
4336void
4337softdep_setup_create(dp, ip)
4338	struct inode *dp;
4339	struct inode *ip;
4340{
4341	struct inodedep *inodedep;
4342	struct jaddref *jaddref;
4343	struct vnode *dvp;
4344
4345	KASSERT(ip->i_nlink == 1,
4346	    ("softdep_setup_create: Invalid link count."));
4347	dvp = ITOV(dp);
4348	ACQUIRE_LOCK(&lk);
4349	inodedep = inodedep_lookup_ip(ip);
4350	if (DOINGSUJ(dvp)) {
4351		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4352		    inoreflst);
4353		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4354		    ("softdep_setup_create: No addref structure present."));
4355	}
4356	softdep_prelink(dvp, NULL);
4357	FREE_LOCK(&lk);
4358}
4359
4360/*
4361 * Create a jaddref structure to track the addition of a DOTDOT link when
4362 * we are reparenting an inode as part of a rename.  This jaddref will be
4363 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4364 * non-journaling softdep.
4365 */
4366void
4367softdep_setup_dotdot_link(dp, ip)
4368	struct inode *dp;
4369	struct inode *ip;
4370{
4371	struct inodedep *inodedep;
4372	struct jaddref *jaddref;
4373	struct vnode *dvp;
4374	struct vnode *vp;
4375
4376	dvp = ITOV(dp);
4377	vp = ITOV(ip);
4378	jaddref = NULL;
4379	/*
4380	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4381	 * is used as a normal link would be.
4382	 */
4383	if (DOINGSUJ(dvp))
4384		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4385		    dp->i_effnlink - 1, dp->i_mode);
4386	ACQUIRE_LOCK(&lk);
4387	inodedep = inodedep_lookup_ip(dp);
4388	if (jaddref)
4389		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4390		    if_deps);
4391	softdep_prelink(dvp, ITOV(ip));
4392	FREE_LOCK(&lk);
4393}
4394
4395/*
4396 * Create a jaddref structure to track a new link to an inode.  The directory
4397 * offset is not known until softdep_setup_directory_add or
4398 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4399 * softdep.
4400 */
4401void
4402softdep_setup_link(dp, ip)
4403	struct inode *dp;
4404	struct inode *ip;
4405{
4406	struct inodedep *inodedep;
4407	struct jaddref *jaddref;
4408	struct vnode *dvp;
4409
4410	dvp = ITOV(dp);
4411	jaddref = NULL;
4412	if (DOINGSUJ(dvp))
4413		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4414		    ip->i_mode);
4415	ACQUIRE_LOCK(&lk);
4416	inodedep = inodedep_lookup_ip(ip);
4417	if (jaddref)
4418		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4419		    if_deps);
4420	softdep_prelink(dvp, ITOV(ip));
4421	FREE_LOCK(&lk);
4422}
4423
4424/*
4425 * Called to create the jaddref structures to track . and .. references as
4426 * well as lookup and further initialize the incomplete jaddref created
4427 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4428 * nlinkdelta for non-journaling softdep.
4429 */
4430void
4431softdep_setup_mkdir(dp, ip)
4432	struct inode *dp;
4433	struct inode *ip;
4434{
4435	struct inodedep *inodedep;
4436	struct jaddref *dotdotaddref;
4437	struct jaddref *dotaddref;
4438	struct jaddref *jaddref;
4439	struct vnode *dvp;
4440
4441	dvp = ITOV(dp);
4442	dotaddref = dotdotaddref = NULL;
4443	if (DOINGSUJ(dvp)) {
4444		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4445		    ip->i_mode);
4446		dotaddref->ja_state |= MKDIR_BODY;
4447		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4448		    dp->i_effnlink - 1, dp->i_mode);
4449		dotdotaddref->ja_state |= MKDIR_PARENT;
4450	}
4451	ACQUIRE_LOCK(&lk);
4452	inodedep = inodedep_lookup_ip(ip);
4453	if (DOINGSUJ(dvp)) {
4454		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4455		    inoreflst);
4456		KASSERT(jaddref != NULL,
4457		    ("softdep_setup_mkdir: No addref structure present."));
4458		KASSERT(jaddref->ja_parent == dp->i_number,
4459		    ("softdep_setup_mkdir: bad parent %ju",
4460		    (uintmax_t)jaddref->ja_parent));
4461		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4462		    if_deps);
4463	}
4464	inodedep = inodedep_lookup_ip(dp);
4465	if (DOINGSUJ(dvp))
4466		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4467		    &dotdotaddref->ja_ref, if_deps);
4468	softdep_prelink(ITOV(dp), NULL);
4469	FREE_LOCK(&lk);
4470}
4471
4472/*
4473 * Called to track nlinkdelta of the inode and parent directories prior to
4474 * unlinking a directory.
4475 */
4476void
4477softdep_setup_rmdir(dp, ip)
4478	struct inode *dp;
4479	struct inode *ip;
4480{
4481	struct vnode *dvp;
4482
4483	dvp = ITOV(dp);
4484	ACQUIRE_LOCK(&lk);
4485	(void) inodedep_lookup_ip(ip);
4486	(void) inodedep_lookup_ip(dp);
4487	softdep_prelink(dvp, ITOV(ip));
4488	FREE_LOCK(&lk);
4489}
4490
4491/*
4492 * Called to track nlinkdelta of the inode and parent directories prior to
4493 * unlink.
4494 */
4495void
4496softdep_setup_unlink(dp, ip)
4497	struct inode *dp;
4498	struct inode *ip;
4499{
4500	struct vnode *dvp;
4501
4502	dvp = ITOV(dp);
4503	ACQUIRE_LOCK(&lk);
4504	(void) inodedep_lookup_ip(ip);
4505	(void) inodedep_lookup_ip(dp);
4506	softdep_prelink(dvp, ITOV(ip));
4507	FREE_LOCK(&lk);
4508}
4509
4510/*
4511 * Called to release the journal structures created by a failed non-directory
4512 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4513 */
4514void
4515softdep_revert_create(dp, ip)
4516	struct inode *dp;
4517	struct inode *ip;
4518{
4519	struct inodedep *inodedep;
4520	struct jaddref *jaddref;
4521	struct vnode *dvp;
4522
4523	dvp = ITOV(dp);
4524	ACQUIRE_LOCK(&lk);
4525	inodedep = inodedep_lookup_ip(ip);
4526	if (DOINGSUJ(dvp)) {
4527		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4528		    inoreflst);
4529		KASSERT(jaddref->ja_parent == dp->i_number,
4530		    ("softdep_revert_create: addref parent mismatch"));
4531		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4532	}
4533	FREE_LOCK(&lk);
4534}
4535
4536/*
4537 * Called to release the journal structures created by a failed dotdot link
4538 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4539 */
4540void
4541softdep_revert_dotdot_link(dp, ip)
4542	struct inode *dp;
4543	struct inode *ip;
4544{
4545	struct inodedep *inodedep;
4546	struct jaddref *jaddref;
4547	struct vnode *dvp;
4548
4549	dvp = ITOV(dp);
4550	ACQUIRE_LOCK(&lk);
4551	inodedep = inodedep_lookup_ip(dp);
4552	if (DOINGSUJ(dvp)) {
4553		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4554		    inoreflst);
4555		KASSERT(jaddref->ja_parent == ip->i_number,
4556		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4557		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4558	}
4559	FREE_LOCK(&lk);
4560}
4561
4562/*
4563 * Called to release the journal structures created by a failed link
4564 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4565 */
4566void
4567softdep_revert_link(dp, ip)
4568	struct inode *dp;
4569	struct inode *ip;
4570{
4571	struct inodedep *inodedep;
4572	struct jaddref *jaddref;
4573	struct vnode *dvp;
4574
4575	dvp = ITOV(dp);
4576	ACQUIRE_LOCK(&lk);
4577	inodedep = inodedep_lookup_ip(ip);
4578	if (DOINGSUJ(dvp)) {
4579		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4580		    inoreflst);
4581		KASSERT(jaddref->ja_parent == dp->i_number,
4582		    ("softdep_revert_link: addref parent mismatch"));
4583		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4584	}
4585	FREE_LOCK(&lk);
4586}
4587
4588/*
4589 * Called to release the journal structures created by a failed mkdir
4590 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4591 */
4592void
4593softdep_revert_mkdir(dp, ip)
4594	struct inode *dp;
4595	struct inode *ip;
4596{
4597	struct inodedep *inodedep;
4598	struct jaddref *jaddref;
4599	struct jaddref *dotaddref;
4600	struct vnode *dvp;
4601
4602	dvp = ITOV(dp);
4603
4604	ACQUIRE_LOCK(&lk);
4605	inodedep = inodedep_lookup_ip(dp);
4606	if (DOINGSUJ(dvp)) {
4607		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4608		    inoreflst);
4609		KASSERT(jaddref->ja_parent == ip->i_number,
4610		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4611		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4612	}
4613	inodedep = inodedep_lookup_ip(ip);
4614	if (DOINGSUJ(dvp)) {
4615		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4616		    inoreflst);
4617		KASSERT(jaddref->ja_parent == dp->i_number,
4618		    ("softdep_revert_mkdir: addref parent mismatch"));
4619		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4620		    inoreflst, if_deps);
4621		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4622		KASSERT(dotaddref->ja_parent == ip->i_number,
4623		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4624		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4625	}
4626	FREE_LOCK(&lk);
4627}
4628
4629/*
4630 * Called to correct nlinkdelta after a failed rmdir.
4631 */
4632void
4633softdep_revert_rmdir(dp, ip)
4634	struct inode *dp;
4635	struct inode *ip;
4636{
4637
4638	ACQUIRE_LOCK(&lk);
4639	(void) inodedep_lookup_ip(ip);
4640	(void) inodedep_lookup_ip(dp);
4641	FREE_LOCK(&lk);
4642}
4643
4644/*
4645 * Protecting the freemaps (or bitmaps).
4646 *
4647 * To eliminate the need to execute fsck before mounting a filesystem
4648 * after a power failure, one must (conservatively) guarantee that the
4649 * on-disk copy of the bitmaps never indicates that a live inode or block is
4650 * free.  So, when a block or inode is allocated, the bitmap should be
4651 * updated (on disk) before any new pointers.  When a block or inode is
4652 * freed, the bitmap should not be updated until all pointers have been
4653 * reset.  The latter dependency is handled by the delayed de-allocation
4654 * approach described below for block and inode de-allocation.  The former
4655 * dependency is handled by calling the following procedure when a block or
4656 * inode is allocated. When an inode is allocated an "inodedep" is created
4657 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4658 * Each "inodedep" is also inserted into the hash indexing structure so
4659 * that any additional link additions can be made dependent on the inode
4660 * allocation.
4661 *
4662 * The ufs filesystem maintains a number of free block counts (e.g., per
4663 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4664 * in addition to the bitmaps.  These counts are used to improve efficiency
4665 * during allocation and therefore must be consistent with the bitmaps.
4666 * There is no convenient way to guarantee post-crash consistency of these
4667 * counts with simple update ordering, for two main reasons: (1) The counts
4668 * and bitmaps for a single cylinder group block are not in the same disk
4669 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4670 * be written and the other not.  (2) Some of the counts are located in the
4671 * superblock rather than the cylinder group block. So, we focus our soft
4672 * updates implementation on protecting the bitmaps. When mounting a
4673 * filesystem, we recompute the auxiliary counts from the bitmaps.
4674 */
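
/*
 * A sketch of the resulting write ordering (illustration only):
 *
 *	allocate: cg bitmap block reaches disk --> new pointer written to disk
 *	free:	  pointers cleared on disk     --> cg bitmap block updated
 */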
4675
4676/*
4677 * Called just after updating the cylinder group block to allocate an inode.
4678 */
4679void
4680softdep_setup_inomapdep(bp, ip, newinum, mode)
4681	struct buf *bp;		/* buffer for cylgroup block with inode map */
4682	struct inode *ip;	/* inode related to allocation */
4683	ino_t newinum;		/* new inode number being allocated */
4684	int mode;
4685{
4686	struct inodedep *inodedep;
4687	struct bmsafemap *bmsafemap;
4688	struct jaddref *jaddref;
4689	struct mount *mp;
4690	struct fs *fs;
4691
4692	mp = UFSTOVFS(ip->i_ump);
4693	fs = ip->i_ump->um_fs;
4694	jaddref = NULL;
4695
4696	/*
4697	 * Allocate the journal reference add structure so that the bitmap
4698	 * can be dependent on it.
4699	 */
4700	if (MOUNTEDSUJ(mp)) {
4701		jaddref = newjaddref(ip, newinum, 0, 0, mode);
4702		jaddref->ja_state |= NEWBLOCK;
4703	}
4704
4705	/*
4706	 * Create a dependency for the newly allocated inode.
4707	 * Panic if it already exists as something is seriously wrong.
4708	 * Otherwise add it to the dependency list for the buffer holding
4709	 * the cylinder group map from which it was allocated.
4710	 *
4711	 * We have to preallocate a bmsafemap entry in case it is needed
4712	 * in bmsafemap_lookup since once we allocate the inodedep, we
4713	 * have to finish initializing it before we can FREE_LOCK().
4714	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
4715	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
4716	 * creating the inodedep as it can be freed during the time
4717	 * that we FREE_LOCK() while allocating the inodedep. We must
4718	 * call workitem_alloc() before entering the locked section as
4719 * it also acquires the lock and we must avoid trying to do so
4720	 * recursively.
4721	 */
4722	bmsafemap = malloc(sizeof(struct bmsafemap),
4723	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4724	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4725	ACQUIRE_LOCK(&lk);
4726	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
4727		panic("softdep_setup_inomapdep: dependency %p for new "
4728		    "inode already exists", inodedep);
4729	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
4730	if (jaddref) {
4731		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4732		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4733		    if_deps);
4734	} else {
4735		inodedep->id_state |= ONDEPLIST;
4736		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4737	}
4738	inodedep->id_bmsafemap = bmsafemap;
4739	inodedep->id_state &= ~DEPCOMPLETE;
4740	FREE_LOCK(&lk);
4741}
4742
4743/*
4744 * Called just after updating the cylinder group block to
4745 * allocate block or fragment.
4746 */
4747void
4748softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4749	struct buf *bp;		/* buffer for cylgroup block with block map */
4750	struct mount *mp;	/* filesystem doing allocation */
4751	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4752	int frags;		/* Number of fragments. */
4753	int oldfrags;		/* Previous number of fragments for extend. */
4754{
4755	struct newblk *newblk;
4756	struct bmsafemap *bmsafemap;
4757	struct jnewblk *jnewblk;
4758	struct fs *fs;
4759
4760	fs = VFSTOUFS(mp)->um_fs;
4761	jnewblk = NULL;
4762	/*
4763	 * Create a dependency for the newly allocated block.
4764	 * Add it to the dependency list for the buffer holding
4765	 * the cylinder group map from which it was allocated.
4766	 */
4767	if (MOUNTEDSUJ(mp)) {
4768		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4769		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4770		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4771		jnewblk->jn_state = ATTACHED;
4772		jnewblk->jn_blkno = newblkno;
4773		jnewblk->jn_frags = frags;
4774		jnewblk->jn_oldfrags = oldfrags;
4775#ifdef SUJ_DEBUG
4776		{
4777			struct cg *cgp;
4778			uint8_t *blksfree;
4779			long bno;
4780			int i;
4781
4782			cgp = (struct cg *)bp->b_data;
4783			blksfree = cg_blksfree(cgp);
4784			bno = dtogd(fs, jnewblk->jn_blkno);
4785			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4786			    i++) {
4787				if (isset(blksfree, bno + i))
4788					panic("softdep_setup_blkmapdep: "
4789					    "free fragment %d from %d-%d "
4790					    "state 0x%X dep %p", i,
4791					    jnewblk->jn_oldfrags,
4792					    jnewblk->jn_frags,
4793					    jnewblk->jn_state,
4794					    jnewblk->jn_dep);
4795			}
4796		}
4797#endif
4798	}
4799	ACQUIRE_LOCK(&lk);
4800	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4801		panic("softdep_setup_blkmapdep: found block");
4802	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4803	    dtog(fs, newblkno), NULL);
4804	if (jnewblk) {
4805		jnewblk->jn_dep = (struct worklist *)newblk;
4806		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4807	} else {
4808		newblk->nb_state |= ONDEPLIST;
4809		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4810	}
4811	newblk->nb_bmsafemap = bmsafemap;
4812	newblk->nb_jnewblk = jnewblk;
4813	FREE_LOCK(&lk);
4814}
4815
4816#define	BMSAFEMAP_HASH(fs, cg) \
4817      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4818
4819static int
4820bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4821	struct bmsafemap_hashhead *bmsafemaphd;
4822	struct mount *mp;
4823	int cg;
4824	struct bmsafemap **bmsafemapp;
4825{
4826	struct bmsafemap *bmsafemap;
4827
4828	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4829		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4830			break;
4831	if (bmsafemap) {
4832		*bmsafemapp = bmsafemap;
4833		return (1);
4834	}
4835	*bmsafemapp = NULL;
4836
4837	return (0);
4838}
4839
4840/*
4841 * Find the bmsafemap associated with a cylinder group buffer.
4842 * If none exists, create one. The buffer must be locked when
4843 * this routine is called and this routine must be called with
4844 * the softdep lock held. To avoid giving up the lock while
4845 * allocating a new bmsafemap, a preallocated bmsafemap may be
4846 * provided. If it is provided but not needed, it is freed.
4847 */
4848static struct bmsafemap *
4849bmsafemap_lookup(mp, bp, cg, newbmsafemap)
4850	struct mount *mp;
4851	struct buf *bp;
4852	int cg;
4853	struct bmsafemap *newbmsafemap;
4854{
4855	struct bmsafemap_hashhead *bmsafemaphd;
4856	struct bmsafemap *bmsafemap, *collision;
4857	struct worklist *wk;
4858	struct fs *fs;
4859
4860	mtx_assert(&lk, MA_OWNED);
4861	if (bp)
4862		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4863			if (wk->wk_type == D_BMSAFEMAP) {
4864				if (newbmsafemap)
4865					WORKITEM_FREE(newbmsafemap,D_BMSAFEMAP);
4866				return (WK_BMSAFEMAP(wk));
4867			}
4868	fs = VFSTOUFS(mp)->um_fs;
4869	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4870	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) {
4871		if (newbmsafemap)
4872			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
4873		return (bmsafemap);
4874	}
4875	if (newbmsafemap) {
4876		bmsafemap = newbmsafemap;
4877	} else {
4878		FREE_LOCK(&lk);
4879		bmsafemap = malloc(sizeof(struct bmsafemap),
4880			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4881		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4882		ACQUIRE_LOCK(&lk);
4883	}
4884	bmsafemap->sm_buf = bp;
4885	LIST_INIT(&bmsafemap->sm_inodedephd);
4886	LIST_INIT(&bmsafemap->sm_inodedepwr);
4887	LIST_INIT(&bmsafemap->sm_newblkhd);
4888	LIST_INIT(&bmsafemap->sm_newblkwr);
4889	LIST_INIT(&bmsafemap->sm_jaddrefhd);
4890	LIST_INIT(&bmsafemap->sm_jnewblkhd);
4891	LIST_INIT(&bmsafemap->sm_freehd);
4892	LIST_INIT(&bmsafemap->sm_freewr);
4893	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
4894		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4895		return (collision);
4896	}
4897	bmsafemap->sm_cg = cg;
4898	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
4899	LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next);
4900	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
4901	return (bmsafemap);
4902}
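
/*
 * The lookup routine above drops the softdep lock to allocate, re-runs
 * the lookup after reacquiring it, and discards the new structure if
 * another thread raced in.  A minimal, non-compiled sketch of the same
 * pattern in plain C with a pthread mutex (hypothetical names; not part
 * of the soft updates code):
 */
#if 0
#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct entry {
	LIST_ENTRY(entry) link;
	int key;
};
static LIST_HEAD(, entry) table = LIST_HEAD_INITIALIZER(table);
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Find an entry by key; the lock must be held by the caller. */
static struct entry *
find_locked(int key)
{
	struct entry *ep;

	LIST_FOREACH(ep, &table, link)
		if (ep->key == key)
			return (ep);
	return (NULL);
}

/* Return the entry for key, creating it if needed. */
static struct entry *
lookup_or_create(int key)
{
	struct entry *ep, *nep;

	pthread_mutex_lock(&table_lock);
	if ((ep = find_locked(key)) != NULL) {
		pthread_mutex_unlock(&table_lock);
		return (ep);
	}
	/* Drop the lock across the (possibly sleeping) allocation. */
	pthread_mutex_unlock(&table_lock);
	if ((nep = calloc(1, sizeof(*nep))) == NULL)
		return (NULL);
	nep->key = key;
	pthread_mutex_lock(&table_lock);
	/* Re-check: another thread may have inserted key while we slept. */
	if ((ep = find_locked(key)) != NULL) {
		free(nep);		/* lost the race; discard our copy */
	} else {
		LIST_INSERT_HEAD(&table, nep, link);
		ep = nep;
	}
	pthread_mutex_unlock(&table_lock);
	return (ep);
}
#endif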
4903
4904/*
4905 * Direct block allocation dependencies.
4906 *
4907 * When a new block is allocated, the corresponding disk locations must be
4908 * initialized (with zeros or new data) before the on-disk inode points to
4909 * them.  Also, the freemap from which the block was allocated must be
4910 * updated (on disk) before the inode's pointer. These two dependencies are
4911 * independent of each other and are needed for all file blocks and indirect
4912 * blocks that are pointed to directly by the inode.  Just before the
4913 * "in-core" version of the inode is updated with a newly allocated block
4914 * number, a procedure (below) is called to setup allocation dependency
4915 * structures.  These structures are removed when the corresponding
4916 * dependencies are satisfied or when the block allocation becomes obsolete
4917 * (i.e., the file is deleted, the block is de-allocated, or the block is a
4918 * fragment that gets upgraded).  All of these cases are handled in
4919 * procedures described later.
4920 *
4921 * When a file extension causes a fragment to be upgraded, either to a larger
4922 * fragment or to a full block, the on-disk location may change (if the
4923 * previous fragment could not simply be extended). In this case, the old
4924 * fragment must be de-allocated, but not until after the inode's pointer has
4925 * been updated. In most cases, this is handled by later procedures, which
4926 * will construct a "freefrag" structure to be added to the workitem queue
4927 * when the inode update is complete (or obsolete).  The main exception to
4928 * this is when an allocation occurs while a pending allocation dependency
4929 * (for the same block pointer) remains.  This case is handled in the main
4930 * allocation dependency setup procedure by immediately freeing the
4931 * unreferenced fragments.
4932 */
4933void
4934softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4935	struct inode *ip;	/* inode to which block is being added */
4936	ufs_lbn_t off;		/* block pointer within inode */
4937	ufs2_daddr_t newblkno;	/* disk block number being added */
4938	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
4939	long newsize;		/* size of new block */
4940	long oldsize;		/* size of old block */
4941	struct buf *bp;		/* bp for allocated block */
4942{
4943	struct allocdirect *adp, *oldadp;
4944	struct allocdirectlst *adphead;
4945	struct freefrag *freefrag;
4946	struct inodedep *inodedep;
4947	struct pagedep *pagedep;
4948	struct jnewblk *jnewblk;
4949	struct newblk *newblk;
4950	struct mount *mp;
4951	ufs_lbn_t lbn;
4952
4953	lbn = bp->b_lblkno;
4954	mp = UFSTOVFS(ip->i_ump);
4955	if (oldblkno && oldblkno != newblkno)
4956		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4957	else
4958		freefrag = NULL;
4959
4960	ACQUIRE_LOCK(&lk);
4961	if (off >= NDADDR) {
4962		if (lbn > 0)
4963			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4964			    lbn, off);
4965		/* allocating an indirect block */
4966		if (oldblkno != 0)
4967			panic("softdep_setup_allocdirect: non-zero indir");
4968	} else {
4969		if (off != lbn)
4970			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4971			    lbn, off);
4972		/*
4973		 * Allocating a direct block.
4974		 *
4975		 * If we are allocating a directory block, then we must
4976		 * allocate an associated pagedep to track additions and
4977		 * deletions.
4978		 */
4979		if ((ip->i_mode & IFMT) == IFDIR)
4980			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
4981			    &pagedep);
4982	}
4983	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4984		panic("softdep_setup_allocdirect: lost block");
4985	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4986	    ("softdep_setup_allocdirect: newblk already initialized"));
4987	/*
4988	 * Convert the newblk to an allocdirect.
4989	 */
4990	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4991	adp = (struct allocdirect *)newblk;
4992	newblk->nb_freefrag = freefrag;
4993	adp->ad_offset = off;
4994	adp->ad_oldblkno = oldblkno;
4995	adp->ad_newsize = newsize;
4996	adp->ad_oldsize = oldsize;
4997
4998	/*
4999	 * Finish initializing the journal.
5000	 */
5001	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5002		jnewblk->jn_ino = ip->i_number;
5003		jnewblk->jn_lbn = lbn;
5004		add_to_journal(&jnewblk->jn_list);
5005	}
5006	if (freefrag && freefrag->ff_jdep != NULL &&
5007	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5008		add_to_journal(freefrag->ff_jdep);
5009	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5010	adp->ad_inodedep = inodedep;
5011
5012	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5013	/*
5014	 * The list of allocdirects must be kept in sorted and ascending
5015	 * order so that the rollback routines can quickly determine the
5016	 * first uncommitted block (the size of the file stored on disk
5017	 * ends at the end of the lowest committed fragment, or if there
5018	 * are no fragments, at the end of the highest committed block).
5019	 * Since files generally grow, the typical case is that the new
5020	 * block is to be added at the end of the list. We speed this
5021	 * special case by checking against the last allocdirect in the
5022	 * list before laboriously traversing the list looking for the
5023	 * insertion point.
5024	 */
5025	adphead = &inodedep->id_newinoupdt;
5026	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5027	if (oldadp == NULL || oldadp->ad_offset <= off) {
5028		/* insert at end of list */
5029		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5030		if (oldadp != NULL && oldadp->ad_offset == off)
5031			allocdirect_merge(adphead, adp, oldadp);
5032		FREE_LOCK(&lk);
5033		return;
5034	}
5035	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5036		if (oldadp->ad_offset >= off)
5037			break;
5038	}
5039	if (oldadp == NULL)
5040		panic("softdep_setup_allocdirect: lost entry");
5041	/* insert in middle of list */
5042	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5043	if (oldadp->ad_offset == off)
5044		allocdirect_merge(adphead, adp, oldadp);
5045
5046	FREE_LOCK(&lk);
5047}
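
/*
 * The allocdirect list above is kept sorted by offset, and because files
 * usually grow the common case is an append at the tail.  A non-compiled
 * sketch of the same check-the-tail-first insertion with generic TAILQ
 * items (hypothetical names; not part of the soft updates code):
 */
#if 0
#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item) next;
	long offset;
};
TAILQ_HEAD(itemlist, item);

/* Insert "new" into "head", keeping items sorted by ascending offset. */
static void
sorted_insert(struct itemlist *head, struct item *new)
{
	struct item *cur;

	/* Fast path: appending at the tail covers the typical growth case. */
	cur = TAILQ_LAST(head, itemlist);
	if (cur == NULL || cur->offset <= new->offset) {
		TAILQ_INSERT_TAIL(head, new, next);
		return;
	}
	/*
	 * Slow path: the tail is past us, so some earlier item must be at
	 * or beyond our offset; insert just before the first such item.
	 */
	TAILQ_FOREACH(cur, head, next)
		if (cur->offset >= new->offset)
			break;
	TAILQ_INSERT_BEFORE(cur, new, next);
}
#endif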
5048
5049/*
5050 * Merge a newer and older journal record to be stored either in a
5051 * newblock or freefrag.  This handles aggregating journal records for
5052 * fragment allocation into a second record as well as replacing a
5053 * journal free with an aborted journal allocation.  A segment for the
5054 * oldest record will be placed on wkhd if it has been written.  If not
5055 * the segment for the newer record will suffice.
5056 */
5057static struct worklist *
5058jnewblk_merge(new, old, wkhd)
5059	struct worklist *new;
5060	struct worklist *old;
5061	struct workhead *wkhd;
5062{
5063	struct jnewblk *njnewblk;
5064	struct jnewblk *jnewblk;
5065
5066	/* Handle NULLs to simplify callers. */
5067	if (new == NULL)
5068		return (old);
5069	if (old == NULL)
5070		return (new);
5071	/* Replace a jfreefrag with a jnewblk. */
5072	if (new->wk_type == D_JFREEFRAG) {
5073		cancel_jfreefrag(WK_JFREEFRAG(new));
5074		return (old);
5075	}
5076	/*
5077	 * Handle merging of two jnewblk records that describe
5078	 * different sets of fragments in the same block.
5079	 */
5080	jnewblk = WK_JNEWBLK(old);
5081	njnewblk = WK_JNEWBLK(new);
5082	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5083		panic("jnewblk_merge: Merging disparate blocks.");
5084	/*
5085	 * The record may be rolled back in the cg.
5086	 */
5087	if (jnewblk->jn_state & UNDONE) {
5088		jnewblk->jn_state &= ~UNDONE;
5089		njnewblk->jn_state |= UNDONE;
5090		njnewblk->jn_state &= ~ATTACHED;
5091	}
5092	/*
5093	 * We modify the newer addref and free the older so that if neither
5094	 * has been written the most up-to-date copy will be on disk.  If
5095	 * both have been written but rolled back we only temporarily need
5096	 * one of them to fix the bits when the cg write completes.
5097	 */
5098	jnewblk->jn_state |= ATTACHED | COMPLETE;
5099	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5100	cancel_jnewblk(jnewblk, wkhd);
5101	WORKLIST_REMOVE(&jnewblk->jn_list);
5102	free_jnewblk(jnewblk);
5103	return (new);
5104}
5105
5106/*
5107 * Replace an old allocdirect dependency with a newer one.
5108 * This routine must be called with splbio interrupts blocked.
5109 */
5110static void
5111allocdirect_merge(adphead, newadp, oldadp)
5112	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5113	struct allocdirect *newadp;	/* allocdirect being added */
5114	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5115{
5116	struct worklist *wk;
5117	struct freefrag *freefrag;
5118
5119	freefrag = NULL;
5120	mtx_assert(&lk, MA_OWNED);
5121	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5122	    newadp->ad_oldsize != oldadp->ad_newsize ||
5123	    newadp->ad_offset >= NDADDR)
5124		panic("%s %jd != new %jd || old size %ld != new %ld",
5125		    "allocdirect_merge: old blkno",
5126		    (intmax_t)newadp->ad_oldblkno,
5127		    (intmax_t)oldadp->ad_newblkno,
5128		    newadp->ad_oldsize, oldadp->ad_newsize);
5129	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5130	newadp->ad_oldsize = oldadp->ad_oldsize;
5131	/*
5132	 * If the old dependency had a fragment to free or had never
5133	 * previously had a block allocated, then the new dependency
5134	 * can immediately post its freefrag and adopt the old freefrag.
5135	 * This action is done by swapping the freefrag dependencies.
5136	 * The new dependency gains the old one's freefrag, and the
5137	 * old one gets the new one and then immediately puts it on
5138	 * the worklist when it is freed by free_newblk. It is
5139	 * not possible to do this swap when the old dependency had a
5140	 * non-zero size but no previous fragment to free. This condition
5141	 * arises when the new block is an extension of the old block.
5142	 * Here, the first part of the fragment allocated to the new
5143	 * dependency is part of the block currently claimed on disk by
5144	 * the old dependency, so cannot legitimately be freed until the
5145	 * conditions for the new dependency are fulfilled.
5146	 */
5147	freefrag = newadp->ad_freefrag;
5148	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5149		newadp->ad_freefrag = oldadp->ad_freefrag;
5150		oldadp->ad_freefrag = freefrag;
5151	}
5152	/*
5153	 * If we are tracking a new directory-block allocation,
5154	 * move it from the old allocdirect to the new allocdirect.
5155	 */
5156	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5157		WORKLIST_REMOVE(wk);
5158		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5159			panic("allocdirect_merge: extra newdirblk");
5160		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5161	}
5162	TAILQ_REMOVE(adphead, oldadp, ad_next);
5163	/*
5164	 * We need to move any journal dependencies over to the freefrag
5165	 * that releases this block if it exists.  Otherwise we are
5166	 * extending an existing block and we'll wait until that is
5167	 * complete to release the journal space and extend the
5168	 * new journal to cover this old space as well.
5169	 */
5170	if (freefrag == NULL) {
5171		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5172			panic("allocdirect_merge: %jd != %jd",
5173			    oldadp->ad_newblkno, newadp->ad_newblkno);
5174		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5175		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5176		    &oldadp->ad_block.nb_jnewblk->jn_list,
5177		    &newadp->ad_block.nb_jwork);
5178		oldadp->ad_block.nb_jnewblk = NULL;
5179		cancel_newblk(&oldadp->ad_block, NULL,
5180		    &newadp->ad_block.nb_jwork);
5181	} else {
5182		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5183		    &freefrag->ff_list, &freefrag->ff_jwork);
5184		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5185		    &freefrag->ff_jwork);
5186	}
5187	free_newblk(&oldadp->ad_block);
5188}
5189
5190/*
5191 * Allocate a jfreefrag structure to journal a single block free.
5192 */
5193static struct jfreefrag *
5194newjfreefrag(freefrag, ip, blkno, size, lbn)
5195	struct freefrag *freefrag;
5196	struct inode *ip;
5197	ufs2_daddr_t blkno;
5198	long size;
5199	ufs_lbn_t lbn;
5200{
5201	struct jfreefrag *jfreefrag;
5202	struct fs *fs;
5203
5204	fs = ip->i_fs;
5205	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5206	    M_SOFTDEP_FLAGS);
5207	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5208	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5209	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5210	jfreefrag->fr_ino = ip->i_number;
5211	jfreefrag->fr_lbn = lbn;
5212	jfreefrag->fr_blkno = blkno;
5213	jfreefrag->fr_frags = numfrags(fs, size);
5214	jfreefrag->fr_freefrag = freefrag;
5215
5216	return (jfreefrag);
5217}
5218
5219/*
5220 * Allocate a new freefrag structure.
5221 */
5222static struct freefrag *
5223newfreefrag(ip, blkno, size, lbn)
5224	struct inode *ip;
5225	ufs2_daddr_t blkno;
5226	long size;
5227	ufs_lbn_t lbn;
5228{
5229	struct freefrag *freefrag;
5230	struct fs *fs;
5231
5232	fs = ip->i_fs;
5233	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5234		panic("newfreefrag: frag size");
5235	freefrag = malloc(sizeof(struct freefrag),
5236	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5237	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5238	freefrag->ff_state = ATTACHED;
5239	LIST_INIT(&freefrag->ff_jwork);
5240	freefrag->ff_inum = ip->i_number;
5241	freefrag->ff_vtype = ITOV(ip)->v_type;
5242	freefrag->ff_blkno = blkno;
5243	freefrag->ff_fragsize = size;
5244
5245	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5246		freefrag->ff_jdep = (struct worklist *)
5247		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5248	} else {
5249		freefrag->ff_state |= DEPCOMPLETE;
5250		freefrag->ff_jdep = NULL;
5251	}
5252
5253	return (freefrag);
5254}
5255
5256/*
5257 * This workitem de-allocates fragments that were replaced during
5258 * file block allocation.
5259 */
5260static void
5261handle_workitem_freefrag(freefrag)
5262	struct freefrag *freefrag;
5263{
5264	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5265	struct workhead wkhd;
5266
5267	/*
5268	 * It would be illegal to add new completion items to the
5269	 * freefrag after it was scheduled to be done, so it must be
5270	 * safe to modify the list head here.
5271	 */
5272	LIST_INIT(&wkhd);
5273	ACQUIRE_LOCK(&lk);
5274	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5275	/*
5276	 * If the journal has not been written we must cancel it here.
5277	 */
5278	if (freefrag->ff_jdep) {
5279		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5280			panic("handle_workitem_freefrag: Unexpected type %d\n",
5281			    freefrag->ff_jdep->wk_type);
5282		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5283	}
5284	FREE_LOCK(&lk);
5285	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5286	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5287	ACQUIRE_LOCK(&lk);
5288	WORKITEM_FREE(freefrag, D_FREEFRAG);
5289	FREE_LOCK(&lk);
5290}
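
/*
 * The routine above drains the freefrag's completion items by swapping
 * them onto a stack-local list head while the lock is held and walking
 * the private copy after the lock is dropped.  A non-compiled sketch of
 * that pattern, assuming FreeBSD's <sys/queue.h> LIST_SWAP and a pthread
 * mutex (hypothetical names; not part of the soft updates code):
 */
#if 0
#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct work {
	LIST_ENTRY(work) link;
	void	(*fn)(struct work *);
};
LIST_HEAD(workhead, work);

static struct workhead pending = LIST_HEAD_INITIALIZER(pending);
static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;

/* Process everything queued so far without holding the lock in the loop. */
static void
drain_work(void)
{
	struct workhead mine;
	struct work *wp;

	LIST_INIT(&mine);
	pthread_mutex_lock(&work_lock);
	LIST_SWAP(&pending, &mine, work, link);	/* steal the whole list */
	pthread_mutex_unlock(&work_lock);
	while ((wp = LIST_FIRST(&mine)) != NULL) {
		LIST_REMOVE(wp, link);
		wp->fn(wp);		/* may sleep or take other locks */
		free(wp);
	}
}
#endif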
5291
5292/*
5293 * Set up a dependency structure for an external attributes data block.
5294 * This routine follows much of the structure of softdep_setup_allocdirect.
5295 * See the description of softdep_setup_allocdirect above for details.
5296 */
5297void
5298softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5299	struct inode *ip;
5300	ufs_lbn_t off;
5301	ufs2_daddr_t newblkno;
5302	ufs2_daddr_t oldblkno;
5303	long newsize;
5304	long oldsize;
5305	struct buf *bp;
5306{
5307	struct allocdirect *adp, *oldadp;
5308	struct allocdirectlst *adphead;
5309	struct freefrag *freefrag;
5310	struct inodedep *inodedep;
5311	struct jnewblk *jnewblk;
5312	struct newblk *newblk;
5313	struct mount *mp;
5314	ufs_lbn_t lbn;
5315
5316	if (off >= NXADDR)
5317		panic("softdep_setup_allocext: lbn %lld > NXADDR",
5318		    (long long)off);
5319
5320	lbn = bp->b_lblkno;
5321	mp = UFSTOVFS(ip->i_ump);
5322	if (oldblkno && oldblkno != newblkno)
5323		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5324	else
5325		freefrag = NULL;
5326
5327	ACQUIRE_LOCK(&lk);
5328	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5329		panic("softdep_setup_allocext: lost block");
5330	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5331	    ("softdep_setup_allocext: newblk already initialized"));
5332	/*
5333	 * Convert the newblk to an allocdirect.
5334	 */
5335	newblk->nb_list.wk_type = D_ALLOCDIRECT;
5336	adp = (struct allocdirect *)newblk;
5337	newblk->nb_freefrag = freefrag;
5338	adp->ad_offset = off;
5339	adp->ad_oldblkno = oldblkno;
5340	adp->ad_newsize = newsize;
5341	adp->ad_oldsize = oldsize;
5342	adp->ad_state |=  EXTDATA;
5343
5344	/*
5345	 * Finish initializing the journal.
5346	 */
5347	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5348		jnewblk->jn_ino = ip->i_number;
5349		jnewblk->jn_lbn = lbn;
5350		add_to_journal(&jnewblk->jn_list);
5351	}
5352	if (freefrag && freefrag->ff_jdep != NULL &&
5353	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5354		add_to_journal(freefrag->ff_jdep);
5355	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5356	adp->ad_inodedep = inodedep;
5357
5358	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5359	/*
5360	 * The list of allocdirects must be kept in sorted and ascending
5361	 * order so that the rollback routines can quickly determine the
5362	 * first uncommitted block (the size of the file stored on disk
5363	 * ends at the end of the lowest committed fragment, or if there
5364	 * are no fragments, at the end of the highest committed block).
5365	 * Since files generally grow, the typical case is that the new
5366	 * block is to be added at the end of the list. We speed this
5367	 * special case by checking against the last allocdirect in the
5368	 * list before laboriously traversing the list looking for the
5369	 * insertion point.
5370	 */
5371	adphead = &inodedep->id_newextupdt;
5372	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5373	if (oldadp == NULL || oldadp->ad_offset <= off) {
5374		/* insert at end of list */
5375		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5376		if (oldadp != NULL && oldadp->ad_offset == off)
5377			allocdirect_merge(adphead, adp, oldadp);
5378		FREE_LOCK(&lk);
5379		return;
5380	}
5381	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5382		if (oldadp->ad_offset >= off)
5383			break;
5384	}
5385	if (oldadp == NULL)
5386		panic("softdep_setup_allocext: lost entry");
5387	/* insert in middle of list */
5388	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5389	if (oldadp->ad_offset == off)
5390		allocdirect_merge(adphead, adp, oldadp);
5391	FREE_LOCK(&lk);
5392}
5393
5394/*
5395 * Indirect block allocation dependencies.
5396 *
5397 * The same dependencies that exist for a direct block also exist when
5398 * a new block is allocated and pointed to by an entry in a block of
5399 * indirect pointers. The undo/redo states described above are also
5400 * used here. Because an indirect block contains many pointers that
5401 * may have dependencies, a second copy of the entire in-memory indirect
5402 * block is kept. The buffer cache copy is always completely up-to-date.
5403 * The second copy, which is used only as a source for disk writes,
5404 * contains only the safe pointers (i.e., those that have no remaining
5405 * update dependencies). The second copy is freed when all pointers
5406 * are safe. The cache is not allowed to replace indirect blocks with
5407 * pending update dependencies. If a buffer containing an indirect
5408 * block with dependencies is written, these routines will mark it
5409 * dirty again. It can only be successfully written once all the
5410 * dependencies are removed. The ffs_fsync routine in conjunction with
5411 * softdep_sync_metadata work together to get all the dependencies
5412 * removed so that a file can be successfully written to disk. Three
5413 * procedures are used when setting up indirect block pointer
5414 * dependencies. The division is necessary because of the organization
5415 * of the "balloc" routine and because of the distinction between file
5416 * pages and file metadata blocks.
5417 */
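
/*
 * A non-compiled sketch of the "second copy" scheme described above:
 * the image handed to the disk is built by rolling every pointer that
 * still has an unfinished dependency back to its previous, safe value
 * (hypothetical names and layout; not part of the soft updates code):
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define	NPTR	2048		/* pointers per indirect block (example) */

struct indir_image {
	int64_t	cur[NPTR];	/* in-memory (buffer cache) pointers */
	int64_t	old[NPTR];	/* previous safe value for each slot */
	char	pending[NPTR];	/* nonzero: slot has an unwritten dependency */
};

/* Fill "disk" with the version of the block that is safe to write now. */
static void
build_safe_copy(const struct indir_image *ip, int64_t disk[NPTR])
{
	size_t i;

	memcpy(disk, ip->cur, sizeof(ip->cur));
	for (i = 0; i < NPTR; i++)
		if (ip->pending[i])
			disk[i] = ip->old[i];	/* roll back unsafe slot */
}
#endif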
5418
5419/*
5420 * Allocate a new allocindir structure.
5421 */
5422static struct allocindir *
5423newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5424	struct inode *ip;	/* inode for file being extended */
5425	int ptrno;		/* offset of pointer in indirect block */
5426	ufs2_daddr_t newblkno;	/* disk block number being added */
5427	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5428	ufs_lbn_t lbn;
5429{
5430	struct newblk *newblk;
5431	struct allocindir *aip;
5432	struct freefrag *freefrag;
5433	struct jnewblk *jnewblk;
5434
5435	if (oldblkno)
5436		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5437	else
5438		freefrag = NULL;
5439	ACQUIRE_LOCK(&lk);
5440	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5441		panic("new_allocindir: lost block");
5442	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5443	    ("newallocindir: newblk already initialized"));
5444	newblk->nb_list.wk_type = D_ALLOCINDIR;
5445	newblk->nb_freefrag = freefrag;
5446	aip = (struct allocindir *)newblk;
5447	aip->ai_offset = ptrno;
5448	aip->ai_oldblkno = oldblkno;
5449	aip->ai_lbn = lbn;
5450	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5451		jnewblk->jn_ino = ip->i_number;
5452		jnewblk->jn_lbn = lbn;
5453		add_to_journal(&jnewblk->jn_list);
5454	}
5455	if (freefrag && freefrag->ff_jdep != NULL &&
5456	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5457		add_to_journal(freefrag->ff_jdep);
5458	return (aip);
5459}
5460
5461/*
5462 * Called just before setting an indirect block pointer
5463 * to a newly allocated file page.
5464 */
5465void
5466softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5467	struct inode *ip;	/* inode for file being extended */
5468	ufs_lbn_t lbn;		/* allocated block number within file */
5469	struct buf *bp;		/* buffer with indirect blk referencing page */
5470	int ptrno;		/* offset of pointer in indirect block */
5471	ufs2_daddr_t newblkno;	/* disk block number being added */
5472	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5473	struct buf *nbp;	/* buffer holding allocated page */
5474{
5475	struct inodedep *inodedep;
5476	struct freefrag *freefrag;
5477	struct allocindir *aip;
5478	struct pagedep *pagedep;
5479	struct mount *mp;
5480	int dflags;
5481
5482	if (lbn != nbp->b_lblkno)
5483		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5484		    lbn, nbp->b_lblkno);
5485	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5486	mp = UFSTOVFS(ip->i_ump);
5487	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5488	dflags = DEPALLOC;
5489	if (IS_SNAPSHOT(ip))
5490		dflags |= NODELAY;
5491	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5492	/*
5493	 * If we are allocating a directory page, then we must
5494	 * allocate an associated pagedep to track additions and
5495	 * deletions.
5496	 */
5497	if ((ip->i_mode & IFMT) == IFDIR)
5498		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5499	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5500	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5501	FREE_LOCK(&lk);
5502	if (freefrag)
5503		handle_workitem_freefrag(freefrag);
5504}
5505
5506/*
5507 * Called just before setting an indirect block pointer to a
5508 * newly allocated indirect block.
5509 */
5510void
5511softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5512	struct buf *nbp;	/* newly allocated indirect block */
5513	struct inode *ip;	/* inode for file being extended */
5514	struct buf *bp;		/* indirect block referencing allocated block */
5515	int ptrno;		/* offset of pointer in indirect block */
5516	ufs2_daddr_t newblkno;	/* disk block number being added */
5517{
5518	struct inodedep *inodedep;
5519	struct allocindir *aip;
5520	ufs_lbn_t lbn;
5521	int dflags;
5522
5523	lbn = nbp->b_lblkno;
5524	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5525	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5526	dflags = DEPALLOC;
5527	if (IS_SNAPSHOT(ip))
5528		dflags |= NODELAY;
5529	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5530	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5531	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5532		panic("softdep_setup_allocindir_meta: Block already existed");
5533	FREE_LOCK(&lk);
5534}
5535
5536static void
5537indirdep_complete(indirdep)
5538	struct indirdep *indirdep;
5539{
5540	struct allocindir *aip;
5541
5542	LIST_REMOVE(indirdep, ir_next);
5543	indirdep->ir_state |= DEPCOMPLETE;
5544
5545	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5546		LIST_REMOVE(aip, ai_next);
5547		free_newblk(&aip->ai_block);
5548	}
5549	/*
5550	 * If this indirdep is not attached to a buf it was simply waiting
5551	 * on completion to clear completehd.  free_indirdep() asserts
5552	 * that nothing is dangling.
5553	 */
5554	if ((indirdep->ir_state & ONWORKLIST) == 0)
5555		free_indirdep(indirdep);
5556}
5557
5558static struct indirdep *
5559indirdep_lookup(mp, ip, bp)
5560	struct mount *mp;
5561	struct inode *ip;
5562	struct buf *bp;
5563{
5564	struct indirdep *indirdep, *newindirdep;
5565	struct newblk *newblk;
5566	struct worklist *wk;
5567	struct fs *fs;
5568	ufs2_daddr_t blkno;
5569
5570	mtx_assert(&lk, MA_OWNED);
5571	indirdep = NULL;
5572	newindirdep = NULL;
5573	fs = ip->i_fs;
5574	for (;;) {
5575		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5576			if (wk->wk_type != D_INDIRDEP)
5577				continue;
5578			indirdep = WK_INDIRDEP(wk);
5579			break;
5580		}
5581		/* Found on the buffer worklist, no new structure to free. */
5582		if (indirdep != NULL && newindirdep == NULL)
5583			return (indirdep);
5584		if (indirdep != NULL && newindirdep != NULL)
5585			panic("indirdep_lookup: simultaneous create");
5586		/* None found on the buffer and a new structure is ready. */
5587		if (indirdep == NULL && newindirdep != NULL)
5588			break;
5589		/* None found and no new structure available. */
5590		FREE_LOCK(&lk);
5591		newindirdep = malloc(sizeof(struct indirdep),
5592		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5593		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5594		newindirdep->ir_state = ATTACHED;
5595		if (ip->i_ump->um_fstype == UFS1)
5596			newindirdep->ir_state |= UFS1FMT;
5597		TAILQ_INIT(&newindirdep->ir_trunc);
5598		newindirdep->ir_saveddata = NULL;
5599		LIST_INIT(&newindirdep->ir_deplisthd);
5600		LIST_INIT(&newindirdep->ir_donehd);
5601		LIST_INIT(&newindirdep->ir_writehd);
5602		LIST_INIT(&newindirdep->ir_completehd);
5603		if (bp->b_blkno == bp->b_lblkno) {
5604			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5605			    NULL, NULL);
5606			bp->b_blkno = blkno;
5607		}
5608		newindirdep->ir_freeblks = NULL;
5609		newindirdep->ir_savebp =
5610		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5611		newindirdep->ir_bp = bp;
5612		BUF_KERNPROC(newindirdep->ir_savebp);
5613		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5614		ACQUIRE_LOCK(&lk);
5615	}
5616	indirdep = newindirdep;
5617	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5618	/*
5619	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5620	 * that we don't free dependencies until the pointers are valid.
5621	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5622	 * than using the hash.
5623	 */
5624	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5625		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5626	else
5627		indirdep->ir_state |= DEPCOMPLETE;
5628	return (indirdep);
5629}
5630
5631/*
5632 * Called to finish the allocation of the "aip" allocated
5633 * by one of the two routines above.
5634 */
5635static struct freefrag *
5636setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5637	struct buf *bp;		/* in-memory copy of the indirect block */
5638	struct inode *ip;	/* inode for file being extended */
5639	struct inodedep *inodedep; /* Inodedep for ip */
5640	struct allocindir *aip;	/* allocindir allocated by the above routines */
5641	ufs_lbn_t lbn;		/* Logical block number for this block. */
5642{
5643	struct fs *fs;
5644	struct indirdep *indirdep;
5645	struct allocindir *oldaip;
5646	struct freefrag *freefrag;
5647	struct mount *mp;
5648
5649	mtx_assert(&lk, MA_OWNED);
5650	mp = UFSTOVFS(ip->i_ump);
5651	fs = ip->i_fs;
5652	if (bp->b_lblkno >= 0)
5653		panic("setup_allocindir_phase2: not indir blk");
5654	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
5655	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
5656	indirdep = indirdep_lookup(mp, ip, bp);
5657	KASSERT(indirdep->ir_savebp != NULL,
5658	    ("setup_allocindir_phase2 NULL ir_savebp"));
5659	aip->ai_indirdep = indirdep;
5660	/*
5661	 * Check for an unwritten dependency for this indirect offset.  If
5662	 * there is one, merge the old dependency into the new one.  This happens
5663	 * as a result of reallocblk only.
5664	 */
5665	freefrag = NULL;
5666	if (aip->ai_oldblkno != 0) {
5667		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
5668			if (oldaip->ai_offset == aip->ai_offset) {
5669				freefrag = allocindir_merge(aip, oldaip);
5670				goto done;
5671			}
5672		}
5673		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
5674			if (oldaip->ai_offset == aip->ai_offset) {
5675				freefrag = allocindir_merge(aip, oldaip);
5676				goto done;
5677			}
5678		}
5679	}
5680done:
5681	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5682	return (freefrag);
5683}
5684
5685/*
5686 * Merge two allocindirs which refer to the same block.  Move newblock
5687 * dependencies and setup the freefrags appropriately.
5688 */
5689static struct freefrag *
5690allocindir_merge(aip, oldaip)
5691	struct allocindir *aip;
5692	struct allocindir *oldaip;
5693{
5694	struct freefrag *freefrag;
5695	struct worklist *wk;
5696
5697	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5698		panic("allocindir_merge: blkno");
5699	aip->ai_oldblkno = oldaip->ai_oldblkno;
5700	freefrag = aip->ai_freefrag;
5701	aip->ai_freefrag = oldaip->ai_freefrag;
5702	oldaip->ai_freefrag = NULL;
5703	KASSERT(freefrag != NULL, ("allocindir_merge: No freefrag"));
5704	/*
5705	 * If we are tracking a new directory-block allocation,
5706	 * move it from the old allocindir to the new allocindir.
5707	 */
5708	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5709		WORKLIST_REMOVE(wk);
5710		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5711			panic("allocindir_merge: extra newdirblk");
5712		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
5713	}
5714	/*
5715	 * We can skip journaling for this freefrag and just complete
5716	 * any pending journal work for the allocindir that is being
5717	 * removed after the freefrag completes.
5718	 */
5719	if (freefrag->ff_jdep)
5720		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
5721	LIST_REMOVE(oldaip, ai_next);
5722	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
5723	    &freefrag->ff_list, &freefrag->ff_jwork);
5724	free_newblk(&oldaip->ai_block);
5725
5726	return (freefrag);
5727}
5728
5729static inline void
5730setup_freedirect(freeblks, ip, i, needj)
5731	struct freeblks *freeblks;
5732	struct inode *ip;
5733	int i;
5734	int needj;
5735{
5736	ufs2_daddr_t blkno;
5737	int frags;
5738
5739	blkno = DIP(ip, i_db[i]);
5740	if (blkno == 0)
5741		return;
5742	DIP_SET(ip, i_db[i], 0);
5743	frags = sblksize(ip->i_fs, ip->i_size, i);
5744	frags = numfrags(ip->i_fs, frags);
5745	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
5746}
5747
5748static inline void
5749setup_freeext(freeblks, ip, i, needj)
5750	struct freeblks *freeblks;
5751	struct inode *ip;
5752	int i;
5753	int needj;
5754{
5755	ufs2_daddr_t blkno;
5756	int frags;
5757
5758	blkno = ip->i_din2->di_extb[i];
5759	if (blkno == 0)
5760		return;
5761	ip->i_din2->di_extb[i] = 0;
5762	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
5763	frags = numfrags(ip->i_fs, frags);
5764	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
5765}
5766
5767static inline void
5768setup_freeindir(freeblks, ip, i, lbn, needj)
5769	struct freeblks *freeblks;
5770	struct inode *ip;
5771	int i;
5772	ufs_lbn_t lbn;
5773	int needj;
5774{
5775	ufs2_daddr_t blkno;
5776
5777	blkno = DIP(ip, i_ib[i]);
5778	if (blkno == 0)
5779		return;
5780	DIP_SET(ip, i_ib[i], 0);
5781	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
5782	    0, needj);
5783}
5784
5785static inline struct freeblks *
5786newfreeblks(mp, ip)
5787	struct mount *mp;
5788	struct inode *ip;
5789{
5790	struct freeblks *freeblks;
5791
5792	freeblks = malloc(sizeof(struct freeblks),
5793		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5794	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5795	LIST_INIT(&freeblks->fb_jblkdephd);
5796	LIST_INIT(&freeblks->fb_jwork);
5797	freeblks->fb_ref = 0;
5798	freeblks->fb_cgwait = 0;
5799	freeblks->fb_state = ATTACHED;
5800	freeblks->fb_uid = ip->i_uid;
5801	freeblks->fb_inum = ip->i_number;
5802	freeblks->fb_vtype = ITOV(ip)->v_type;
5803	freeblks->fb_modrev = DIP(ip, i_modrev);
5804	freeblks->fb_devvp = ip->i_devvp;
5805	freeblks->fb_chkcnt = 0;
5806	freeblks->fb_len = 0;
5807
5808	return (freeblks);
5809}
5810
5811static void
5812trunc_indirdep(indirdep, freeblks, bp, off)
5813	struct indirdep *indirdep;
5814	struct freeblks *freeblks;
5815	struct buf *bp;
5816	int off;
5817{
5818	struct allocindir *aip, *aipn;
5819
5820	/*
5821	 * The first set of allocindirs won't be in savedbp.
5822	 */
5823	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
5824		if (aip->ai_offset > off)
5825			cancel_allocindir(aip, bp, freeblks, 1);
5826	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
5827		if (aip->ai_offset > off)
5828			cancel_allocindir(aip, bp, freeblks, 1);
5829	/*
5830	 * These will exist in savedbp.
5831	 */
5832	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
5833		if (aip->ai_offset > off)
5834			cancel_allocindir(aip, NULL, freeblks, 0);
5835	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
5836		if (aip->ai_offset > off)
5837			cancel_allocindir(aip, NULL, freeblks, 0);
5838}
5839
5840/*
5841 * Follow the chain of indirects down to lastlbn creating a freework
5842 * structure for each.  This will be used to start indir_trunc() at
5843 * the right offset and create the journal records for the partial
5844 * truncation.  A second step will handle the truncated dependencies.
5845 */
5846static int
5847setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
5848	struct freeblks *freeblks;
5849	struct inode *ip;
5850	ufs_lbn_t lbn;
5851	ufs_lbn_t lastlbn;
5852	ufs2_daddr_t blkno;
5853{
5854	struct indirdep *indirdep;
5855	struct indirdep *indirn;
5856	struct freework *freework;
5857	struct newblk *newblk;
5858	struct mount *mp;
5859	struct buf *bp;
5860	uint8_t *start;
5861	uint8_t *end;
5862	ufs_lbn_t lbnadd;
5863	int level;
5864	int error;
5865	int off;
5866
5867
5868	freework = NULL;
5869	if (blkno == 0)
5870		return (0);
5871	mp = freeblks->fb_list.wk_mp;
5872	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
5873	if ((bp->b_flags & B_CACHE) == 0) {
5874		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
5875		bp->b_iocmd = BIO_READ;
5876		bp->b_flags &= ~B_INVAL;
5877		bp->b_ioflags &= ~BIO_ERROR;
5878		vfs_busy_pages(bp, 0);
5879		bp->b_iooffset = dbtob(bp->b_blkno);
5880		bstrategy(bp);
5881		curthread->td_ru.ru_inblock++;
5882		error = bufwait(bp);
5883		if (error) {
5884			brelse(bp);
5885			return (error);
5886		}
5887	}
5888	level = lbn_level(lbn);
5889	lbnadd = lbn_offset(ip->i_fs, level);
5890	/*
5891	 * Compute the offset of the last block we want to keep.  Store
5892	 * in the freework the first block we want to completely free.
5893	 */
5894	off = (lastlbn - -(lbn + level)) / lbnadd;
5895	if (off + 1 == NINDIR(ip->i_fs))
5896		goto nowork;
5897	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
5898	    0);
5899	/*
5900	 * Link the freework into the indirdep.  This will prevent any new
5901	 * allocations from proceeding until we are finished with the
5902	 * truncate and the block is written.
5903	 */
5904	ACQUIRE_LOCK(&lk);
5905	indirdep = indirdep_lookup(mp, ip, bp);
5906	if (indirdep->ir_freeblks)
5907		panic("setup_trunc_indir: indirdep already truncated.");
5908	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
5909	freework->fw_indir = indirdep;
5910	/*
5911	 * Cancel any allocindirs that will not make it to disk.
5912	 * We have to do this for all copies of the indirdep that
5913	 * live on this newblk.
5914	 */
5915	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
5916		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
5917		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
5918			trunc_indirdep(indirn, freeblks, bp, off);
5919	} else
5920		trunc_indirdep(indirdep, freeblks, bp, off);
5921	FREE_LOCK(&lk);
5922	/*
5923	 * Creation is protected by the buf lock. The saveddata is only
5924	 * needed if a full truncation follows a partial truncation, but it
5925	 * is difficult to allocate in that case, so we fetch it anyway.
5926	 */
5927	if (indirdep->ir_saveddata == NULL)
5928		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
5929		    M_SOFTDEP_FLAGS);
5930nowork:
5931	/* Fetch the blkno of the child and the offset at which zeroing starts. */
5932	if (ip->i_ump->um_fstype == UFS1) {
5933		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
5934		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
5935	} else {
5936		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
5937		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
5938	}
5939	if (freework) {
5940		/* Zero the truncated pointers. */
5941		end = bp->b_data + bp->b_bcount;
5942		bzero(start, end - start);
5943		bdwrite(bp);
5944	} else
5945		bqrelse(bp);
5946	if (level == 0)
5947		return (0);
5948	lbn++; /* adjust level */
5949	lbn -= (off * lbnadd);
5950	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
5951}
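
/*
 * A non-compiled sketch of the tail-zeroing step above: given an
 * indirect block holding "nptr" pointers and the index "keep" of the
 * last pointer that survives the truncation, every later slot is
 * cleared before the block is written (hypothetical names; not part of
 * the soft updates code):
 */
#if 0
#include <stdint.h>
#include <string.h>

/* Zero pointers keep+1 .. nptr-1 of a 64-bit pointer block. */
static void
zero_truncated_tail(int64_t *ptrs, int nptr, int keep)
{
	if (keep + 1 >= nptr)
		return;			/* nothing beyond the kept range */
	memset(&ptrs[keep + 1], 0, (nptr - keep - 1) * sizeof(ptrs[0]));
}
#endif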
5952
5953/*
5954 * Complete the partial truncation of an indirect block setup by
5955 * setup_trunc_indir().  This zeros the truncated pointers in the saved
5956 * copy and writes them to disk before the freeblks is allowed to complete.
5957 */
5958static void
5959complete_trunc_indir(freework)
5960	struct freework *freework;
5961{
5962	struct freework *fwn;
5963	struct indirdep *indirdep;
5964	struct buf *bp;
5965	uintptr_t start;
5966	int count;
5967
5968	indirdep = freework->fw_indir;
5969	for (;;) {
5970		bp = indirdep->ir_bp;
5971		/* See if the block was discarded. */
5972		if (bp == NULL)
5973			break;
5974		/* Inline part of getdirtybuf().  We don't want bremfree. */
5975		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
5976			break;
5977		if (BUF_LOCK(bp,
5978		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0)
5979			BUF_UNLOCK(bp);
5980		ACQUIRE_LOCK(&lk);
5981	}
5982	mtx_assert(&lk, MA_OWNED);
5983	freework->fw_state |= DEPCOMPLETE;
5984	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
5985	/*
5986	 * Zero the pointers in the saved copy.
5987	 */
5988	if (indirdep->ir_state & UFS1FMT)
5989		start = sizeof(ufs1_daddr_t);
5990	else
5991		start = sizeof(ufs2_daddr_t);
5992	start *= freework->fw_start;
5993	count = indirdep->ir_savebp->b_bcount - start;
5994	start += (uintptr_t)indirdep->ir_savebp->b_data;
5995	bzero((char *)start, count);
5996	/*
5997	 * We need to start the next truncation in the list if it has not
5998	 * been started yet.
5999	 */
6000	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6001	if (fwn != NULL) {
6002		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6003			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6004		if ((fwn->fw_state & ONWORKLIST) == 0)
6005			freework_enqueue(fwn);
6006	}
6007	/*
6008	 * If bp is NULL the block was fully truncated, so restore
6009	 * the saved block list; otherwise free it, as it is no
6010	 * longer needed.
6011	 */
6012	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6013		if (bp == NULL)
6014			bcopy(indirdep->ir_saveddata,
6015			    indirdep->ir_savebp->b_data,
6016			    indirdep->ir_savebp->b_bcount);
6017		free(indirdep->ir_saveddata, M_INDIRDEP);
6018		indirdep->ir_saveddata = NULL;
6019	}
6020	/*
6021	 * When bp is NULL there is a full truncation pending.  We
6022	 * must wait for this full truncation to be journaled before
6023	 * we can release this freework because the disk pointers will
6024	 * never be written as zero.
6025	 */
6026	if (bp == NULL)  {
6027		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6028			handle_written_freework(freework);
6029		else
6030			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6031			   &freework->fw_list);
6032	} else {
6033		/* Complete when the real copy is written. */
6034		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6035		BUF_UNLOCK(bp);
6036	}
6037}
6038
6039/*
6040 * Calculate the number of blocks we are going to release where datablocks
6041 * is the current total and length is the new file size.
6042 */
6043ufs2_daddr_t
6044blkcount(fs, datablocks, length)
6045	struct fs *fs;
6046	ufs2_daddr_t datablocks;
6047	off_t length;
6048{
6049	off_t totblks, numblks;
6050
6051	totblks = 0;
6052	numblks = howmany(length, fs->fs_bsize);
6053	if (numblks <= NDADDR) {
6054		totblks = howmany(length, fs->fs_fsize);
6055		goto out;
6056	}
6057	totblks = blkstofrags(fs, numblks);
6058	numblks -= NDADDR;
6059	/*
6060	 * Count all single, then double, then triple indirects required.
6061	 * Subtracting one indirects worth of blocks for each pass
6062	 * Subtracting one indirect's worth of blocks for each pass
6063	 */
6064	for (;;) {
6065		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6066		numblks -= NINDIR(fs);
6067		if (numblks <= 0)
6068			break;
6069		numblks = howmany(numblks, NINDIR(fs));
6070	}
6071out:
6072	totblks = fsbtodb(fs, totblks);
6073	/*
6074	 * Handle sparse files.  We can't reclaim more blocks than the inode
6075	 * references.  We will correct it later in handle_complete_freeblks()
6076	 * when we know the real count.
6077	 */
6078	if (totblks > datablocks)
6079		return (0);
6080	return (datablocks - totblks);
6081}
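
/*
 * A worked example of the counting above, restated with plain integers
 * and non-compiled sketch code (hypothetical names; not part of the
 * soft updates code).  With a 32K-block / 4K-fragment file system
 * (8 frags per block, 4096 pointers per indirect, 12 direct pointers),
 * a new length of 1MiB covers 32 data blocks: 256 frags of data plus
 * 8 frags for the single indirect covering blocks 12..31, 264 frags in
 * all.  blkcount() then converts that figure to DEV_BSIZE units with
 * fsbtodb() before subtracting it from the current block count.
 */
#if 0
/* Frags needed to hold "length" bytes, counting indirect blocks too. */
static long
frags_for_length(long length, long bsize, long fsize, long nindir,
    long ndaddr)
{
	long frag = bsize / fsize;		/* frags per block */
	long numblks = (length + bsize - 1) / bsize;
	long totfrags;

	if (numblks <= ndaddr)
		return ((length + fsize - 1) / fsize);
	totfrags = numblks * frag;
	numblks -= ndaddr;
	for (;;) {
		/* One indirect block per nindir (or fewer) pointers. */
		totfrags += ((numblks + nindir - 1) / nindir) * frag;
		numblks -= nindir;
		if (numblks <= 0)
			break;
		numblks = (numblks + nindir - 1) / nindir;
	}
	return (totfrags);
}
#endif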
6082
6083/*
6084 * Handle freeblocks for journaled softupdate filesystems.
6085 *
6086 * Contrary to normal softupdates, we must preserve the block pointers in
6087 * indirects until their subordinates are free.  This is to avoid journaling
6088 * every block that is freed which may consume more space than the journal
6089 * itself.  The recovery program will see the free block journals at the
6090 * base of the truncated area and traverse them to reclaim space.  The
6091 * pointers in the inode may be cleared immediately after the journal
6092 * records are written because each direct and indirect pointer in the
6093 * inode is recorded in a journal.  This permits full truncation to proceed
6094 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6095 *
6096 * The algorithm is as follows:
6097 * 1) Traverse the in-memory state and create journal entries to release
6098 *    the relevant blocks and full indirect trees.
6099 * 2) Traverse the indirect block chain adding partial truncation freework
6100 *    records to indirects in the path to lastlbn.  The freework will
6101 *    prevent new allocation dependencies from being satisfied in this
6102 *    indirect until the truncation completes.
6103 * 3) Read and lock the inode block, performing an update with the new size
6104 *    and pointers.  This prevents truncated data from becoming valid on
6105 *    disk through step 4.
6106 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6107 *    eliminate journal work for those records that do not require it.
6108 * 5) Schedule the journal records to be written followed by the inode block.
6109 * 6) Allocate any necessary frags for the end of file.
6110 * 7) Zero any partially truncated blocks.
6111 *
6112 * From this truncation proceeds asynchronously using the freework and
6113 * indir_trunc machinery.  The file will not be extended again into a
6114 * partially truncated indirect block until all work is completed but
6115 * the normal dependency mechanism ensures that it is rolled back/forward
6116 * as appropriate.  Further truncation may occur without delay and is
6117 * serialized in indir_trunc().
6118 */
6119void
6120softdep_journal_freeblocks(ip, cred, length, flags)
6121	struct inode *ip;	/* The inode whose length is to be reduced */
6122	struct ucred *cred;
6123	off_t length;		/* The new length for the file */
6124	int flags;		/* IO_EXT and/or IO_NORMAL */
6125{
6126	struct freeblks *freeblks, *fbn;
6127	struct inodedep *inodedep;
6128	struct jblkdep *jblkdep;
6129	struct allocdirect *adp, *adpn;
6130	struct fs *fs;
6131	struct buf *bp;
6132	struct vnode *vp;
6133	struct mount *mp;
6134	ufs2_daddr_t extblocks, datablocks;
6135	ufs_lbn_t tmpval, lbn, lastlbn;
6136	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6137
6138	fs = ip->i_fs;
6139	mp = UFSTOVFS(ip->i_ump);
6140	vp = ITOV(ip);
6141	needj = 1;
6142	iboff = -1;
6143	allocblock = 0;
6144	extblocks = 0;
6145	datablocks = 0;
6146	frags = 0;
6147	freeblks = newfreeblks(mp, ip);
6148	ACQUIRE_LOCK(&lk);
6149	/*
6150	 * If we're truncating a removed file that will never be written
6151	 * we don't need to journal the block frees.  The canceled journals
6152	 * for the allocations will suffice.
6153	 */
6154	dflags = DEPALLOC;
6155	if (IS_SNAPSHOT(ip))
6156		dflags |= NODELAY;
6157	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6158	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6159	    length == 0)
6160		needj = 0;
6161	FREE_LOCK(&lk);
6162	/*
6163	 * Calculate the lbn that we are truncating to.  This results in -1
6164	 * if we're truncating to 0 bytes.  So it is the last lbn we want
6165	 * to keep, not the first lbn we want to truncate.
6166	 */
6167	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6168	lastoff = blkoff(fs, length);
6169	/*
6170	 * Compute frags we are keeping in lastlbn.  0 means all.
6171	 */
6172	if (lastlbn >= 0 && lastlbn < NDADDR) {
6173		frags = fragroundup(fs, lastoff);
6174		/* adp offset of last valid allocdirect. */
6175		iboff = lastlbn;
6176	} else if (lastlbn > 0)
6177		iboff = NDADDR;
6178	if (fs->fs_magic == FS_UFS2_MAGIC)
6179		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6180	/*
6181	 * Handle normal data blocks and indirects.  This section saves
6182	 * values used after the inode update to complete frag and indirect
6183	 * truncation.
6184	 */
6185	if ((flags & IO_NORMAL) != 0) {
6186		/*
6187		 * Handle truncation of whole direct and indirect blocks.
6188		 */
6189		for (i = iboff + 1; i < NDADDR; i++)
6190			setup_freedirect(freeblks, ip, i, needj);
6191		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6192		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6193			/* Release a whole indirect tree. */
6194			if (lbn > lastlbn) {
6195				setup_freeindir(freeblks, ip, i, -lbn -i,
6196				    needj);
6197				continue;
6198			}
6199			iboff = i + NDADDR;
6200			/*
6201			 * Traverse partially truncated indirect tree.
6202			 */
6203			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6204				setup_trunc_indir(freeblks, ip, -lbn - i,
6205				    lastlbn, DIP(ip, i_ib[i]));
6206		}
6207		/*
6208		 * Handle partial truncation to a frag boundary.
6209		 */
6210		if (frags) {
6211			ufs2_daddr_t blkno;
6212			long oldfrags;
6213
6214			oldfrags = blksize(fs, ip, lastlbn);
6215			blkno = DIP(ip, i_db[lastlbn]);
6216			if (blkno && oldfrags != frags) {
6217				oldfrags -= frags;
6218				oldfrags = numfrags(ip->i_fs, oldfrags);
6219				blkno += numfrags(ip->i_fs, frags);
6220				newfreework(ip->i_ump, freeblks, NULL, lastlbn,
6221				    blkno, oldfrags, 0, needj);
6222			} else if (blkno == 0)
6223				allocblock = 1;
6224		}
6225		/*
6226		 * Add a journal record for partial truncate if we are
6227		 * handling indirect blocks.  Non-indirects need no extra
6228		 * journaling.
6229		 */
6230		if (length != 0 && lastlbn >= NDADDR) {
6231			ip->i_flag |= IN_TRUNCATED;
6232			newjtrunc(freeblks, length, 0);
6233		}
6234		ip->i_size = length;
6235		DIP_SET(ip, i_size, ip->i_size);
6236		datablocks = DIP(ip, i_blocks) - extblocks;
6237		if (length != 0)
6238			datablocks = blkcount(ip->i_fs, datablocks, length);
6239		freeblks->fb_len = length;
6240	}
6241	if ((flags & IO_EXT) != 0) {
6242		for (i = 0; i < NXADDR; i++)
6243			setup_freeext(freeblks, ip, i, needj);
6244		ip->i_din2->di_extsize = 0;
6245		datablocks += extblocks;
6246	}
6247#ifdef QUOTA
6248	/* Reference the quotas in case the block count is wrong in the end. */
6249	quotaref(vp, freeblks->fb_quota);
6250	(void) chkdq(ip, -datablocks, NOCRED, 0);
6251#endif
6252	freeblks->fb_chkcnt = -datablocks;
6253	UFS_LOCK(ip->i_ump);
6254	fs->fs_pendingblocks += datablocks;
6255	UFS_UNLOCK(ip->i_ump);
6256	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6257	/*
6258	 * Handle truncation of incomplete alloc direct dependencies.  We
6259	 * hold the inode block locked to prevent incomplete dependencies
6260	 * from reaching the disk while we are eliminating those that
6261	 * have been truncated.  This is a partially inlined ffs_update().
6262	 */
6263	ufs_itimes(vp);
6264	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6265	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6266	    (int)fs->fs_bsize, cred, &bp);
6267	if (error) {
6268		brelse(bp);
6269		softdep_error("softdep_journal_freeblocks", error);
6270		return;
6271	}
6272	if (bp->b_bufsize == fs->fs_bsize)
6273		bp->b_flags |= B_CLUSTEROK;
6274	softdep_update_inodeblock(ip, bp, 0);
6275	if (ip->i_ump->um_fstype == UFS1)
6276		*((struct ufs1_dinode *)bp->b_data +
6277		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6278	else
6279		*((struct ufs2_dinode *)bp->b_data +
6280		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6281	ACQUIRE_LOCK(&lk);
6282	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6283	if ((inodedep->id_state & IOSTARTED) != 0)
6284		panic("softdep_journal_freeblocks: inode busy");
6285	/*
6286	 * Add the freeblks structure to the list of operations that
6287	 * must await the zero'ed inode being written to disk. If we
6288	 * still have a bitmap dependency (needj), then the inode
6289	 * has never been written to disk, so we can process the
6290	 * freeblks below once we have deleted the dependencies.
6291	 */
6292	if (needj)
6293		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6294	else
6295		freeblks->fb_state |= COMPLETE;
6296	if ((flags & IO_NORMAL) != 0) {
6297		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6298			if (adp->ad_offset > iboff)
6299				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6300				    freeblks);
6301			/*
6302			 * Truncate the allocdirect.  We could eliminate
6303			 * or modify journal records as well.
6304			 */
6305			else if (adp->ad_offset == iboff && frags)
6306				adp->ad_newsize = frags;
6307		}
6308	}
6309	if ((flags & IO_EXT) != 0)
6310		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6311			cancel_allocdirect(&inodedep->id_extupdt, adp,
6312			    freeblks);
6313	/*
6314	 * Add journal work.
6315	 */
6316	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6317		add_to_journal(&jblkdep->jb_list);
6318	FREE_LOCK(&lk);
6319	bdwrite(bp);
6320	/*
6321	 * Truncate dependency structures beyond length.
6322	 */
6323	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6324	/*
6325	 * This is only set when we need to allocate a fragment because
6326	 * none existed at the end of a frag-sized file.  It handles only
6327	 * allocating a new, zero filled block.
6328	 */
6329	if (allocblock) {
6330		ip->i_size = length - lastoff;
6331		DIP_SET(ip, i_size, ip->i_size);
6332		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6333		if (error != 0) {
6334			softdep_error("softdep_journal_freeblocks", error);
6335			return;
6336		}
6337		ip->i_size = length;
6338		DIP_SET(ip, i_size, length);
6339		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6340		allocbuf(bp, frags);
6341		ffs_update(vp, 0);
6342		bawrite(bp);
6343	} else if (lastoff != 0 && vp->v_type != VDIR) {
6344		int size;
6345
6346		/*
6347		 * Zero the end of a truncated frag or block.
6348		 */
6349		size = sblksize(fs, length, lastlbn);
6350		error = bread(vp, lastlbn, size, cred, &bp);
6351		if (error) {
6352			softdep_error("softdep_journal_freeblocks", error);
6353			return;
6354		}
6355		bzero((char *)bp->b_data + lastoff, size - lastoff);
6356		bawrite(bp);
6357
6358	}
6359	ACQUIRE_LOCK(&lk);
6360	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6361	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6362	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6363	/*
6364	 * We zero earlier truncations so they don't erroneously
6365	 * update i_blocks.
6366	 */
6367	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6368		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6369			fbn->fb_len = 0;
6370	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6371	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6372		freeblks->fb_state |= INPROGRESS;
6373	else
6374		freeblks = NULL;
6375	FREE_LOCK(&lk);
6376	if (freeblks)
6377		handle_workitem_freeblocks(freeblks, 0);
6378	trunc_pages(ip, length, extblocks, flags);
6379
6380}
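
/*
 * A worked example of the lastlbn/lastoff computation near the top of
 * softdep_journal_freeblocks(), restated as non-compiled sketch code
 * (hypothetical names; not part of the soft updates code).  With 32K
 * blocks, truncating to 100000 bytes keeps lbns 0..3 (lastlbn == 3) and
 * 1696 bytes of block 3 (lastoff), which fragroundup() rounds to one 4K
 * fragment; truncating to 0 yields lastlbn == -1, i.e. no block is kept.
 */
#if 0
/* Last logical block to keep when truncating to "length" bytes. */
static long
last_kept_lbn(long length, long bsize)
{
	return ((length + bsize - 1) / bsize - 1);
}

/* Bytes of that block which remain valid after the truncation. */
static long
last_kept_off(long length, long bsize)
{
	return (length % bsize);
}
#endif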
6381
6382/*
6383 * Flush a JOP_SYNC to the journal.
6384 */
6385void
6386softdep_journal_fsync(ip)
6387	struct inode *ip;
6388{
6389	struct jfsync *jfsync;
6390
6391	if ((ip->i_flag & IN_TRUNCATED) == 0)
6392		return;
6393	ip->i_flag &= ~IN_TRUNCATED;
6394	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6395	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6396	jfsync->jfs_size = ip->i_size;
6397	jfsync->jfs_ino = ip->i_number;
6398	ACQUIRE_LOCK(&lk);
6399	add_to_journal(&jfsync->jfs_list);
6400	jwait(&jfsync->jfs_list, MNT_WAIT);
6401	FREE_LOCK(&lk);
6402}
6403
6404/*
6405 * Block de-allocation dependencies.
6406 *
6407 * When blocks are de-allocated, the on-disk pointers must be nullified before
6408 * the blocks are made available for use by other files.  (The true
6409 * requirement is that old pointers must be nullified before new on-disk
6410 * pointers are set.  We chose this slightly more stringent requirement to
6411 * reduce complexity.) Our implementation handles this dependency by updating
6412 * the inode (or indirect block) appropriately but delaying the actual block
6413 * de-allocation (i.e., freemap and free space count manipulation) until
6414 * after the updated versions reach stable storage.  After the disk is
6415 * updated, the blocks can be safely de-allocated whenever it is convenient.
6416 * This implementation handles only the common case of reducing a file's
6417 * length to zero. Other cases are handled by the conventional synchronous
6418 * write approach.
6419 *
6420 * The ffs implementation with which we worked double-checks
6421 * the state of the block pointers and file size as it reduces
6422 * a file's length.  Some of this code is replicated here in our
6423 * soft updates implementation.  The freeblks->fb_chkcnt field is
6424 * used to transfer a part of this information to the procedure
6425 * that eventually de-allocates the blocks.
6426 *
6427 * This routine should be called from the routine that shortens
6428 * a file's length, before the inode's size or block pointers
6429 * are modified. It will save the block pointer information for
6430 * later release and zero the inode so that the calling routine
6431 * can release it.
6432 */
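/*
 * A minimal sketch of the caller contract described above, assuming a
 * caller shaped like the whole-file truncation path (e.g. ffs_truncate()
 * with soft updates enabled); it is illustrative only:
 *
 *	softdep_setup_freeblocks(ip, (off_t)0, IO_EXT | IO_NORMAL);
 *	... the in-memory size and block pointers are now saved and zeroed ...
 *	error = ffs_update(vp, 0);
 *
 * Once the zeroed inode reaches the disk, the saved blocks are released
 * by the freeblks machinery without further help from the caller.
 */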
6433void
6434softdep_setup_freeblocks(ip, length, flags)
6435	struct inode *ip;	/* The inode whose length is to be reduced */
6436	off_t length;		/* The new length for the file */
6437	int flags;		/* IO_EXT and/or IO_NORMAL */
6438{
6439	struct ufs1_dinode *dp1;
6440	struct ufs2_dinode *dp2;
6441	struct freeblks *freeblks;
6442	struct inodedep *inodedep;
6443	struct allocdirect *adp;
6444	struct buf *bp;
6445	struct fs *fs;
6446	ufs2_daddr_t extblocks, datablocks;
6447	struct mount *mp;
6448	int i, delay, error, dflags;
6449	ufs_lbn_t tmpval;
6450	ufs_lbn_t lbn;
6451
6452	fs = ip->i_fs;
6453	mp = UFSTOVFS(ip->i_ump);
6454	if (length != 0)
6455		panic("softdep_setup_freeblocks: non-zero length");
6456	freeblks = newfreeblks(mp, ip);
6457	extblocks = 0;
6458	datablocks = 0;
6459	if (fs->fs_magic == FS_UFS2_MAGIC)
6460		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6461	if ((flags & IO_NORMAL) != 0) {
6462		for (i = 0; i < NDADDR; i++)
6463			setup_freedirect(freeblks, ip, i, 0);
6464		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6465		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6466			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6467		ip->i_size = 0;
6468		DIP_SET(ip, i_size, 0);
6469		datablocks = DIP(ip, i_blocks) - extblocks;
6470	}
6471	if ((flags & IO_EXT) != 0) {
6472		for (i = 0; i < NXADDR; i++)
6473			setup_freeext(freeblks, ip, i, 0);
6474		ip->i_din2->di_extsize = 0;
6475		datablocks += extblocks;
6476	}
6477#ifdef QUOTA
6478	/* Reference the quotas in case the block count is wrong in the end. */
6479	quotaref(ITOV(ip), freeblks->fb_quota);
6480	(void) chkdq(ip, -datablocks, NOCRED, 0);
6481#endif
6482	freeblks->fb_chkcnt = -datablocks;
6483	UFS_LOCK(ip->i_ump);
6484	fs->fs_pendingblocks += datablocks;
6485	UFS_UNLOCK(ip->i_ump);
6486	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6487	/*
6488	 * Push the zero'ed inode to its disk buffer so that we are free
6489	 * to delete its dependencies below. Once the dependencies are gone
6490	 * the buffer can be safely released.
6491	 */
6492	if ((error = bread(ip->i_devvp,
6493	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6494	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6495		brelse(bp);
6496		softdep_error("softdep_setup_freeblocks", error);
6497	}
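	/*
	 * Copy the zeroed in-memory dinode over the on-disk copy below.
	 * The on-disk di_freelink is carried over first because the
	 * unlinked-inode list linkage is maintained on disk and must not
	 * be clobbered by the truncation.
	 */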
6498	if (ip->i_ump->um_fstype == UFS1) {
6499		dp1 = ((struct ufs1_dinode *)bp->b_data +
6500		    ino_to_fsbo(fs, ip->i_number));
6501		ip->i_din1->di_freelink = dp1->di_freelink;
6502		*dp1 = *ip->i_din1;
6503	} else {
6504		dp2 = ((struct ufs2_dinode *)bp->b_data +
6505		    ino_to_fsbo(fs, ip->i_number));
6506		ip->i_din2->di_freelink = dp2->di_freelink;
6507		*dp2 = *ip->i_din2;
6508	}
6509	/*
6510	 * Find and eliminate any inode dependencies.
6511	 */
6512	ACQUIRE_LOCK(&lk);
6513	dflags = DEPALLOC;
6514	if (IS_SNAPSHOT(ip))
6515		dflags |= NODELAY;
6516	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6517	if ((inodedep->id_state & IOSTARTED) != 0)
6518		panic("softdep_setup_freeblocks: inode busy");
6519	/*
6520	 * Add the freeblks structure to the list of operations that
6521	 * must await the zero'ed inode being written to disk. If we
6522	 * still have a bitmap dependency (delay == 0), then the inode
6523	 * has never been written to disk, so we can process the
6524	 * freeblks below once we have deleted the dependencies.
6525	 */
6526	delay = (inodedep->id_state & DEPCOMPLETE);
6527	if (delay)
6528		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6529	else
6530		freeblks->fb_state |= COMPLETE;
6531	/*
6532	 * Because the file length has been truncated to zero, any
6533	 * pending block allocation dependency structures associated
6534	 * with this inode are obsolete and can simply be de-allocated.
6535	 * We must first merge the two dependency lists to get rid of
6536	 * any duplicate freefrag structures, then purge the merged list.
6537	 * If we still have a bitmap dependency, then the inode has never
6538	 * been written to disk, so we can free any fragments without delay.
6539	 */
6540	if (flags & IO_NORMAL) {
6541		merge_inode_lists(&inodedep->id_newinoupdt,
6542		    &inodedep->id_inoupdt);
6543		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6544			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6545			    freeblks);
6546	}
6547	if (flags & IO_EXT) {
6548		merge_inode_lists(&inodedep->id_newextupdt,
6549		    &inodedep->id_extupdt);
6550		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6551			cancel_allocdirect(&inodedep->id_extupdt, adp,
6552			    freeblks);
6553	}
6554	FREE_LOCK(&lk);
6555	bdwrite(bp);
6556	trunc_dependencies(ip, freeblks, -1, 0, flags);
6557	ACQUIRE_LOCK(&lk);
6558	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6559		(void) free_inodedep(inodedep);
6560	freeblks->fb_state |= DEPCOMPLETE;
6561	/*
6562	 * If the inode with zeroed block pointers is now on disk
6563	 * we can start freeing blocks.
6564	 */
6565	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6566		freeblks->fb_state |= INPROGRESS;
6567	else
6568		freeblks = NULL;
6569	FREE_LOCK(&lk);
6570	if (freeblks)
6571		handle_workitem_freeblocks(freeblks, 0);
6572	trunc_pages(ip, length, extblocks, flags);
6573}
6574
6575/*
6576 * Eliminate pages from the page cache that back parts of this inode and
6577 * adjust the vnode pager's idea of our size.  This prevents stale data
6578 * from hanging around in the page cache.
6579 */
6580static void
6581trunc_pages(ip, length, extblocks, flags)
6582	struct inode *ip;
6583	off_t length;
6584	ufs2_daddr_t extblocks;
6585	int flags;
6586{
6587	struct vnode *vp;
6588	struct fs *fs;
6589	ufs_lbn_t lbn;
6590	off_t end, extend;
6591
6592	vp = ITOV(ip);
6593	fs = ip->i_fs;
6594	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6595	if ((flags & IO_EXT) != 0)
6596		vn_pages_remove(vp, extend, 0);
6597	if ((flags & IO_NORMAL) == 0)
6598		return;
6599	BO_LOCK(&vp->v_bufobj);
6600	drain_output(vp);
6601	BO_UNLOCK(&vp->v_bufobj);
6602	/*
6603	 * The vnode pager eliminates file pages; we eliminate indirects
6604	 * below.
6605	 */
6606	vnode_pager_setsize(vp, length);
6607	/*
6608	 * Calculate the end based on the last indirect we want to keep.  If
6609	 * the block extends into indirects we can just use the negative of
6610	 * its lbn.  Doubles and triples exist at lower numbers so we must
6611	 * be careful not to remove those, if they exist.  Double and triple
6612	 * indirect lbns do not overlap with others, so it is not important
6613	 * to verify how many levels are required.
6614	 */
6615	lbn = lblkno(fs, length);
6616	if (lbn >= NDADDR) {
6617		/* Calculate the virtual lbn of the triple indirect. */
6618		lbn = -lbn - (NIADDR - 1);
6619		end = OFF_TO_IDX(lblktosize(fs, lbn));
6620	} else
6621		end = extend;
6622	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
6623}
6624
6625/*
6626 * See if the buf bp is in the range eliminated by truncation.
6627 */
6628static int
6629trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
6630	struct buf *bp;
6631	int *blkoffp;
6632	ufs_lbn_t lastlbn;
6633	int lastoff;
6634	int flags;
6635{
6636	ufs_lbn_t lbn;
6637
6638	*blkoffp = 0;
6639	/* Only match ext/normal blocks as appropriate. */
6640	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
6641	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
6642		return (0);
6643	/* ALTDATA is always a full truncation. */
6644	if ((bp->b_xflags & BX_ALTDATA) != 0)
6645		return (1);
6646	/* -1 is full truncation. */
6647	if (lastlbn == -1)
6648		return (1);
6649	/*
6650	 * If this is a partial truncate we only want those
6651	 * blocks and indirect blocks that cover the range
6652	 * we're after.
6653	 */
6654	lbn = bp->b_lblkno;
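	/*
	 * Indirect blocks are kept at negative logical block numbers.
	 * Convert such an lbn to the first data lbn the indirect maps so
	 * that it can be compared against the truncation point below.
	 */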
6655	if (lbn < 0)
6656		lbn = -(lbn + lbn_level(lbn));
6657	if (lbn < lastlbn)
6658		return (0);
6659	/* Here we only truncate lblkno if it's partial. */
6660	if (lbn == lastlbn) {
6661		if (lastoff == 0)
6662			return (0);
6663		*blkoffp = lastoff;
6664	}
6665	return (1);
6666}
6667
6668/*
6669 * Eliminate any dependencies that exist in memory beyond lblkno:off
6670 */
6671static void
6672trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
6673	struct inode *ip;
6674	struct freeblks *freeblks;
6675	ufs_lbn_t lastlbn;
6676	int lastoff;
6677	int flags;
6678{
6679	struct bufobj *bo;
6680	struct vnode *vp;
6681	struct buf *bp;
6682	struct fs *fs;
6683	int blkoff;
6684
6685	/*
6686	 * We must wait for any I/O in progress to finish so that
6687	 * all potential buffers on the dirty list will be visible.
6688	 * Once they are all there, walk the list and get rid of
6689	 * any dependencies.
6690	 */
6691	fs = ip->i_fs;
6692	vp = ITOV(ip);
6693	bo = &vp->v_bufobj;
6694	BO_LOCK(bo);
6695	drain_output(vp);
6696	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
6697		bp->b_vflags &= ~BV_SCANNED;
6698restart:
6699	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
6700		if (bp->b_vflags & BV_SCANNED)
6701			continue;
6702		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6703			bp->b_vflags |= BV_SCANNED;
6704			continue;
6705		}
6706		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
6707			goto restart;
6708		BO_UNLOCK(bo);
6709		if (deallocate_dependencies(bp, freeblks, blkoff))
6710			bqrelse(bp);
6711		else
6712			brelse(bp);
6713		BO_LOCK(bo);
6714		goto restart;
6715	}
6716	/*
6717	 * Now do the work of vtruncbuf while also matching indirect blocks.
6718	 */
6719	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
6720		bp->b_vflags &= ~BV_SCANNED;
6721cleanrestart:
6722	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
6723		if (bp->b_vflags & BV_SCANNED)
6724			continue;
6725		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6726			bp->b_vflags |= BV_SCANNED;
6727			continue;
6728		}
6729		if (BUF_LOCK(bp,
6730		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6731		    BO_MTX(bo)) == ENOLCK) {
6732			BO_LOCK(bo);
6733			goto cleanrestart;
6734		}
6735		bp->b_vflags |= BV_SCANNED;
6736		BO_LOCK(bo);
6737		bremfree(bp);
6738		BO_UNLOCK(bo);
6739		if (blkoff != 0) {
6740			allocbuf(bp, blkoff);
6741			bqrelse(bp);
6742		} else {
6743			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
6744			brelse(bp);
6745		}
6746		BO_LOCK(bo);
6747		goto cleanrestart;
6748	}
6749	drain_output(vp);
6750	BO_UNLOCK(bo);
6751}
6752
6753static int
6754cancel_pagedep(pagedep, freeblks, blkoff)
6755	struct pagedep *pagedep;
6756	struct freeblks *freeblks;
6757	int blkoff;
6758{
6759	struct jremref *jremref;
6760	struct jmvref *jmvref;
6761	struct dirrem *dirrem, *tmp;
6762	int i;
6763
6764	/*
6765	 * Copy any directory remove dependencies to the list
6766	 * to be processed after the freeblks proceeds.  If
6767	 * the directory entries never made it to disk they
6768	 * can be dumped directly onto the work list.
6769	 */
6770	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
6771		/* Skip this directory removal if it is intended to remain. */
6772		if (dirrem->dm_offset < blkoff)
6773			continue;
6774		/*
6775		 * If there are any dirrems we wait for the journal write
6776		 * to complete and then restart the buf scan as the lock
6777		 * has been dropped.
6778		 */
6779		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
6780			jwait(&jremref->jr_list, MNT_WAIT);
6781			return (ERESTART);
6782		}
6783		LIST_REMOVE(dirrem, dm_next);
6784		dirrem->dm_dirinum = pagedep->pd_ino;
6785		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
6786	}
6787	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
6788		jwait(&jmvref->jm_list, MNT_WAIT);
6789		return (ERESTART);
6790	}
6791	/*
6792	 * When we're partially truncating a pagedep we just want to flush
6793	 * journal entries and return.  There cannot be any adds in the
6794	 * truncated portion of the directory, and the newblk must remain if
6795	 * part of the block remains.
6796	 */
6797	if (blkoff != 0) {
6798		struct diradd *dap;
6799
6800		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6801			if (dap->da_offset > blkoff)
6802				panic("cancel_pagedep: diradd %p off %d > %d",
6803				    dap, dap->da_offset, blkoff);
6804		for (i = 0; i < DAHASHSZ; i++)
6805			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
6806				if (dap->da_offset > blkoff)
6807					panic("cancel_pagedep: diradd %p off %d > %d",
6808					    dap, dap->da_offset, blkoff);
6809		return (0);
6810	}
6811	/*
6812	 * There should be no directory add dependencies present
6813	 * as the directory could not be truncated until all
6814	 * children were removed.
6815	 */
6816	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
6817	    ("deallocate_dependencies: pendinghd != NULL"));
6818	for (i = 0; i < DAHASHSZ; i++)
6819		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
6820		    ("deallocate_dependencies: diraddhd != NULL"));
6821	if ((pagedep->pd_state & NEWBLOCK) != 0)
6822		free_newdirblk(pagedep->pd_newdirblk);
6823	if (free_pagedep(pagedep) == 0)
6824		panic("Failed to free pagedep %p", pagedep);
6825	return (0);
6826}
6827
6828/*
6829 * Reclaim any dependency structures from a buffer that is about to
6830 * be reallocated to a new vnode. The buffer must be locked, thus,
6831 * no I/O completion operations can occur while we are manipulating
6832 * its associated dependencies. The mutex is held so that other I/Os
6833 * associated with related dependencies do not occur.
6834 */
6835static int
6836deallocate_dependencies(bp, freeblks, off)
6837	struct buf *bp;
6838	struct freeblks *freeblks;
6839	int off;
6840{
6841	struct indirdep *indirdep;
6842	struct pagedep *pagedep;
6843	struct allocdirect *adp;
6844	struct worklist *wk, *wkn;
6845
6846	ACQUIRE_LOCK(&lk);
6847	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
6848		switch (wk->wk_type) {
6849		case D_INDIRDEP:
6850			indirdep = WK_INDIRDEP(wk);
6851			if (bp->b_lblkno >= 0 ||
6852			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
6853				panic("deallocate_dependencies: not indir");
6854			cancel_indirdep(indirdep, bp, freeblks);
6855			continue;
6856
6857		case D_PAGEDEP:
6858			pagedep = WK_PAGEDEP(wk);
6859			if (cancel_pagedep(pagedep, freeblks, off)) {
6860				FREE_LOCK(&lk);
6861				return (ERESTART);
6862			}
6863			continue;
6864
6865		case D_ALLOCINDIR:
6866			/*
6867			 * Simply remove the allocindir; we'll find it via
6868			 * the indirdep where we can clear pointers if
6869			 * needed.
6870			 */
6871			WORKLIST_REMOVE(wk);
6872			continue;
6873
6874		case D_FREEWORK:
6875			/*
6876			 * A truncation is waiting for the zero'd pointers
6877			 * to be written.  It can be freed when the freeblks
6878			 * is journaled.
6879			 */
6880			WORKLIST_REMOVE(wk);
6881			wk->wk_state |= ONDEPLIST;
6882			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6883			break;
6884
6885		case D_ALLOCDIRECT:
6886			adp = WK_ALLOCDIRECT(wk);
6887			if (off != 0)
6888				continue;
6889			/* FALLTHROUGH */
6890		default:
6891			panic("deallocate_dependencies: Unexpected type %s",
6892			    TYPENAME(wk->wk_type));
6893			/* NOTREACHED */
6894		}
6895	}
6896	FREE_LOCK(&lk);
6897	/*
6898	 * Don't throw away this buf; we were partially truncating and
6899	 * some dependencies may still remain.
6900	 */
6901	if (off) {
6902		allocbuf(bp, off);
6903		bp->b_vflags |= BV_SCANNED;
6904		return (EBUSY);
6905	}
6906	bp->b_flags |= B_INVAL | B_NOCACHE;
6907
6908	return (0);
6909}
6910
6911/*
6912 * An allocdirect is being canceled due to a truncate.  We must make sure
6913 * the journal entry is released in concert with the blkfree that releases
6914 * the storage.  Completed journal entries must not be released until the
6915 * space is no longer pointed to by the inode or in the bitmap.
6916 */
6917static void
6918cancel_allocdirect(adphead, adp, freeblks)
6919	struct allocdirectlst *adphead;
6920	struct allocdirect *adp;
6921	struct freeblks *freeblks;
6922{
6923	struct freework *freework;
6924	struct newblk *newblk;
6925	struct worklist *wk;
6926
6927	TAILQ_REMOVE(adphead, adp, ad_next);
6928	newblk = (struct newblk *)adp;
6929	freework = NULL;
6930	/*
6931	 * Find the correct freework structure.
6932	 */
6933	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
6934		if (wk->wk_type != D_FREEWORK)
6935			continue;
6936		freework = WK_FREEWORK(wk);
6937		if (freework->fw_blkno == newblk->nb_newblkno)
6938			break;
6939	}
6940	if (freework == NULL)
6941		panic("cancel_allocdirect: Freework not found");
6942	/*
6943	 * If a newblk exists at all we still have the journal entry that
6944	 * initiated the allocation so we do not need to journal the free.
6945	 */
6946	cancel_jfreeblk(freeblks, freework->fw_blkno);
6947	/*
6948	 * If the journal hasn't been written the jnewblk must be passed
6949	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
6950	 * this by linking the journal dependency into the freework to be
6951	 * freed when freework_freeblock() is called.  If the journal has
6952	 * been written we can simply reclaim the journal space when the
6953	 * freeblks work is complete.
6954	 */
6955	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
6956	    &freeblks->fb_jwork);
6957	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
6958}
6959
6960
6961/*
6962 * Cancel a new block allocation.  May be an indirect or direct block.  We
6963 * remove it from various lists and return any journal record that needs to
6964 * be resolved by the caller.
6965 *
6966 * A special consideration is made for indirects which were never pointed
6967 * at on disk and will never be found once this block is released.
6968 */
6969static struct jnewblk *
6970cancel_newblk(newblk, wk, wkhd)
6971	struct newblk *newblk;
6972	struct worklist *wk;
6973	struct workhead *wkhd;
6974{
6975	struct jnewblk *jnewblk;
6976
6977	newblk->nb_state |= GOINGAWAY;
6978	/*
6979	 * Previously we traversed the completedhd on each indirdep
6980	 * attached to this newblk to cancel them and gather journal
6981	 * work.  Since we need only the oldest journal segment and
6982	 * the lowest point on the tree will always have the oldest
6983	 * journal segment, we are free to release the segments
6984	 * of any subordinates and may leave the indirdep list to
6985	 * indirdep_complete() when this newblk is freed.
6986	 */
6987	if (newblk->nb_state & ONDEPLIST) {
6988		newblk->nb_state &= ~ONDEPLIST;
6989		LIST_REMOVE(newblk, nb_deps);
6990	}
6991	if (newblk->nb_state & ONWORKLIST)
6992		WORKLIST_REMOVE(&newblk->nb_list);
6993	/*
6994	 * If the journal entry hasn't been written we save a pointer to
6995	 * the dependency that frees it until it is written or the
6996	 * superseding operation completes.
6997	 */
6998	jnewblk = newblk->nb_jnewblk;
6999	if (jnewblk != NULL && wk != NULL) {
7000		newblk->nb_jnewblk = NULL;
7001		jnewblk->jn_dep = wk;
7002	}
7003	if (!LIST_EMPTY(&newblk->nb_jwork))
7004		jwork_move(wkhd, &newblk->nb_jwork);
7005	/*
7006	 * When truncating we must free the newdirblk early to remove
7007	 * the pagedep from the hash before returning.
7008	 */
7009	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7010		free_newdirblk(WK_NEWDIRBLK(wk));
7011	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7012		panic("cancel_newblk: extra newdirblk");
7013
7014	return (jnewblk);
7015}
7016
7017/*
7018 * Schedule the freefrag associated with a newblk to be released once
7019 * the pointers are written and the previous block is no longer needed.
7020 */
7021static void
7022newblk_freefrag(newblk)
7023	struct newblk *newblk;
7024{
7025	struct freefrag *freefrag;
7026
7027	if (newblk->nb_freefrag == NULL)
7028		return;
7029	freefrag = newblk->nb_freefrag;
7030	newblk->nb_freefrag = NULL;
7031	freefrag->ff_state |= COMPLETE;
7032	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7033		add_to_worklist(&freefrag->ff_list, 0);
7034}
7035
7036/*
7037 * Free a newblk. Generate a new freefrag work request if appropriate.
7038 * This must be called after the inode pointer and any direct block pointers
7039 * are valid or fully removed via truncate or frag extension.
7040 */
7041static void
7042free_newblk(newblk)
7043	struct newblk *newblk;
7044{
7045	struct indirdep *indirdep;
7046	struct worklist *wk;
7047
7048	KASSERT(newblk->nb_jnewblk == NULL,
7049	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
7050	mtx_assert(&lk, MA_OWNED);
7051	newblk_freefrag(newblk);
7052	if (newblk->nb_state & ONDEPLIST)
7053		LIST_REMOVE(newblk, nb_deps);
7054	if (newblk->nb_state & ONWORKLIST)
7055		WORKLIST_REMOVE(&newblk->nb_list);
7056	LIST_REMOVE(newblk, nb_hash);
7057	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7058		free_newdirblk(WK_NEWDIRBLK(wk));
7059	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7060		panic("free_newblk: extra newdirblk");
7061	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7062		indirdep_complete(indirdep);
7063	handle_jwork(&newblk->nb_jwork);
7064	newblk->nb_list.wk_type = D_NEWBLK;
7065	WORKITEM_FREE(newblk, D_NEWBLK);
7066}
7067
7068/*
7069 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7070 * This routine must be called with splbio interrupts blocked.
7071 */
7072static void
7073free_newdirblk(newdirblk)
7074	struct newdirblk *newdirblk;
7075{
7076	struct pagedep *pagedep;
7077	struct diradd *dap;
7078	struct worklist *wk;
7079
7080	mtx_assert(&lk, MA_OWNED);
7081	WORKLIST_REMOVE(&newdirblk->db_list);
7082	/*
7083	 * If the pagedep is still linked onto the directory buffer
7084	 * dependency chain, then some of the entries on the
7085	 * pd_pendinghd list may not be committed to disk yet. In
7086	 * this case, we will simply clear the NEWBLOCK flag and
7087	 * let the pd_pendinghd list be processed when the pagedep
7088	 * is next written. If the pagedep is no longer on the buffer
7089	 * dependency chain, then all the entries on the pd_pending
7090	 * list are committed to disk and we can free them here.
7091	 */
7092	pagedep = newdirblk->db_pagedep;
7093	pagedep->pd_state &= ~NEWBLOCK;
7094	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7095		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7096			free_diradd(dap, NULL);
7097		/*
7098		 * If no dependencies remain, the pagedep will be freed.
7099		 */
7100		free_pagedep(pagedep);
7101	}
7102	/* Should only ever be one item in the list. */
7103	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7104		WORKLIST_REMOVE(wk);
7105		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7106	}
7107	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7108}
7109
7110/*
7111 * Prepare an inode to be freed. The actual free operation is not
7112 * done until the zero'ed inode has been written to disk.
7113 */
7114void
7115softdep_freefile(pvp, ino, mode)
7116	struct vnode *pvp;
7117	ino_t ino;
7118	int mode;
7119{
7120	struct inode *ip = VTOI(pvp);
7121	struct inodedep *inodedep;
7122	struct freefile *freefile;
7123	struct freeblks *freeblks;
7124
7125	/*
7126	 * This sets up the inode de-allocation dependency.
7127	 */
7128	freefile = malloc(sizeof(struct freefile),
7129		M_FREEFILE, M_SOFTDEP_FLAGS);
7130	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7131	freefile->fx_mode = mode;
7132	freefile->fx_oldinum = ino;
7133	freefile->fx_devvp = ip->i_devvp;
7134	LIST_INIT(&freefile->fx_jwork);
7135	UFS_LOCK(ip->i_ump);
7136	ip->i_fs->fs_pendinginodes += 1;
7137	UFS_UNLOCK(ip->i_ump);
7138
7139	/*
7140	 * If the inodedep does not exist, then the zero'ed inode has
7141	 * been written to disk. If the allocated inode has never been
7142	 * written to disk, then the on-disk inode is zero'ed. In either
7143	 * case we can free the file immediately.  If the journal was
7144	 * canceled before being written the inode will never make it to
7145	 * disk and we must send the canceled journal entries to
7146	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7147	 * Any blocks waiting on the inode to write can be safely freed
7148	 * here as it will never be written.
7149	 */
7150	ACQUIRE_LOCK(&lk);
7151	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7152	if (inodedep) {
7153		/*
7154		 * Clear out freeblks that no longer need to reference
7155		 * this inode.
7156		 */
7157		while ((freeblks =
7158		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7159			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7160			    fb_next);
7161			freeblks->fb_state &= ~ONDEPLIST;
7162		}
7163		/*
7164		 * Remove this inode from the unlinked list.
7165		 */
7166		if (inodedep->id_state & UNLINKED) {
7167			/*
7168			 * Save the journal work to be freed with the bitmap
7169			 * before we clear UNLINKED.  Otherwise it can be lost
7170			 * if the inode block is written.
7171			 */
7172			handle_bufwait(inodedep, &freefile->fx_jwork);
7173			clear_unlinked_inodedep(inodedep);
7174			/* Re-acquire inodedep as we've dropped lk. */
7175			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7176		}
7177	}
7178	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7179		FREE_LOCK(&lk);
7180		handle_workitem_freefile(freefile);
7181		return;
7182	}
7183	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7184		inodedep->id_state |= GOINGAWAY;
7185	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7186	FREE_LOCK(&lk);
7187	if (ip->i_number == ino)
7188		ip->i_flag |= IN_MODIFIED;
7189}
7190
7191/*
7192 * Check to see if an inode has never been written to disk. If
7193 * so free the inodedep and return success, otherwise return failure.
7194 * This routine must be called with splbio interrupts blocked.
7195 *
7196 * If we still have a bitmap dependency, then the inode has never
7197 * been written to disk. Drop the dependency as it is no longer
7198 * necessary since the inode is being deallocated. We set the
7199 * ALLCOMPLETE flags since the bitmap now properly shows that the
7200 * inode is not allocated. Even if the inode is actively being
7201 * written, it has been rolled back to its zero'ed state, so we
7202 * are ensured that a zero inode is what is on the disk. For short
7203 * lived files, this change will usually result in removing all the
7204 * dependencies from the inode so that it can be freed immediately.
7205 */
7206static int
7207check_inode_unwritten(inodedep)
7208	struct inodedep *inodedep;
7209{
7210
7211	mtx_assert(&lk, MA_OWNED);
7212
7213	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7214	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7215	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7216	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7217	    !LIST_EMPTY(&inodedep->id_inowait) ||
7218	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7219	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7220	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7221	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7222	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7223	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7224	    inodedep->id_mkdiradd != NULL ||
7225	    inodedep->id_nlinkdelta != 0)
7226		return (0);
7227	/*
7228	 * Another process might be in initiate_write_inodeblock_ufs[12]
7229	 * trying to allocate memory without holding "Softdep Lock".
7230	 */
7231	if ((inodedep->id_state & IOSTARTED) != 0 &&
7232	    inodedep->id_savedino1 == NULL)
7233		return (0);
7234
7235	if (inodedep->id_state & ONDEPLIST)
7236		LIST_REMOVE(inodedep, id_deps);
7237	inodedep->id_state &= ~ONDEPLIST;
7238	inodedep->id_state |= ALLCOMPLETE;
7239	inodedep->id_bmsafemap = NULL;
7240	if (inodedep->id_state & ONWORKLIST)
7241		WORKLIST_REMOVE(&inodedep->id_list);
7242	if (inodedep->id_savedino1 != NULL) {
7243		free(inodedep->id_savedino1, M_SAVEDINO);
7244		inodedep->id_savedino1 = NULL;
7245	}
7246	if (free_inodedep(inodedep) == 0)
7247		panic("check_inode_unwritten: busy inode");
7248	return (1);
7249}
7250
7251/*
7252 * Try to free an inodedep structure. Return 1 if it could be freed.
7253 */
7254static int
7255free_inodedep(inodedep)
7256	struct inodedep *inodedep;
7257{
7258
7259	mtx_assert(&lk, MA_OWNED);
7260	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7261	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7262	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7263	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7264	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7265	    !LIST_EMPTY(&inodedep->id_inowait) ||
7266	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7267	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7268	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7269	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7270	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7271	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7272	    inodedep->id_mkdiradd != NULL ||
7273	    inodedep->id_nlinkdelta != 0 ||
7274	    inodedep->id_savedino1 != NULL)
7275		return (0);
7276	if (inodedep->id_state & ONDEPLIST)
7277		LIST_REMOVE(inodedep, id_deps);
7278	LIST_REMOVE(inodedep, id_hash);
7279	WORKITEM_FREE(inodedep, D_INODEDEP);
7280	return (1);
7281}
7282
7283/*
7284 * Free the block referenced by a freework structure.  The parent freeblks
7285 * structure is released and completed when the final cg bitmap reaches
7286 * the disk.  This routine may be freeing a jnewblk which never made it to
7287 * disk, in which case we do not have to wait as the operation is undone
7288 * in memory immediately.
7289 */
7290static void
7291freework_freeblock(freework)
7292	struct freework *freework;
7293{
7294	struct freeblks *freeblks;
7295	struct jnewblk *jnewblk;
7296	struct ufsmount *ump;
7297	struct workhead wkhd;
7298	struct fs *fs;
7299	int bsize;
7300	int needj;
7301
7302	mtx_assert(&lk, MA_OWNED);
7303	/*
7304	 * Handle partial truncate separately.
7305	 */
7306	if (freework->fw_indir) {
7307		complete_trunc_indir(freework);
7308		return;
7309	}
7310	freeblks = freework->fw_freeblks;
7311	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7312	fs = ump->um_fs;
7313	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7314	bsize = lfragtosize(fs, freework->fw_frags);
7315	LIST_INIT(&wkhd);
7316	/*
7317	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7318	 * on the indirblk hashtable and prevents premature freeing.
7319	 */
7320	freework->fw_state |= DEPCOMPLETE;
7321	/*
7322	 * SUJ needs to wait for the segment referencing freed indirect
7323	 * blocks to expire so that we know the checker will not confuse
7324	 * a re-allocated indirect block with its old contents.
7325	 */
7326	if (needj && freework->fw_lbn <= -NDADDR)
7327		indirblk_insert(freework);
7328	/*
7329	 * If we are canceling an existing jnewblk pass it to the free
7330	 * routine, otherwise pass the freeblk which will ultimately
7331	 * release the freeblks.  If we're not journaling, we can just
7332	 * free the freeblks immediately.
7333	 */
7334	jnewblk = freework->fw_jnewblk;
7335	if (jnewblk != NULL) {
7336		cancel_jnewblk(jnewblk, &wkhd);
7337		needj = 0;
7338	} else if (needj) {
7339		freework->fw_state |= DELAYEDFREE;
7340		freeblks->fb_cgwait++;
7341		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7342	}
7343	FREE_LOCK(&lk);
7344	freeblks_free(ump, freeblks, btodb(bsize));
7345	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7346	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7347	ACQUIRE_LOCK(&lk);
7348	/*
7349	 * The jnewblk will be discarded and the bits in the map never
7350	 * made it to disk.  We can immediately free the freeblk.
7351	 */
7352	if (needj == 0)
7353		handle_written_freework(freework);
7354}
7355
7356/*
7357 * We enqueue freework items that need processing back on the freeblks and
7358 * add the freeblks to the worklist.  This makes it easier to find all work
7359 * required to flush a truncation in process_truncates().
7360 */
7361static void
7362freework_enqueue(freework)
7363	struct freework *freework;
7364{
7365	struct freeblks *freeblks;
7366
7367	freeblks = freework->fw_freeblks;
7368	if ((freework->fw_state & INPROGRESS) == 0)
7369		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7370	if ((freeblks->fb_state &
7371	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7372	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7373		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7374}
7375
7376/*
7377 * Start, continue, or finish the process of freeing an indirect block tree.
7378 * The free operation may be paused at any point with fw_off containing the
7379 * offset to restart from.  This enables us to implement some flow control
7380 * for large truncates which may fan out and generate a huge number of
7381 * dependencies.
7382 */
7383static void
7384handle_workitem_indirblk(freework)
7385	struct freework *freework;
7386{
7387	struct freeblks *freeblks;
7388	struct ufsmount *ump;
7389	struct fs *fs;
7390
7391	freeblks = freework->fw_freeblks;
7392	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7393	fs = ump->um_fs;
7394	if (freework->fw_state & DEPCOMPLETE) {
7395		handle_written_freework(freework);
7396		return;
7397	}
7398	if (freework->fw_off == NINDIR(fs)) {
7399		freework_freeblock(freework);
7400		return;
7401	}
7402	freework->fw_state |= INPROGRESS;
7403	FREE_LOCK(&lk);
7404	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7405	    freework->fw_lbn);
7406	ACQUIRE_LOCK(&lk);
7407}
7408
7409/*
7410 * Called when a freework structure attached to a cg buf is written.  The
7411 * ref on either the parent or the freeblks structure is released and
7412 * the freeblks is added back to the worklist if there is more work to do.
7413 */
7414static void
7415handle_written_freework(freework)
7416	struct freework *freework;
7417{
7418	struct freeblks *freeblks;
7419	struct freework *parent;
7420
7421	freeblks = freework->fw_freeblks;
7422	parent = freework->fw_parent;
7423	if (freework->fw_state & DELAYEDFREE)
7424		freeblks->fb_cgwait--;
7425	freework->fw_state |= COMPLETE;
7426	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7427		WORKITEM_FREE(freework, D_FREEWORK);
7428	if (parent) {
7429		if (--parent->fw_ref == 0)
7430			freework_enqueue(parent);
7431		return;
7432	}
7433	if (--freeblks->fb_ref != 0)
7434		return;
7435	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7436	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7437		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7438}
7439
7440/*
7441 * This workitem routine performs the block de-allocation.
7442 * The workitem is added to the pending list after the updated
7443 * inode block has been written to disk.  As mentioned above,
7444 * checks regarding the number of blocks de-allocated (compared
7445 * to the number of blocks allocated for the file) are also
7446 * performed in this function.
7447 */
7448static int
7449handle_workitem_freeblocks(freeblks, flags)
7450	struct freeblks *freeblks;
7451	int flags;
7452{
7453	struct freework *freework;
7454	struct newblk *newblk;
7455	struct allocindir *aip;
7456	struct ufsmount *ump;
7457	struct worklist *wk;
7458
7459	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7460	    ("handle_workitem_freeblocks: Journal entries not written."));
7461	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7462	ACQUIRE_LOCK(&lk);
7463	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7464		WORKLIST_REMOVE(wk);
7465		switch (wk->wk_type) {
7466		case D_DIRREM:
7467			wk->wk_state |= COMPLETE;
7468			add_to_worklist(wk, 0);
7469			continue;
7470
7471		case D_ALLOCDIRECT:
7472			free_newblk(WK_NEWBLK(wk));
7473			continue;
7474
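		/*
		 * An allocindir marked DELAYEDFREE had its block pointer
		 * zeroed in cancel_allocindir() because it never existed on
		 * disk; build a freework so the block itself is released and
		 * any pending jnewblk is retired along with it.
		 */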
7475		case D_ALLOCINDIR:
7476			aip = WK_ALLOCINDIR(wk);
7477			freework = NULL;
7478			if (aip->ai_state & DELAYEDFREE) {
7479				FREE_LOCK(&lk);
7480				freework = newfreework(ump, freeblks, NULL,
7481				    aip->ai_lbn, aip->ai_newblkno,
7482				    ump->um_fs->fs_frag, 0, 0);
7483				ACQUIRE_LOCK(&lk);
7484			}
7485			newblk = WK_NEWBLK(wk);
7486			if (newblk->nb_jnewblk) {
7487				freework->fw_jnewblk = newblk->nb_jnewblk;
7488				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7489				newblk->nb_jnewblk = NULL;
7490			}
7491			free_newblk(newblk);
7492			continue;
7493
7494		case D_FREEWORK:
7495			freework = WK_FREEWORK(wk);
7496			if (freework->fw_lbn <= -NDADDR)
7497				handle_workitem_indirblk(freework);
7498			else
7499				freework_freeblock(freework);
7500			continue;
7501		default:
7502			panic("handle_workitem_freeblocks: Unknown type %s",
7503			    TYPENAME(wk->wk_type));
7504		}
7505	}
7506	if (freeblks->fb_ref != 0) {
7507		freeblks->fb_state &= ~INPROGRESS;
7508		wake_worklist(&freeblks->fb_list);
7509		freeblks = NULL;
7510	}
7511	FREE_LOCK(&lk);
7512	if (freeblks)
7513		return handle_complete_freeblocks(freeblks, flags);
7514	return (0);
7515}
7516
7517/*
7518 * Handle completion of block free via truncate.  This allows fs_pendingblocks
7519 * to track the actual free block count more closely than if we only updated
7520 * it at the end.  We must be careful to handle cases where the block count
7521 * on free was incorrect.
7522 */
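/*
 * A worked example of the accounting below (values are illustrative
 * only): if the truncation charged 100 blocks at setup time
 * (fb_chkcnt == -100 and fs_pendingblocks was raised by 100) and this
 * call releases 30 of them, then remain is 100, fb_chkcnt becomes -70
 * and fs_pendingblocks drops by 30.  Any discrepancy left when all of
 * the freework items are done is reconciled in
 * handle_complete_freeblocks().
 */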
7523static void
7524freeblks_free(ump, freeblks, blocks)
7525	struct ufsmount *ump;
7526	struct freeblks *freeblks;
7527	int blocks;
7528{
7529	struct fs *fs;
7530	ufs2_daddr_t remain;
7531
7532	UFS_LOCK(ump);
7533	remain = -freeblks->fb_chkcnt;
7534	freeblks->fb_chkcnt += blocks;
7535	if (remain > 0) {
7536		if (remain < blocks)
7537			blocks = remain;
7538		fs = ump->um_fs;
7539		fs->fs_pendingblocks -= blocks;
7540	}
7541	UFS_UNLOCK(ump);
7542}
7543
7544/*
7545 * Once all of the freework workitems are complete we can retire the
7546 * freeblocks dependency and any journal work awaiting completion.  This
7547 * can not be called until all other dependencies are stable on disk.
7548 */
7549static int
7550handle_complete_freeblocks(freeblks, flags)
7551	struct freeblks *freeblks;
7552	int flags;
7553{
7554	struct inodedep *inodedep;
7555	struct inode *ip;
7556	struct vnode *vp;
7557	struct fs *fs;
7558	struct ufsmount *ump;
7559	ufs2_daddr_t spare;
7560
7561	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7562	fs = ump->um_fs;
7563	flags = LK_EXCLUSIVE | flags;
7564	spare = freeblks->fb_chkcnt;
7565
7566	/*
7567	 * If we did not release the expected number of blocks we may have
7568	 * to adjust the inode block count here.  Only do so if it wasn't
7569	 * a truncation to zero and the modrev still matches.
7570	 */
7571	if (spare && freeblks->fb_len != 0) {
7572		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7573		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7574			return (EBUSY);
7575		ip = VTOI(vp);
7576		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7577			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7578			ip->i_flag |= IN_CHANGE;
7579			/*
7580			 * We must wait so this happens before the
7581			 * journal is reclaimed.
7582			 */
7583			ffs_update(vp, 1);
7584		}
7585		vput(vp);
7586	}
7587	if (spare < 0) {
7588		UFS_LOCK(ump);
7589		fs->fs_pendingblocks += spare;
7590		UFS_UNLOCK(ump);
7591	}
7592#ifdef QUOTA
7593	/* Handle spare. */
7594	if (spare)
7595		quotaadj(freeblks->fb_quota, ump, -spare);
7596	quotarele(freeblks->fb_quota);
7597#endif
7598	ACQUIRE_LOCK(&lk);
7599	if (freeblks->fb_state & ONDEPLIST) {
7600		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7601		    0, &inodedep);
7602		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
7603		freeblks->fb_state &= ~ONDEPLIST;
7604		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
7605			free_inodedep(inodedep);
7606	}
7607	/*
7608	 * All of the freeblock deps must be complete prior to this call
7609	 * so it's now safe to complete earlier outstanding journal entries.
7610	 */
7611	handle_jwork(&freeblks->fb_jwork);
7612	WORKITEM_FREE(freeblks, D_FREEBLKS);
7613	FREE_LOCK(&lk);
7614	return (0);
7615}
7616
7617/*
7618 * Release blocks associated with the freeblks and stored in the indirect
7619 * block dbn. If level is greater than SINGLE, the block is an indirect block
7620 * and recursive calls to indir_trunc must be used to cleanse other indirect
7621 * blocks.
7622 *
7623 * This handles partial and complete truncation of blocks.  Partial is noted
7624 * with goingaway == 0.  In this case the freework is completed after the
7625 * zero'd indirects are written to disk.  For full truncation the freework
7626 * is completed after the block is freed.
7627 */
7628static void
7629indir_trunc(freework, dbn, lbn)
7630	struct freework *freework;
7631	ufs2_daddr_t dbn;
7632	ufs_lbn_t lbn;
7633{
7634	struct freework *nfreework;
7635	struct workhead wkhd;
7636	struct freeblks *freeblks;
7637	struct buf *bp;
7638	struct fs *fs;
7639	struct indirdep *indirdep;
7640	struct ufsmount *ump;
7641	ufs1_daddr_t *bap1 = 0;
7642	ufs2_daddr_t nb, nnb, *bap2 = 0;
7643	ufs_lbn_t lbnadd, nlbn;
7644	int i, nblocks, ufs1fmt;
7645	int freedblocks;
7646	int goingaway;
7647	int freedeps;
7648	int needj;
7649	int level;
7650	int cnt;
7651
7652	freeblks = freework->fw_freeblks;
7653	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7654	fs = ump->um_fs;
7655	/*
7656	 * Get buffer of block pointers to be freed.  There are three cases:
7657	 *
7658	 * 1) Partial truncate caches the indirdep pointer in the freework
7659	 *    which provides us a back copy to the save bp which holds the
7660	 *    pointers we want to clear.  When this completes the zero
7661	 *    pointers are written to the real copy.
7662	 * 2) The indirect is being completely truncated, cancel_indirdep()
7663	 *    eliminated the real copy and placed the indirdep on the saved
7664	 *    copy.  The indirdep and buf are discarded when this completes.
7665	 * 3) The indirect was not in memory, we read a copy off of the disk
7666	 *    using the devvp and drop and invalidate the buffer when we're
7667	 *    done.
7668	 */
7669	goingaway = 1;
7670	indirdep = NULL;
7671	if (freework->fw_indir != NULL) {
7672		goingaway = 0;
7673		indirdep = freework->fw_indir;
7674		bp = indirdep->ir_savebp;
7675		if (bp == NULL || bp->b_blkno != dbn)
7676			panic("indir_trunc: Bad saved buf %p blkno %jd",
7677			    bp, (intmax_t)dbn);
7678	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
7679		/*
7680		 * The lock prevents the buf dep list from changing and
7681		 * indirects on devvp should only ever have one dependency.
7682		 */
7683		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
7684		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
7685			panic("indir_trunc: Bad indirdep %p from buf %p",
7686			    indirdep, bp);
7687	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
7688	    NOCRED, &bp) != 0) {
7689		brelse(bp);
7690		return;
7691	}
7692	ACQUIRE_LOCK(&lk);
7693	/* Protects against a race with complete_trunc_indir(). */
7694	freework->fw_state &= ~INPROGRESS;
7695	/*
7696	 * If we have an indirdep we need to enforce the truncation order
7697	 * and discard it when it is complete.
7698	 */
7699	if (indirdep) {
7700		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
7701		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
7702			/*
7703			 * Add the complete truncate to the list on the
7704			 * indirdep to enforce in-order processing.
7705			 */
7706			if (freework->fw_indir == NULL)
7707				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
7708				    freework, fw_next);
7709			FREE_LOCK(&lk);
7710			return;
7711		}
7712		/*
7713		 * If we're goingaway, free the indirdep.  Otherwise it will
7714		 * linger until the write completes.
7715		 */
7716		if (goingaway) {
7717			free_indirdep(indirdep);
7718			ump->um_numindirdeps -= 1;
7719		}
7720	}
7721	FREE_LOCK(&lk);
7722	/* Initialize pointers depending on the inode format (UFS1 or UFS2). */
7723	if (ump->um_fstype == UFS1) {
7724		bap1 = (ufs1_daddr_t *)bp->b_data;
7725		nb = bap1[freework->fw_off];
7726		ufs1fmt = 1;
7727	} else {
7728		bap2 = (ufs2_daddr_t *)bp->b_data;
7729		nb = bap2[freework->fw_off];
7730		ufs1fmt = 0;
7731	}
7732	level = lbn_level(lbn);
7733	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
7734	lbnadd = lbn_offset(fs, level);
7735	nblocks = btodb(fs->fs_bsize);
7736	nfreework = freework;
7737	freedeps = 0;
7738	cnt = 0;
7739	/*
7740	 * Reclaim blocks.  Traverse into nested indirect levels and, when
7741	 * journaling, arrange for the current level to be freed only after
7742	 * its subordinates are free.
7743	 */
7744	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
7745		if (i != NINDIR(fs) - 1) {
7746			if (ufs1fmt)
7747				nnb = bap1[i+1];
7748			else
7749				nnb = bap2[i+1];
7750		} else
7751			nnb = 0;
7752		if (nb == 0)
7753			continue;
7754		cnt++;
7755		if (level != 0) {
7756			nlbn = (lbn + 1) - (i * lbnadd);
7757			if (needj != 0) {
7758				nfreework = newfreework(ump, freeblks, freework,
7759				    nlbn, nb, fs->fs_frag, 0, 0);
7760				freedeps++;
7761			}
7762			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
7763		} else {
7764			struct freedep *freedep;
7765
7766			/*
7767			 * Attempt to aggregate freedep dependencies for
7768			 * all blocks being released to the same CG.
7769			 */
7770			LIST_INIT(&wkhd);
7771			if (needj != 0 &&
7772			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
7773				freedep = newfreedep(freework);
7774				WORKLIST_INSERT_UNLOCKED(&wkhd,
7775				    &freedep->fd_list);
7776				freedeps++;
7777			}
7778			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
7779			    fs->fs_bsize, freeblks->fb_inum,
7780			    freeblks->fb_vtype, &wkhd);
7781		}
7782	}
7783	if (goingaway) {
7784		bp->b_flags |= B_INVAL | B_NOCACHE;
7785		brelse(bp);
7786	}
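	/*
	 * Account for what was freed above: data blocks are counted only
	 * at level zero, and the indirect block itself is counted only
	 * when it is freed immediately below (the non-journaling case).
	 */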
7787	freedblocks = 0;
7788	if (level == 0)
7789		freedblocks = (nblocks * cnt);
7790	if (needj == 0)
7791		freedblocks += nblocks;
7792	freeblks_free(ump, freeblks, freedblocks);
7793	/*
7794	 * If we are journaling set up the ref counts and offset so this
7795	 * indirect can be completed when its children are free.
7796	 */
7797	if (needj) {
7798		ACQUIRE_LOCK(&lk);
7799		freework->fw_off = i;
7800		freework->fw_ref += freedeps;
7801		freework->fw_ref -= NINDIR(fs) + 1;
7802		if (level == 0)
7803			freeblks->fb_cgwait += freedeps;
7804		if (freework->fw_ref == 0)
7805			freework_freeblock(freework);
7806		FREE_LOCK(&lk);
7807		return;
7808	}
7809	/*
7810	 * If we're not journaling we can free the indirect now.
7811	 */
7812	dbn = dbtofsb(fs, dbn);
7813	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
7814	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
7815	/* Non-SUJ softdep does single-threaded truncations. */
7816	if (freework->fw_blkno == dbn) {
7817		freework->fw_state |= ALLCOMPLETE;
7818		ACQUIRE_LOCK(&lk);
7819		handle_written_freework(freework);
7820		FREE_LOCK(&lk);
7821	}
7822	return;
7823}
7824
7825/*
7826 * Cancel an allocindir when it is removed via truncation.  When bp is not
7827 * NULL the indirect never appeared on disk and is scheduled to be freed
7828 * independently of the indir so we can more easily track journal work.
7829 */
7830static void
7831cancel_allocindir(aip, bp, freeblks, trunc)
7832	struct allocindir *aip;
7833	struct buf *bp;
7834	struct freeblks *freeblks;
7835	int trunc;
7836{
7837	struct indirdep *indirdep;
7838	struct freefrag *freefrag;
7839	struct newblk *newblk;
7840
7841	newblk = (struct newblk *)aip;
7842	LIST_REMOVE(aip, ai_next);
7843	/*
7844	 * We must eliminate the pointer in bp if it must be freed on its
7845	 * own due to partial truncate or pending journal work.
7846	 */
7847	if (bp && (trunc || newblk->nb_jnewblk)) {
7848		/*
7849		 * Clear the pointer and mark the aip to be freed
7850		 * directly if it never existed on disk.
7851		 */
7852		aip->ai_state |= DELAYEDFREE;
7853		indirdep = aip->ai_indirdep;
7854		if (indirdep->ir_state & UFS1FMT)
7855			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
7856		else
7857			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
7858	}
7859	/*
7860	 * When truncating the previous pointer will be freed via
7861	 * savedbp.  Eliminate the freefrag, which would otherwise free it twice.
7862	 */
7863	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
7864		newblk->nb_freefrag = NULL;
7865		if (freefrag->ff_jdep)
7866			cancel_jfreefrag(
7867			    WK_JFREEFRAG(freefrag->ff_jdep));
7868		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
7869		WORKITEM_FREE(freefrag, D_FREEFRAG);
7870	}
7871	/*
7872	 * If the journal hasn't been written the jnewblk must be passed
7873	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7874	 * this by leaving the journal dependency on the newblk to be freed
7875	 * when a freework is created in handle_workitem_freeblocks().
7876	 */
7877	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
7878	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7879}
7880
7881/*
7882 * Create the mkdir dependencies for . and .. in a new directory.  Link them
7883 * in to a newdirblk so any subsequent additions are tracked properly.  The
7884 * caller is responsible for adding the mkdir1 dependency to the journal
7885 * and updating id_mkdiradd.  This function returns with lk held.
7886 */
7887static struct mkdir *
7888setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
7889	struct diradd *dap;
7890	ino_t newinum;
7891	ino_t dinum;
7892	struct buf *newdirbp;
7893	struct mkdir **mkdirp;
7894{
7895	struct newblk *newblk;
7896	struct pagedep *pagedep;
7897	struct inodedep *inodedep;
7898	struct newdirblk *newdirblk = 0;
7899	struct mkdir *mkdir1, *mkdir2;
7900	struct worklist *wk;
7901	struct jaddref *jaddref;
7902	struct mount *mp;
7903
7904	mp = dap->da_list.wk_mp;
7905	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
7906	    M_SOFTDEP_FLAGS);
7907	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
7908	LIST_INIT(&newdirblk->db_mkdir);
7909	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
7910	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
7911	mkdir1->md_state = ATTACHED | MKDIR_BODY;
7912	mkdir1->md_diradd = dap;
7913	mkdir1->md_jaddref = NULL;
7914	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
7915	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
7916	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
7917	mkdir2->md_diradd = dap;
7918	mkdir2->md_jaddref = NULL;
7919	if (MOUNTEDSUJ(mp) == 0) {
7920		mkdir1->md_state |= DEPCOMPLETE;
7921		mkdir2->md_state |= DEPCOMPLETE;
7922	}
7923	/*
7924	 * Dependency on "." and ".." being written to disk.
7925	 */
7926	mkdir1->md_buf = newdirbp;
7927	ACQUIRE_LOCK(&lk);
7928	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
7929	/*
7930	 * We must link the pagedep, allocdirect, and newdirblk for
7931	 * the initial file page so the pointer to the new directory
7932	 * is not written until the directory contents are live and
7933	 * any subsequent additions are not marked live until the
7934	 * block is reachable via the inode.
7935	 */
7936	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
7937		panic("setup_newdir: lost pagedep");
7938	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
7939		if (wk->wk_type == D_ALLOCDIRECT)
7940			break;
7941	if (wk == NULL)
7942		panic("setup_newdir: lost allocdirect");
7943	if (pagedep->pd_state & NEWBLOCK)
7944		panic("setup_newdir: NEWBLOCK already set");
7945	newblk = WK_NEWBLK(wk);
7946	pagedep->pd_state |= NEWBLOCK;
7947	pagedep->pd_newdirblk = newdirblk;
7948	newdirblk->db_pagedep = pagedep;
7949	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
7950	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
7951	/*
7952	 * Look up the inodedep for the parent directory so that we
7953	 * can link mkdir2 into the pending dotdot jaddref or
7954	 * the inode write if there is none.  If the inode is
7955	 * ALLCOMPLETE and no jaddref is present all dependencies have
7956	 * been satisfied and mkdir2 can be freed.
7957	 */
7958	inodedep_lookup(mp, dinum, 0, &inodedep);
7959	if (MOUNTEDSUJ(mp)) {
7960		if (inodedep == NULL)
7961			panic("setup_newdir: Lost parent.");
7962		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
7963		    inoreflst);
7964		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
7965		    (jaddref->ja_state & MKDIR_PARENT),
7966		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
7967		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
7968		mkdir2->md_jaddref = jaddref;
7969		jaddref->ja_mkdir = mkdir2;
7970	} else if (inodedep == NULL ||
7971	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
7972		dap->da_state &= ~MKDIR_PARENT;
7973		WORKITEM_FREE(mkdir2, D_MKDIR);
7974	} else {
7975		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
7976		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
7977	}
7978	*mkdirp = mkdir2;
7979
7980	return (mkdir1);
7981}
7982
7983/*
7984 * Directory entry addition dependencies.
7985 *
7986 * When adding a new directory entry, the inode (with its incremented link
7987 * count) must be written to disk before the directory entry's pointer to it.
7988 * Also, if the inode is newly allocated, the corresponding freemap must be
7989 * updated (on disk) before the directory entry's pointer. These requirements
7990 * are met via undo/redo on the directory entry's pointer, which consists
7991 * simply of the inode number.
7992 *
7993 * As directory entries are added and deleted, the free space within a
7994 * directory block can become fragmented.  The ufs filesystem will compact
7995 * a fragmented directory block to make space for a new entry. When this
7996 * occurs, the offsets of previously added entries change. Any "diradd"
7997 * dependency structures corresponding to these entries must be updated with
7998 * the new offsets.
7999 */
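/*
 * As an illustration of the undo/redo scheme described above: if the
 * directory block happens to be written before the newly allocated
 * inode reaches the disk, the entry's inode number is rolled back to
 * zero for the duration of that write and restored afterwards, so an
 * on-disk directory never names an inode that is not yet initialized.
 */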
8000
8001/*
8002 * This routine is called after the in-memory inode's link
8003 * count has been incremented, but before the directory entry's
8004 * pointer to the inode has been set.
8005 */
8006int
8007softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8008	struct buf *bp;		/* buffer containing directory block */
8009	struct inode *dp;	/* inode for directory */
8010	off_t diroffset;	/* offset of new entry in directory */
8011	ino_t newinum;		/* inode referenced by new directory entry */
8012	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8013	int isnewblk;		/* entry is in a newly allocated block */
8014{
8015	int offset;		/* offset of new entry within directory block */
8016	ufs_lbn_t lbn;		/* block in directory containing new entry */
8017	struct fs *fs;
8018	struct diradd *dap;
8019	struct newblk *newblk;
8020	struct pagedep *pagedep;
8021	struct inodedep *inodedep;
8022	struct newdirblk *newdirblk = 0;
8023	struct mkdir *mkdir1, *mkdir2;
8024	struct jaddref *jaddref;
8025	struct mount *mp;
8026	int isindir;
8027
8028	/*
8029	 * Whiteouts have no dependencies.
8030	 */
8031	if (newinum == WINO) {
8032		if (newdirbp != NULL)
8033			bdwrite(newdirbp);
8034		return (0);
8035	}
8036	jaddref = NULL;
8037	mkdir1 = mkdir2 = NULL;
8038	mp = UFSTOVFS(dp->i_ump);
8039	fs = dp->i_fs;
8040	lbn = lblkno(fs, diroffset);
8041	offset = blkoff(fs, diroffset);
8042	dap = malloc(sizeof(struct diradd), M_DIRADD,
8043		M_SOFTDEP_FLAGS|M_ZERO);
8044	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8045	dap->da_offset = offset;
8046	dap->da_newinum = newinum;
8047	dap->da_state = ATTACHED;
8048	LIST_INIT(&dap->da_jwork);
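	/*
	 * A newdirblk is only needed when this entry begins a newly
	 * allocated directory block.  Blocks in the indirect range are
	 * always full-sized, while a new block in the direct range may
	 * start on a fragment boundary, hence the two different offset
	 * tests below.
	 */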
8049	isindir = bp->b_lblkno >= NDADDR;
8050	if (isnewblk &&
8051	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8052		newdirblk = malloc(sizeof(struct newdirblk),
8053		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8054		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8055		LIST_INIT(&newdirblk->db_mkdir);
8056	}
8057	/*
8058	 * If we're creating a new directory setup the dependencies and set
8059	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8060	 * we can move on.
8061	 */
8062	if (newdirbp == NULL) {
8063		dap->da_state |= DEPCOMPLETE;
8064		ACQUIRE_LOCK(&lk);
8065	} else {
8066		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8067		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8068		    &mkdir2);
8069	}
8070	/*
8071	 * Link into parent directory pagedep to await its being written.
8072	 */
8073	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8074#ifdef DEBUG
8075	if (diradd_lookup(pagedep, offset) != NULL)
8076		panic("softdep_setup_directory_add: %p already at off %d\n",
8077		    diradd_lookup(pagedep, offset), offset);
8078#endif
8079	dap->da_pagedep = pagedep;
8080	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8081	    da_pdlist);
8082	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8083	/*
8084	 * If we're journaling, link the diradd into the jaddref so it
8085	 * may be completed after the journal entry is written.  Otherwise,
8086	 * link the diradd into its inodedep.  If the inode is not yet
8087	 * written, place it on the bufwait list; otherwise do the post-inode
8088	 * write processing to put it on the id_pendinghd list.
8089	 */
8090	if (MOUNTEDSUJ(mp)) {
8091		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8092		    inoreflst);
8093		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8094		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8095		jaddref->ja_diroff = diroffset;
8096		jaddref->ja_diradd = dap;
8097		add_to_journal(&jaddref->ja_list);
8098	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8099		diradd_inode_written(dap, inodedep);
8100	else
8101		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8102	/*
8103	 * Add the journal entries for . and .. links now that the primary
8104	 * link is written.
8105	 */
8106	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8107		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8108		    inoreflst, if_deps);
8109		KASSERT(jaddref != NULL &&
8110		    jaddref->ja_ino == jaddref->ja_parent &&
8111		    (jaddref->ja_state & MKDIR_BODY),
8112		    ("softdep_setup_directory_add: bad dot jaddref %p",
8113		    jaddref));
8114		mkdir1->md_jaddref = jaddref;
8115		jaddref->ja_mkdir = mkdir1;
8116		/*
8117		 * It is important that the dotdot journal entry
8118		 * is added prior to the dot entry since dot writes
8119		 * both the dot and dotdot links.  These both must
8120		 * be added after the primary link for the journal
8121		 * to remain consistent.
8122		 */
8123		add_to_journal(&mkdir2->md_jaddref->ja_list);
8124		add_to_journal(&jaddref->ja_list);
8125	}
8126	/*
8127	 * If we are adding a new directory, remember this diradd so that if
8128	 * we rename it we can keep the dot and dotdot dependencies.  If
8129	 * we are adding a new name for an inode that has a mkdiradd, we
8130	 * must be in rename and we have to move the dot and dotdot
8131	 * dependencies to this new name.  The old name is being orphaned
8132	 * soon.
8133	 */
8134	if (mkdir1 != NULL) {
8135		if (inodedep->id_mkdiradd != NULL)
8136			panic("softdep_setup_directory_add: Existing mkdir");
8137		inodedep->id_mkdiradd = dap;
8138	} else if (inodedep->id_mkdiradd)
8139		merge_diradd(inodedep, dap);
8140	if (newdirblk) {
8141		/*
8142		 * There is nothing to do if we are already tracking
8143		 * this block.
8144		 */
8145		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8146			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8147			FREE_LOCK(&lk);
8148			return (0);
8149		}
8150		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8151		    == 0)
8152			panic("softdep_setup_directory_add: lost entry");
8153		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8154		pagedep->pd_state |= NEWBLOCK;
8155		pagedep->pd_newdirblk = newdirblk;
8156		newdirblk->db_pagedep = pagedep;
8157		FREE_LOCK(&lk);
8158		/*
8159		 * If we extended into an indirect block, signal direnter to sync.
8160		 */
8161		if (isindir)
8162			return (1);
8163		return (0);
8164	}
8165	FREE_LOCK(&lk);
8166	return (0);
8167}
8168
8169/*
8170 * This procedure is called to change the offset of a directory
8171 * entry when compacting a directory block which must be owned
8172 * exclusively by the caller. Note that the actual entry movement
8173 * must be done in this procedure to ensure that no I/O completions
8174 * occur while the move is in progress.
8175 */
8176void
8177softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8178	struct buf *bp;		/* Buffer holding directory block. */
8179	struct inode *dp;	/* inode for directory */
8180	caddr_t base;		/* address of dp->i_offset */
8181	caddr_t oldloc;		/* address of old directory location */
8182	caddr_t newloc;		/* address of new directory location */
8183	int entrysize;		/* size of directory entry */
8184{
8185	int offset, oldoffset, newoffset;
8186	struct pagedep *pagedep;
8187	struct jmvref *jmvref;
8188	struct diradd *dap;
8189	struct direct *de;
8190	struct mount *mp;
8191	ufs_lbn_t lbn;
8192	int flags;
8193
8194	mp = UFSTOVFS(dp->i_ump);
8195	de = (struct direct *)oldloc;
8196	jmvref = NULL;
8197	flags = 0;
8198	/*
8199	 * Moves are always journaled as it would be too complex to
8200	 * determine if any affected adds or removes are present in the
8201	 * journal.
8202	 */
8203	if (MOUNTEDSUJ(mp)) {
8204		flags = DEPALLOC;
8205		jmvref = newjmvref(dp, de->d_ino,
8206		    dp->i_offset + (oldloc - base),
8207		    dp->i_offset + (newloc - base));
8208	}
8209	lbn = lblkno(dp->i_fs, dp->i_offset);
8210	offset = blkoff(dp->i_fs, dp->i_offset);
8211	oldoffset = offset + (oldloc - base);
8212	newoffset = offset + (newloc - base);
8213	ACQUIRE_LOCK(&lk);
8214	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8215		goto done;
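	/*
	 * If a diradd is pending for the entry being moved, record its new
	 * offset.  If the diradd is not yet complete it lives on one of the
	 * pd_diraddhd hash chains and may have to move to the chain for the
	 * new offset so later lookups can find it.
	 */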
8216	dap = diradd_lookup(pagedep, oldoffset);
8217	if (dap) {
8218		dap->da_offset = newoffset;
8219		newoffset = DIRADDHASH(newoffset);
8220		oldoffset = DIRADDHASH(oldoffset);
8221		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8222		    newoffset != oldoffset) {
8223			LIST_REMOVE(dap, da_pdlist);
8224			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8225			    dap, da_pdlist);
8226		}
8227	}
8228done:
8229	if (jmvref) {
8230		jmvref->jm_pagedep = pagedep;
8231		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8232		add_to_journal(&jmvref->jm_list);
8233	}
8234	bcopy(oldloc, newloc, entrysize);
8235	FREE_LOCK(&lk);
8236}
8237
8238/*
8239 * Move the mkdir dependencies and journal work from one diradd to another
8240 * when renaming a directory.  The new name must depend on the mkdir deps
8241 * completing as the old name did.  Directories can only have one valid link
8242 * at a time so one must be canonical.
8243 */
8244static void
8245merge_diradd(inodedep, newdap)
8246	struct inodedep *inodedep;
8247	struct diradd *newdap;
8248{
8249	struct diradd *olddap;
8250	struct mkdir *mkdir, *nextmd;
8251	short state;
8252
8253	olddap = inodedep->id_mkdiradd;
8254	inodedep->id_mkdiradd = newdap;
8255	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8256		newdap->da_state &= ~DEPCOMPLETE;
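		/*
		 * Walk the global mkdir list and re-point any mkdir work
		 * items that referenced the old diradd at the new one,
		 * transferring their MKDIR_PARENT/MKDIR_BODY wait bits.
		 */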
8257		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8258			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8259			if (mkdir->md_diradd != olddap)
8260				continue;
8261			mkdir->md_diradd = newdap;
8262			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8263			newdap->da_state |= state;
8264			olddap->da_state &= ~state;
8265			if ((olddap->da_state &
8266			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8267				break;
8268		}
8269		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8270			panic("merge_diradd: unfound ref");
8271	}
8272	/*
8273	 * Any mkdir related journal items are not safe to be freed until
8274	 * the new name is stable.
8275	 */
8276	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8277	olddap->da_state |= DEPCOMPLETE;
8278	complete_diradd(olddap);
8279}
8280
8281/*
8282 * Move the diradd to the pending list when all diradd dependencies are
8283 * complete.
8284 */
8285static void
8286complete_diradd(dap)
8287	struct diradd *dap;
8288{
8289	struct pagedep *pagedep;
8290
8291	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8292		if (dap->da_state & DIRCHG)
8293			pagedep = dap->da_previous->dm_pagedep;
8294		else
8295			pagedep = dap->da_pagedep;
8296		LIST_REMOVE(dap, da_pdlist);
8297		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8298	}
8299}
8300
8301/*
8302 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8303 * add entries and conditionally journal the remove.
8304 */
8305static void
8306cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8307	struct diradd *dap;
8308	struct dirrem *dirrem;
8309	struct jremref *jremref;
8310	struct jremref *dotremref;
8311	struct jremref *dotdotremref;
8312{
8313	struct inodedep *inodedep;
8314	struct jaddref *jaddref;
8315	struct inoref *inoref;
8316	struct mkdir *mkdir;
8317
8318	/*
8319	 * If no remove references were allocated, we're on a non-journaled
8320	 * filesystem and can skip the cancel step.
8321	 */
8322	if (jremref == NULL) {
8323		free_diradd(dap, NULL);
8324		return;
8325	}
8326	/*
8327	 * Cancel the primary name and free it if it does not require
8328	 * journaling.
8329	 */
8330	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8331	    0, &inodedep) != 0) {
8332		/* Abort the addref that references this diradd.  */
8333		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8334			if (inoref->if_list.wk_type != D_JADDREF)
8335				continue;
8336			jaddref = (struct jaddref *)inoref;
8337			if (jaddref->ja_diradd != dap)
8338				continue;
8339			if (cancel_jaddref(jaddref, inodedep,
8340			    &dirrem->dm_jwork) == 0) {
8341				free_jremref(jremref);
8342				jremref = NULL;
8343			}
8344			break;
8345		}
8346	}
8347	/*
8348	 * Cancel subordinate names and free them if they do not require
8349	 * journaling.
8350	 */
8351	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8352		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
8353			if (mkdir->md_diradd != dap)
8354				continue;
8355			if ((jaddref = mkdir->md_jaddref) == NULL)
8356				continue;
8357			mkdir->md_jaddref = NULL;
8358			if (mkdir->md_state & MKDIR_PARENT) {
8359				if (cancel_jaddref(jaddref, NULL,
8360				    &dirrem->dm_jwork) == 0) {
8361					free_jremref(dotdotremref);
8362					dotdotremref = NULL;
8363				}
8364			} else {
8365				if (cancel_jaddref(jaddref, inodedep,
8366				    &dirrem->dm_jwork) == 0) {
8367					free_jremref(dotremref);
8368					dotremref = NULL;
8369				}
8370			}
8371		}
8372	}
8373
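	/*
	 * Any remove references that were not freed above still require
	 * journaling; queue them on the dirrem before the diradd's journal
	 * work is moved over and the diradd itself is freed.
	 */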
8374	if (jremref)
8375		journal_jremref(dirrem, jremref, inodedep);
8376	if (dotremref)
8377		journal_jremref(dirrem, dotremref, inodedep);
8378	if (dotdotremref)
8379		journal_jremref(dirrem, dotdotremref, NULL);
8380	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8381	free_diradd(dap, &dirrem->dm_jwork);
8382}
8383
8384/*
8385 * Free a diradd dependency structure. This routine must be called
8386 * with splbio interrupts blocked.
8387 */
8388static void
8389free_diradd(dap, wkhd)
8390	struct diradd *dap;
8391	struct workhead *wkhd;
8392{
8393	struct dirrem *dirrem;
8394	struct pagedep *pagedep;
8395	struct inodedep *inodedep;
8396	struct mkdir *mkdir, *nextmd;
8397
8398	mtx_assert(&lk, MA_OWNED);
8399	LIST_REMOVE(dap, da_pdlist);
8400	if (dap->da_state & ONWORKLIST)
8401		WORKLIST_REMOVE(&dap->da_list);
8402	if ((dap->da_state & DIRCHG) == 0) {
8403		pagedep = dap->da_pagedep;
8404	} else {
8405		dirrem = dap->da_previous;
8406		pagedep = dirrem->dm_pagedep;
8407		dirrem->dm_dirinum = pagedep->pd_ino;
8408		dirrem->dm_state |= COMPLETE;
8409		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8410			add_to_worklist(&dirrem->dm_list, 0);
8411	}
8412	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8413	    0, &inodedep) != 0)
8414		if (inodedep->id_mkdiradd == dap)
8415			inodedep->id_mkdiradd = NULL;
8416	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8417		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8418			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8419			if (mkdir->md_diradd != dap)
8420				continue;
8421			dap->da_state &=
8422			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8423			LIST_REMOVE(mkdir, md_mkdirs);
8424			if (mkdir->md_state & ONWORKLIST)
8425				WORKLIST_REMOVE(&mkdir->md_list);
8426			if (mkdir->md_jaddref != NULL)
8427				panic("free_diradd: Unexpected jaddref");
8428			WORKITEM_FREE(mkdir, D_MKDIR);
8429			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8430				break;
8431		}
8432		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8433			panic("free_diradd: unfound ref");
8434	}
8435	if (inodedep)
8436		free_inodedep(inodedep);
8437	/*
8438	 * Free any journal segments waiting for the directory write.
8439	 */
8440	handle_jwork(&dap->da_jwork);
8441	WORKITEM_FREE(dap, D_DIRADD);
8442}
8443
8444/*
8445 * Directory entry removal dependencies.
8446 *
8447 * When removing a directory entry, the entry's inode pointer must be
8448 * zero'ed on disk before the corresponding inode's link count is decremented
8449 * (possibly freeing the inode for re-use). This dependency is handled by
8450 * updating the directory entry but delaying the inode count reduction until
8451 * after the directory block has been written to disk. After this point, the
8452 * inode count can be decremented whenever it is convenient.
8453 */
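
/*
 * The deferred link count decrement is carried out by
 * handle_workitem_remove(), normally after the directory block holding
 * the cleared entry has been written (or immediately, if the entry never
 * reached the disk).
 */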
8454
8455/*
8456 * This routine should be called immediately after removing
8457 * a directory entry.  The inode's link count should not be
8458 * decremented by the calling procedure -- the soft updates
8459 * code will do this task when it is safe.
8460 */
8461void
8462softdep_setup_remove(bp, dp, ip, isrmdir)
8463	struct buf *bp;		/* buffer containing directory block */
8464	struct inode *dp;	/* inode for the directory being modified */
8465	struct inode *ip;	/* inode for directory entry being removed */
8466	int isrmdir;		/* indicates if doing RMDIR */
8467{
8468	struct dirrem *dirrem, *prevdirrem;
8469	struct inodedep *inodedep;
8470	int direct;
8471
8472	/*
8473	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8474	 * newdirrem() to set up the full directory remove which requires
8475	 * isrmdir > 1.
8476	 */
8477	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8478	/*
8479	 * Add the dirrem to the inodedep's pending remove list for quick
8480	 * discovery later.
8481	 */
8482	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8483	    &inodedep) == 0)
8484		panic("softdep_setup_remove: Lost inodedep.");
8485	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8486	dirrem->dm_state |= ONDEPLIST;
8487	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8488
8489	/*
8490	 * If the COMPLETE flag is clear, then there were no active
8491	 * entries and we want to roll back to a zeroed entry until
8492	 * the new inode is committed to disk. If the COMPLETE flag is
8493	 * set then we have deleted an entry that never made it to
8494	 * disk. If the entry we deleted resulted from a name change,
8495	 * then the old name still resides on disk. We cannot delete
8496	 * its inode (returned to us in prevdirrem) until the zeroed
8497	 * directory entry gets to disk. The new inode has never been
8498	 * referenced on the disk, so can be deleted immediately.
8499	 */
8500	if ((dirrem->dm_state & COMPLETE) == 0) {
8501		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8502		    dm_next);
8503		FREE_LOCK(&lk);
8504	} else {
8505		if (prevdirrem != NULL)
8506			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8507			    prevdirrem, dm_next);
8508		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8509		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8510		FREE_LOCK(&lk);
8511		if (direct)
8512			handle_workitem_remove(dirrem, 0);
8513	}
8514}
8515
8516/*
8517 * Check for an entry matching 'offset' on both the pd_diraddhd list and the
8518 * pd_pendinghd list of a pagedep.
8519 */
8520static struct diradd *
8521diradd_lookup(pagedep, offset)
8522	struct pagedep *pagedep;
8523	int offset;
8524{
8525	struct diradd *dap;
8526
8527	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8528		if (dap->da_offset == offset)
8529			return (dap);
8530	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8531		if (dap->da_offset == offset)
8532			return (dap);
8533	return (NULL);
8534}
8535
8536/*
8537 * Search for a .. diradd dependency in a directory that is being removed.
8538 * If the directory was renamed to a new parent we have a diradd rather
8539 * than a mkdir for the .. entry.  We need to cancel it now before
8540 * it is found in truncate().
8541 */
8542static struct jremref *
8543cancel_diradd_dotdot(ip, dirrem, jremref)
8544	struct inode *ip;
8545	struct dirrem *dirrem;
8546	struct jremref *jremref;
8547{
8548	struct pagedep *pagedep;
8549	struct diradd *dap;
8550	struct worklist *wk;
8551
8552	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8553	    &pagedep) == 0)
8554		return (jremref);
8555	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8556	if (dap == NULL)
8557		return (jremref);
8558	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8559	/*
8560	 * Mark any journal work as belonging to the parent so it is freed
8561	 * with the .. reference.
8562	 */
8563	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8564		wk->wk_state |= MKDIR_PARENT;
8565	return (NULL);
8566}
8567
8568/*
8569 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
8570 * replace it with a dirrem/diradd pair as a result of re-parenting a
8571 * directory.  This ensures that we don't simultaneously have a mkdir and
8572 * a diradd for the same .. entry.
8573 */
8574static struct jremref *
8575cancel_mkdir_dotdot(ip, dirrem, jremref)
8576	struct inode *ip;
8577	struct dirrem *dirrem;
8578	struct jremref *jremref;
8579{
8580	struct inodedep *inodedep;
8581	struct jaddref *jaddref;
8582	struct mkdir *mkdir;
8583	struct diradd *dap;
8584
8585	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8586	    &inodedep) == 0)
8587		panic("cancel_mkdir_dotdot: Lost inodedep");
8588	dap = inodedep->id_mkdiradd;
8589	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
8590		return (jremref);
8591	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
8592	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
8593		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
8594			break;
8595	if (mkdir == NULL)
8596		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
8597	if ((jaddref = mkdir->md_jaddref) != NULL) {
8598		mkdir->md_jaddref = NULL;
8599		jaddref->ja_state &= ~MKDIR_PARENT;
8600		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
8601		    &inodedep) == 0)
8602			panic("cancel_mkdir_dotdot: Lost parent inodedep");
8603		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
8604			journal_jremref(dirrem, jremref, inodedep);
8605			jremref = NULL;
8606		}
8607	}
8608	if (mkdir->md_state & ONWORKLIST)
8609		WORKLIST_REMOVE(&mkdir->md_list);
8610	mkdir->md_state |= ALLCOMPLETE;
8611	complete_mkdir(mkdir);
8612	return (jremref);
8613}
8614
8615static void
8616journal_jremref(dirrem, jremref, inodedep)
8617	struct dirrem *dirrem;
8618	struct jremref *jremref;
8619	struct inodedep *inodedep;
8620{
8621
8622	if (inodedep == NULL)
8623		if (inodedep_lookup(jremref->jr_list.wk_mp,
8624		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
8625			panic("journal_jremref: Lost inodedep");
8626	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
8627	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
8628	add_to_journal(&jremref->jr_list);
8629}
8630
8631static void
8632dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
8633	struct dirrem *dirrem;
8634	struct jremref *jremref;
8635	struct jremref *dotremref;
8636	struct jremref *dotdotremref;
8637{
8638	struct inodedep *inodedep;
8639
8641	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
8642	    &inodedep) == 0)
8643		panic("dirrem_journal: Lost inodedep");
8644	journal_jremref(dirrem, jremref, inodedep);
8645	if (dotremref)
8646		journal_jremref(dirrem, dotremref, inodedep);
8647	if (dotdotremref)
8648		journal_jremref(dirrem, dotdotremref, NULL);
8649}
8650
8651/*
8652 * Allocate a new dirrem if appropriate and return it along with
8653 * its associated pagedep. Called without a lock, returns with lock.
8654 */
8655static struct dirrem *
8656newdirrem(bp, dp, ip, isrmdir, prevdirremp)
8657	struct buf *bp;		/* buffer containing directory block */
8658	struct inode *dp;	/* inode for the directory being modified */
8659	struct inode *ip;	/* inode for directory entry being removed */
8660	int isrmdir;		/* indicates if doing RMDIR */
8661	struct dirrem **prevdirremp; /* previously referenced inode, if any */
8662{
8663	int offset;
8664	ufs_lbn_t lbn;
8665	struct diradd *dap;
8666	struct dirrem *dirrem;
8667	struct pagedep *pagedep;
8668	struct jremref *jremref;
8669	struct jremref *dotremref;
8670	struct jremref *dotdotremref;
8671	struct vnode *dvp;
8672
8673	/*
8674	 * Whiteouts have no deletion dependencies.
8675	 */
8676	if (ip == NULL)
8677		panic("newdirrem: whiteout");
8678	dvp = ITOV(dp);
8679	/*
8680	 * If we are over our limit, try to improve the situation.
8681	 * Limiting the number of dirrem structures will also limit
8682	 * the number of freefile and freeblks structures.
8683	 */
8684	ACQUIRE_LOCK(&lk);
8685	if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
8686		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
8687	FREE_LOCK(&lk);
8688	dirrem = malloc(sizeof(struct dirrem),
8689		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
8690	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
8691	LIST_INIT(&dirrem->dm_jremrefhd);
8692	LIST_INIT(&dirrem->dm_jwork);
8693	dirrem->dm_state = isrmdir ? RMDIR : 0;
8694	dirrem->dm_oldinum = ip->i_number;
8695	*prevdirremp = NULL;
8696	/*
8697	 * Allocate remove reference structures to track journal write
8698	 * dependencies.  We will always have one for the link and
8699	 * when doing directories we will always have one more for dot.
8700	 * When renaming a directory we skip the dotdot link change so
8701	 * this is not needed.
8702	 */
8703	jremref = dotremref = dotdotremref = NULL;
8704	if (DOINGSUJ(dvp)) {
8705		if (isrmdir) {
8706			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8707			    ip->i_effnlink + 2);
8708			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
8709			    ip->i_effnlink + 1);
8710			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
8711			    dp->i_effnlink + 1);
8712			dotdotremref->jr_state |= MKDIR_PARENT;
8713		} else
8714			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8715			    ip->i_effnlink + 1);
8716	}
8717	ACQUIRE_LOCK(&lk);
8718	lbn = lblkno(dp->i_fs, dp->i_offset);
8719	offset = blkoff(dp->i_fs, dp->i_offset);
8720	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
8721	    &pagedep);
8722	dirrem->dm_pagedep = pagedep;
8723	dirrem->dm_offset = offset;
8724	/*
8725	 * If we're renaming a .. link to a new directory, cancel any
8726	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
8727	 * the jremref is preserved for any potential diradd in this
8728	 * location.  This can not coincide with a rmdir.
8729	 */
8730	if (dp->i_offset == DOTDOT_OFFSET) {
8731		if (isrmdir)
8732			panic("newdirrem: .. directory change during remove?");
8733		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
8734	}
8735	/*
8736	 * If we're removing a directory search for the .. dependency now and
8737	 * cancel it.  Any pending journal work will be added to the dirrem
8738	 * to be completed when the workitem remove completes.
8739	 */
8740	if (isrmdir)
8741		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
8742	/*
8743	 * Check for a diradd dependency for the same directory entry.
8744	 * If present, then both dependencies become obsolete and can
8745	 * be de-allocated.
8746	 */
8747	dap = diradd_lookup(pagedep, offset);
8748	if (dap == NULL) {
8749		/*
8750		 * Link the jremref structures into the dirrem so they are
8751		 * written prior to the pagedep.
8752		 */
8753		if (jremref)
8754			dirrem_journal(dirrem, jremref, dotremref,
8755			    dotdotremref);
8756		return (dirrem);
8757	}
8758	/*
8759	 * Must be ATTACHED at this point.
8760	 */
8761	if ((dap->da_state & ATTACHED) == 0)
8762		panic("newdirrem: not ATTACHED");
8763	if (dap->da_newinum != ip->i_number)
8764		panic("newdirrem: inum %ju should be %ju",
8765		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
8766	/*
8767	 * If we are deleting a changed name that never made it to disk,
8768	 * then return the dirrem describing the previous inode (which
8769	 * represents the inode currently referenced from this entry on disk).
8770	 */
8771	if ((dap->da_state & DIRCHG) != 0) {
8772		*prevdirremp = dap->da_previous;
8773		dap->da_state &= ~DIRCHG;
8774		dap->da_pagedep = pagedep;
8775	}
8776	/*
8777	 * We are deleting an entry that never made it to disk.
8778	 * Mark it COMPLETE so we can delete its inode immediately.
8779	 */
8780	dirrem->dm_state |= COMPLETE;
8781	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
8782#ifdef SUJ_DEBUG
8783	if (isrmdir == 0) {
8784		struct worklist *wk;
8785
8786		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8787			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
8788				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
8789	}
8790#endif
8791
8792	return (dirrem);
8793}
8794
8795/*
8796 * Directory entry change dependencies.
8797 *
8798 * Changing an existing directory entry requires that an add operation
8799 * be completed first followed by a deletion. The semantics for the addition
8800 * are identical to the description of adding a new entry above except
8801 * that the rollback is to the old inode number rather than zero. Once
8802 * the addition dependency is completed, the removal is done as described
8803 * in the removal routine above.
8804 */
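
/*
 * A changed entry is represented by a diradd with DIRCHG set whose
 * da_previous field points at the dirrem for the inode previously named
 * by the entry; initiate_write_filepage() rolls the entry back to that
 * dirrem's dm_oldinum rather than to zero.
 */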
8805
8806/*
8807 * This routine should be called immediately after changing
8808 * a directory entry.  The inode's link count should not be
8809 * decremented by the calling procedure -- the soft updates
8810 * code will perform this task when it is safe.
8811 */
8812void
8813softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
8814	struct buf *bp;		/* buffer containing directory block */
8815	struct inode *dp;	/* inode for the directory being modified */
8816	struct inode *ip;	/* inode for directory entry being removed */
8817	ino_t newinum;		/* new inode number for changed entry */
8818	int isrmdir;		/* indicates if doing RMDIR */
8819{
8820	int offset;
8821	struct diradd *dap = NULL;
8822	struct dirrem *dirrem, *prevdirrem;
8823	struct pagedep *pagedep;
8824	struct inodedep *inodedep;
8825	struct jaddref *jaddref;
8826	struct mount *mp;
8827
8828	offset = blkoff(dp->i_fs, dp->i_offset);
8829	mp = UFSTOVFS(dp->i_ump);
8830
8831	/*
8832	 * Whiteouts do not need diradd dependencies.
8833	 */
8834	if (newinum != WINO) {
8835		dap = malloc(sizeof(struct diradd),
8836		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
8837		workitem_alloc(&dap->da_list, D_DIRADD, mp);
8838		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
8839		dap->da_offset = offset;
8840		dap->da_newinum = newinum;
8841		LIST_INIT(&dap->da_jwork);
8842	}
8843
8844	/*
8845	 * Allocate a new dirrem and ACQUIRE_LOCK.
8846	 */
8847	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8848	pagedep = dirrem->dm_pagedep;
8849	/*
8850	 * The possible values for isrmdir:
8851	 *	0 - non-directory file rename
8852	 *	1 - directory rename within same directory
8853	 *   inum - directory rename to new directory of given inode number
8854	 * When renaming to a new directory, we are both deleting and
8855	 * creating a new directory entry, so the link count on the new
8856	 * directory should not change. Thus we do not need the followup
8857	 * dirrem which is usually done in handle_workitem_remove. We set
8858	 * the DIRCHG flag to tell handle_workitem_remove to skip the
8859	 * followup dirrem.
8860	 */
8861	if (isrmdir > 1)
8862		dirrem->dm_state |= DIRCHG;
8863
8864	/*
8865	 * Whiteouts have no additional dependencies,
8866	 * so just put the dirrem on the correct list.
8867	 */
8868	if (newinum == WINO) {
8869		if ((dirrem->dm_state & COMPLETE) == 0) {
8870			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
8871			    dm_next);
8872		} else {
8873			dirrem->dm_dirinum = pagedep->pd_ino;
8874			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8875				add_to_worklist(&dirrem->dm_list, 0);
8876		}
8877		FREE_LOCK(&lk);
8878		return;
8879	}
8880	/*
8881	 * Add the dirrem to the inodedep's pending remove list for quick
8882	 * discovery later.  A valid nlinkdelta ensures that this lookup
8883	 * will not fail.
8884	 */
8885	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
8886		panic("softdep_setup_directory_change: Lost inodedep.");
8887	dirrem->dm_state |= ONDEPLIST;
8888	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8889
8890	/*
8891	 * If the COMPLETE flag is clear, then there were no active
8892	 * entries and we want to roll back to the previous inode until
8893	 * the new inode is committed to disk. If the COMPLETE flag is
8894	 * set, then we have deleted an entry that never made it to disk.
8895	 * If the entry we deleted resulted from a name change, then the old
8896	 * inode reference still resides on disk. Any rollback that we do
8897	 * needs to be to that old inode (returned to us in prevdirrem). If
8898	 * the entry we deleted resulted from a create, then there is
8899	 * no entry on the disk, so we want to roll back to zero rather
8900	 * than the uncommitted inode. In either of the COMPLETE cases we
8901	 * want to immediately free the unwritten and unreferenced inode.
8902	 */
8903	if ((dirrem->dm_state & COMPLETE) == 0) {
8904		dap->da_previous = dirrem;
8905	} else {
8906		if (prevdirrem != NULL) {
8907			dap->da_previous = prevdirrem;
8908		} else {
8909			dap->da_state &= ~DIRCHG;
8910			dap->da_pagedep = pagedep;
8911		}
8912		dirrem->dm_dirinum = pagedep->pd_ino;
8913		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8914			add_to_worklist(&dirrem->dm_list, 0);
8915	}
8916	/*
8917	 * Lookup the jaddref for this journal entry.  We must finish
8918	 * initializing it and make the diradd write dependent on it.
8919	 * If we're not journaling, put it on the id_bufwait list if the
8920	 * inode is not yet written. If it is written, do the post-inode
8921	 * write processing to put it on the id_pendinghd list.
8922	 */
8923	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8924	if (MOUNTEDSUJ(mp)) {
8925		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8926		    inoreflst);
8927		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8928		    ("softdep_setup_directory_change: bad jaddref %p",
8929		    jaddref));
8930		jaddref->ja_diroff = dp->i_offset;
8931		jaddref->ja_diradd = dap;
8932		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
8933		    dap, da_pdlist);
8934		add_to_journal(&jaddref->ja_list);
8935	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8936		dap->da_state |= COMPLETE;
8937		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8938		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
8939	} else {
8940		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
8941		    dap, da_pdlist);
8942		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8943	}
8944	/*
8945	 * If we're making a new name for a directory that has not been
8946	 * committed, we need to move the dot and dotdot references to
8947	 * this new name.
8948	 */
8949	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
8950		merge_diradd(inodedep, dap);
8951	FREE_LOCK(&lk);
8952}
8953
8954/*
8955 * Called whenever the link count on an inode is changed.
8956 * It creates an inode dependency so that the new reference(s)
8957 * to the inode cannot be committed to disk until the updated
8958 * inode has been written.
8959 */
8960void
8961softdep_change_linkcnt(ip)
8962	struct inode *ip;	/* the inode with the increased link count */
8963{
8964	struct inodedep *inodedep;
8965	int dflags;
8966
8967	ACQUIRE_LOCK(&lk);
8968	dflags = DEPALLOC;
8969	if (IS_SNAPSHOT(ip))
8970		dflags |= NODELAY;
8971	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
8972	if (ip->i_nlink < ip->i_effnlink)
8973		panic("softdep_change_linkcnt: bad delta");
8974	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
8975	FREE_LOCK(&lk);
8976}
8977
8978/*
8979 * Attach a sbdep dependency to the superblock buf so that we can keep
8980 * track of the head of the linked list of referenced but unlinked inodes.
8981 */
8982void
8983softdep_setup_sbupdate(ump, fs, bp)
8984	struct ufsmount *ump;
8985	struct fs *fs;
8986	struct buf *bp;
8987{
8988	struct sbdep *sbdep;
8989	struct worklist *wk;
8990
8991	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0)
8992		return;
8993	LIST_FOREACH(wk, &bp->b_dep, wk_list)
8994		if (wk->wk_type == D_SBDEP)
8995			break;
8996	if (wk != NULL)
8997		return;
8998	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
8999	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9000	sbdep->sb_fs = fs;
9001	sbdep->sb_ump = ump;
9002	ACQUIRE_LOCK(&lk);
9003	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9004	FREE_LOCK(&lk);
9005}
9006
9007/*
9008 * Return the first unlinked inodedep which is ready to be the head of the
9009 * list.  The inodedep and all those after it must have valid next pointers.
9010 */
9011static struct inodedep *
9012first_unlinked_inodedep(ump)
9013	struct ufsmount *ump;
9014{
9015	struct inodedep *inodedep;
9016	struct inodedep *idp;
9017
9018	mtx_assert(&lk, MA_OWNED);
9019	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9020	    inodedep; inodedep = idp) {
9021		if ((inodedep->id_state & UNLINKNEXT) == 0)
9022			return (NULL);
9023		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9024		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9025			break;
9026		if ((inodedep->id_state & UNLINKPREV) == 0)
9027			break;
9028	}
9029	return (inodedep);
9030}
9031
9032/*
9033 * Set the sujfree unlinked head pointer prior to writing a superblock.
9034 */
9035static void
9036initiate_write_sbdep(sbdep)
9037	struct sbdep *sbdep;
9038{
9039	struct inodedep *inodedep;
9040	struct fs *bpfs;
9041	struct fs *fs;
9042
9043	bpfs = sbdep->sb_fs;
9044	fs = sbdep->sb_ump->um_fs;
9045	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9046	if (inodedep) {
9047		fs->fs_sujfree = inodedep->id_ino;
9048		inodedep->id_state |= UNLINKPREV;
9049	} else
9050		fs->fs_sujfree = 0;
9051	bpfs->fs_sujfree = fs->fs_sujfree;
9052}
9053
9054/*
9055 * After a superblock is written determine whether it must be written again
9056 * due to a changing unlinked list head.
9057 */
9058static int
9059handle_written_sbdep(sbdep, bp)
9060	struct sbdep *sbdep;
9061	struct buf *bp;
9062{
9063	struct inodedep *inodedep;
9064	struct mount *mp;
9065	struct fs *fs;
9066
9067	mtx_assert(&lk, MA_OWNED);
9068	fs = sbdep->sb_fs;
9069	mp = UFSTOVFS(sbdep->sb_ump);
9070	/*
9071	 * If the superblock doesn't match the in-memory list, start over.
9072	 */
9073	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9074	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9075	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9076		bdirty(bp);
9077		return (1);
9078	}
9079	WORKITEM_FREE(sbdep, D_SBDEP);
9080	if (fs->fs_sujfree == 0)
9081		return (0);
9082	/*
9083	 * Now that we have a record of this inode in stable store, allow it
9084	 * to be written to free up pending work.  Inodes may see a lot of
9085	 * write activity after they are unlinked, which we must not hold up.
9086	 */
9087	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9088		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9089			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9090			    inodedep, inodedep->id_state);
9091		if (inodedep->id_state & UNLINKONLIST)
9092			break;
9093		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9094	}
9095
9096	return (0);
9097}
9098
9099/*
9100 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9101 */
9102static void
9103unlinked_inodedep(mp, inodedep)
9104	struct mount *mp;
9105	struct inodedep *inodedep;
9106{
9107	struct ufsmount *ump;
9108
9109	mtx_assert(&lk, MA_OWNED);
9110	if (MOUNTEDSUJ(mp) == 0)
9111		return;
9112	ump = VFSTOUFS(mp);
9113	ump->um_fs->fs_fmod = 1;
9114	if (inodedep->id_state & UNLINKED)
9115		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9116	inodedep->id_state |= UNLINKED;
9117	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9118}
9119
9120/*
9121 * Remove an inodedep from the unlinked inodedep list.  This may require
9122 * disk writes if the inode has made it that far.
9123 */
9124static void
9125clear_unlinked_inodedep(inodedep)
9126	struct inodedep *inodedep;
9127{
9128	struct ufsmount *ump;
9129	struct inodedep *idp;
9130	struct inodedep *idn;
9131	struct fs *fs;
9132	struct buf *bp;
9133	ino_t ino;
9134	ino_t nino;
9135	ino_t pino;
9136	int error;
9137
9138	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9139	fs = ump->um_fs;
9140	ino = inodedep->id_ino;
9141	error = 0;
9142	for (;;) {
9143		mtx_assert(&lk, MA_OWNED);
9144		KASSERT((inodedep->id_state & UNLINKED) != 0,
9145		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9146		    inodedep));
9147		/*
9148		 * If nothing has yet been written, simply remove us from
9149		 * the in-memory list and return.  This is the most common
9150		 * case where handle_workitem_remove() loses the final
9151		 * reference.
9152		 */
9153		if ((inodedep->id_state & UNLINKLINKS) == 0)
9154			break;
9155		/*
9156		 * If we have a NEXT pointer and no PREV pointer we can simply
9157		 * clear NEXT's PREV and remove ourselves from the list.  Be
9158		 * careful not to clear PREV if the superblock points at
9159		 * next as well.
9160		 */
9161		idn = TAILQ_NEXT(inodedep, id_unlinked);
9162		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9163			if (idn && fs->fs_sujfree != idn->id_ino)
9164				idn->id_state &= ~UNLINKPREV;
9165			break;
9166		}
9167		/*
9168		 * Here we have an inodedep which is actually linked into
9169		 * the list.  We must remove it by forcing a write to the
9170		 * link before us, whether it be the superblock or an inode.
9171		 * Unfortunately the list may change while we're waiting
9172		 * on the buf lock for either resource so we must loop until
9173		 * we lock the right one.  If both the superblock and an
9174		 * inode point to this inode we must clear the inode first
9175		 * followed by the superblock.
9176		 */
9177		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9178		pino = 0;
9179		if (idp && (idp->id_state & UNLINKNEXT))
9180			pino = idp->id_ino;
9181		FREE_LOCK(&lk);
9182		if (pino == 0)
9183			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9184			    (int)fs->fs_sbsize, 0, 0, 0);
9185		else
9186			error = bread(ump->um_devvp,
9187			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9188			    (int)fs->fs_bsize, NOCRED, &bp);
9189		ACQUIRE_LOCK(&lk);
9190		if (error)
9191			break;
9192		/* If the list has changed restart the loop. */
9193		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9194		nino = 0;
9195		if (idp && (idp->id_state & UNLINKNEXT))
9196			nino = idp->id_ino;
9197		if (nino != pino ||
9198		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9199			FREE_LOCK(&lk);
9200			brelse(bp);
9201			ACQUIRE_LOCK(&lk);
9202			continue;
9203		}
9204		nino = 0;
9205		idn = TAILQ_NEXT(inodedep, id_unlinked);
9206		if (idn)
9207			nino = idn->id_ino;
9208		/*
9209		 * Remove us from the in-memory list.  After this we cannot
9210		 * access the inodedep.
9211		 */
9212		KASSERT((inodedep->id_state & UNLINKED) != 0,
9213		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9214		    inodedep));
9215		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9216		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9217		FREE_LOCK(&lk);
9218		/*
9219		 * The predecessor's next pointer is manually updated here
9220		 * so that the NEXT flag is never cleared for an element
9221		 * that is in the list.
9222		 */
9223		if (pino == 0) {
9224			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9225			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9226			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9227			    bp);
9228		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9229			((struct ufs1_dinode *)bp->b_data +
9230			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9231		else
9232			((struct ufs2_dinode *)bp->b_data +
9233			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9234		/*
9235		 * If the bwrite fails we have no recourse to recover.  The
9236		 * filesystem is corrupted already.
9237		 */
9238		bwrite(bp);
9239		ACQUIRE_LOCK(&lk);
9240		/*
9241		 * If the superblock pointer still needs to be cleared, force
9242		 * a write here.
9243		 */
9244		if (fs->fs_sujfree == ino) {
9245			FREE_LOCK(&lk);
9246			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9247			    (int)fs->fs_sbsize, 0, 0, 0);
9248			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9249			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9250			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9251			    bp);
9252			bwrite(bp);
9253			ACQUIRE_LOCK(&lk);
9254		}
9255
9256		if (fs->fs_sujfree != ino)
9257			return;
9258		panic("clear_unlinked_inodedep: Failed to clear free head");
9259	}
9260	if (inodedep->id_ino == fs->fs_sujfree)
9261		panic("clear_unlinked_inodedep: Freeing head of free list");
9262	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9263	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9264	return;
9265}
9266
9267/*
9268 * This workitem decrements the inode's link count.
9269 * If the link count reaches zero, the file is removed.
9270 */
9271static int
9272handle_workitem_remove(dirrem, flags)
9273	struct dirrem *dirrem;
9274	int flags;
9275{
9276	struct inodedep *inodedep;
9277	struct workhead dotdotwk;
9278	struct worklist *wk;
9279	struct ufsmount *ump;
9280	struct mount *mp;
9281	struct vnode *vp;
9282	struct inode *ip;
9283	ino_t oldinum;
9284
9285	if (dirrem->dm_state & ONWORKLIST)
9286		panic("handle_workitem_remove: dirrem %p still on worklist",
9287		    dirrem);
9288	oldinum = dirrem->dm_oldinum;
9289	mp = dirrem->dm_list.wk_mp;
9290	ump = VFSTOUFS(mp);
9291	flags |= LK_EXCLUSIVE;
9292	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9293		return (EBUSY);
9294	ip = VTOI(vp);
9295	ACQUIRE_LOCK(&lk);
9296	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9297		panic("handle_workitem_remove: lost inodedep");
9298	if (dirrem->dm_state & ONDEPLIST)
9299		LIST_REMOVE(dirrem, dm_inonext);
9300	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9301	    ("handle_workitem_remove:  Journal entries not written."));
9302
9303	/*
9304	 * Move all dependencies waiting on the remove to complete
9305	 * from the dirrem to the inode inowait list to be completed
9306	 * after the inode has been updated and written to disk.  Any
9307	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9308	 * is removed.
9309	 */
9310	LIST_INIT(&dotdotwk);
9311	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9312		WORKLIST_REMOVE(wk);
9313		if (wk->wk_state & MKDIR_PARENT) {
9314			wk->wk_state &= ~MKDIR_PARENT;
9315			WORKLIST_INSERT(&dotdotwk, wk);
9316			continue;
9317		}
9318		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9319	}
9320	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
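	/*
	 * For a directory removal the dirrem is reused below for the ".."
	 * reference, and the saved MKDIR_PARENT items travel with it.
	 */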
9321	/*
9322	 * Normal file deletion.
9323	 */
9324	if ((dirrem->dm_state & RMDIR) == 0) {
9325		ip->i_nlink--;
9326		DIP_SET(ip, i_nlink, ip->i_nlink);
9327		ip->i_flag |= IN_CHANGE;
9328		if (ip->i_nlink < ip->i_effnlink)
9329			panic("handle_workitem_remove: bad file delta");
9330		if (ip->i_nlink == 0)
9331			unlinked_inodedep(mp, inodedep);
9332		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9333		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9334		    ("handle_workitem_remove: worklist not empty. %s",
9335		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9336		WORKITEM_FREE(dirrem, D_DIRREM);
9337		FREE_LOCK(&lk);
9338		goto out;
9339	}
9340	/*
9341	 * Directory deletion. Decrement reference count for both the
9342	 * just deleted parent directory entry and the reference for ".".
9343	 * Arrange to have the reference count on the parent decremented
9344	 * to account for the loss of "..".
9345	 */
9346	ip->i_nlink -= 2;
9347	DIP_SET(ip, i_nlink, ip->i_nlink);
9348	ip->i_flag |= IN_CHANGE;
9349	if (ip->i_nlink < ip->i_effnlink)
9350		panic("handle_workitem_remove: bad dir delta");
9351	if (ip->i_nlink == 0)
9352		unlinked_inodedep(mp, inodedep);
9353	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9354	/*
9355	 * Rename a directory to a new parent. Since we are both deleting
9356	 * and creating a new directory entry, the link count on the new
9357	 * directory should not change. Thus we skip the followup dirrem.
9358	 */
9359	if (dirrem->dm_state & DIRCHG) {
9360		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9361		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9362		WORKITEM_FREE(dirrem, D_DIRREM);
9363		FREE_LOCK(&lk);
9364		goto out;
9365	}
9366	dirrem->dm_state = ONDEPLIST;
9367	dirrem->dm_oldinum = dirrem->dm_dirinum;
9368	/*
9369	 * Place the dirrem on the parent's dirremhd list.
9370	 */
9371	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9372		panic("handle_workitem_remove: lost dir inodedep");
9373	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9374	/*
9375	 * If the allocated inode has never been written to disk, then
9376	 * the on-disk inode is zero'ed and we can remove the file
9377	 * immediately.  When journaling if the inode has been marked
9378	 * unlinked and not DEPCOMPLETE we know it can never be written.
9379	 */
9380	inodedep_lookup(mp, oldinum, 0, &inodedep);
9381	if (inodedep == NULL ||
9382	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9383	    check_inode_unwritten(inodedep)) {
9384		FREE_LOCK(&lk);
9385		vput(vp);
9386		return handle_workitem_remove(dirrem, flags);
9387	}
9388	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9389	FREE_LOCK(&lk);
9390	ip->i_flag |= IN_CHANGE;
9391out:
9392	ffs_update(vp, 0);
9393	vput(vp);
9394	return (0);
9395}
9396
9397/*
9398 * Inode de-allocation dependencies.
9399 *
9400 * When an inode's link count is reduced to zero, it can be de-allocated. We
9401 * found it convenient to postpone de-allocation until after the inode is
9402 * written to disk with its new link count (zero).  At this point, all of the
9403 * on-disk inode's block pointers are nullified and, with careful dependency
9404 * list ordering, all dependencies related to the inode will be satisfied and
9405 * the corresponding dependency structures de-allocated.  So, if/when the
9406 * inode is reused, there will be no mixing of old dependencies with new
9407 * ones.  This artificial dependency is set up by the block de-allocation
9408 * procedure above (softdep_setup_freeblocks) and completed by the
9409 * following procedure.
9410 */
9411static void
9412handle_workitem_freefile(freefile)
9413	struct freefile *freefile;
9414{
9415	struct workhead wkhd;
9416	struct fs *fs;
9417	struct inodedep *idp;
9418	struct ufsmount *ump;
9419	int error;
9420
9421	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9422	fs = ump->um_fs;
9423#ifdef DEBUG
9424	ACQUIRE_LOCK(&lk);
9425	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9426	FREE_LOCK(&lk);
9427	if (error)
9428		panic("handle_workitem_freefile: inodedep %p survived", idp);
9429#endif
9430	UFS_LOCK(ump);
9431	fs->fs_pendinginodes -= 1;
9432	UFS_UNLOCK(ump);
9433	LIST_INIT(&wkhd);
9434	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9435	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9436	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9437		softdep_error("handle_workitem_freefile", error);
9438	ACQUIRE_LOCK(&lk);
9439	WORKITEM_FREE(freefile, D_FREEFILE);
9440	FREE_LOCK(&lk);
9441}
9442
9443
9444/*
9445 * Helper function which unlinks marker element from work list and returns
9446 * the next element on the list.
9447 */
9448static __inline struct worklist *
9449markernext(struct worklist *marker)
9450{
9451	struct worklist *next;
9452
9453	next = LIST_NEXT(marker, wk_list);
9454	LIST_REMOVE(marker, wk_list);
9455	return (next);
9456}
9457
9458/*
9459 * Disk writes.
9460 *
9461 * The dependency structures constructed above are most actively used when file
9462 * system blocks are written to disk.  No constraints are placed on when a
9463 * block can be written, but unsatisfied update dependencies are made safe by
9464 * modifying (or replacing) the source memory for the duration of the disk
9465 * write.  When the disk write completes, the memory block is again brought
9466 * up-to-date.
9467 *
9468 * In-core inode structure reclamation.
9469 *
9470 * Because there are a finite number of "in-core" inode structures, they are
9471 * reused regularly.  By transferring all inode-related dependencies to the
9472 * in-memory inode block and indexing them separately (via "inodedep"s), we
9473 * can allow "in-core" inode structures to be reused at any time and avoid
9474 * any increase in contention.
9475 *
9476 * Called just before entering the device driver to initiate a new disk I/O.
9477 * The buffer must be locked, thus, no I/O completion operations can occur
9478 * while we are manipulating its associated dependencies.
9479 */
9480static void
9481softdep_disk_io_initiation(bp)
9482	struct buf *bp;		/* structure describing disk write to occur */
9483{
9484	struct worklist *wk;
9485	struct worklist marker;
9486	struct inodedep *inodedep;
9487	struct freeblks *freeblks;
9488	struct jblkdep *jblkdep;
9489	struct newblk *newblk;
9490
9491	/*
9492	 * We only care about write operations. There should never
9493	 * be dependencies for reads.
9494	 */
9495	if (bp->b_iocmd != BIO_WRITE)
9496		panic("softdep_disk_io_initiation: not write");
9497
9498	if (bp->b_vflags & BV_BKGRDINPROG)
9499		panic("softdep_disk_io_initiation: Writing buffer with "
9500		    "background write in progress: %p", bp);
9501
9502	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9503	PHOLD(curproc);			/* Don't swap out kernel stack */
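	/*
	 * The marker is threaded through the buffer's dependency list so
	 * that the scan below can resume safely even when a handler drops
	 * lk and the list changes while an item is being processed.
	 */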
9504
9505	ACQUIRE_LOCK(&lk);
9506	/*
9507	 * Do any necessary pre-I/O processing.
9508	 */
9509	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9510	     wk = markernext(&marker)) {
9511		LIST_INSERT_AFTER(wk, &marker, wk_list);
9512		switch (wk->wk_type) {
9513
9514		case D_PAGEDEP:
9515			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9516			continue;
9517
9518		case D_INODEDEP:
9519			inodedep = WK_INODEDEP(wk);
9520			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9521				initiate_write_inodeblock_ufs1(inodedep, bp);
9522			else
9523				initiate_write_inodeblock_ufs2(inodedep, bp);
9524			continue;
9525
9526		case D_INDIRDEP:
9527			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9528			continue;
9529
9530		case D_BMSAFEMAP:
9531			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9532			continue;
9533
9534		case D_JSEG:
9535			WK_JSEG(wk)->js_buf = NULL;
9536			continue;
9537
9538		case D_FREEBLKS:
9539			freeblks = WK_FREEBLKS(wk);
9540			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9541			/*
9542			 * We have to wait for the freeblks to be journaled
9543			 * before we can write an inodeblock with updated
9544			 * pointers.  Be careful to arrange the marker so
9545			 * we revisit the freeblks if it's not removed by
9546			 * the first jwait().
9547			 */
9548			if (jblkdep != NULL) {
9549				LIST_REMOVE(&marker, wk_list);
9550				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9551				jwait(&jblkdep->jb_list, MNT_WAIT);
9552			}
9553			continue;
9554		case D_ALLOCDIRECT:
9555		case D_ALLOCINDIR:
9556			/*
9557			 * We have to wait for the jnewblk to be journaled
9558			 * before we can write to a block if the contents
9559			 * may be confused with an earlier file's indirect
9560			 * at recovery time.  Handle the marker as described
9561			 * above.
9562			 */
9563			newblk = WK_NEWBLK(wk);
9564			if (newblk->nb_jnewblk != NULL &&
9565			    indirblk_lookup(newblk->nb_list.wk_mp,
9566			    newblk->nb_newblkno)) {
9567				LIST_REMOVE(&marker, wk_list);
9568				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9569				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
9570			}
9571			continue;
9572
9573		case D_SBDEP:
9574			initiate_write_sbdep(WK_SBDEP(wk));
9575			continue;
9576
9577		case D_MKDIR:
9578		case D_FREEWORK:
9579		case D_FREEDEP:
9580		case D_JSEGDEP:
9581			continue;
9582
9583		default:
9584			panic("handle_disk_io_initiation: Unexpected type %s",
9585			    TYPENAME(wk->wk_type));
9586			/* NOTREACHED */
9587		}
9588	}
9589	FREE_LOCK(&lk);
9590	PRELE(curproc);			/* Allow swapout of kernel stack */
9591}
9592
9593/*
9594 * Called from within the procedure above to deal with unsatisfied
9595 * allocation dependencies in a directory. The buffer must be locked,
9596 * thus, no I/O completion operations can occur while we are
9597 * manipulating its associated dependencies.
9598 */
9599static void
9600initiate_write_filepage(pagedep, bp)
9601	struct pagedep *pagedep;
9602	struct buf *bp;
9603{
9604	struct jremref *jremref;
9605	struct jmvref *jmvref;
9606	struct dirrem *dirrem;
9607	struct diradd *dap;
9608	struct direct *ep;
9609	int i;
9610
9611	if (pagedep->pd_state & IOSTARTED) {
9612		/*
9613		 * This can only happen if there is a driver that does not
9614		 * understand chaining. Here biodone will reissue the call
9615		 * to strategy for the incomplete buffers.
9616		 */
9617		printf("initiate_write_filepage: already started\n");
9618		return;
9619	}
9620	pagedep->pd_state |= IOSTARTED;
9621	/*
9622	 * Wait for all journal remove dependencies to hit the disk.
9623	 * We cannot allow any potentially conflicting directory adds
9624	 * to be visible before removes, and rollback is too difficult.
9625	 * lk may be dropped and re-acquired; however, we hold the buf
9626	 * locked so the dependency cannot go away.
9627	 */
9628	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
9629		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
9630			jwait(&jremref->jr_list, MNT_WAIT);
9631	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
9632		jwait(&jmvref->jm_list, MNT_WAIT);
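	/*
	 * Roll back each uncommitted add: clear the entry's inode number
	 * (or restore the previous number for a DIRCHG entry) and mark the
	 * diradd UNDONE so the I/O completion processing can redo it.
	 */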
9633	for (i = 0; i < DAHASHSZ; i++) {
9634		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
9635			ep = (struct direct *)
9636			    ((char *)bp->b_data + dap->da_offset);
9637			if (ep->d_ino != dap->da_newinum)
9638				panic("%s: dir inum %ju != new %ju",
9639				    "initiate_write_filepage",
9640				    (uintmax_t)ep->d_ino,
9641				    (uintmax_t)dap->da_newinum);
9642			if (dap->da_state & DIRCHG)
9643				ep->d_ino = dap->da_previous->dm_oldinum;
9644			else
9645				ep->d_ino = 0;
9646			dap->da_state &= ~ATTACHED;
9647			dap->da_state |= UNDONE;
9648		}
9649	}
9650}
9651
9652/*
9653 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
9654 * Note that any bug fixes made to this routine must be done in the
9655 * version found below.
9656 *
9657 * Called from within the procedure above to deal with unsatisfied
9658 * allocation dependencies in an inodeblock. The buffer must be
9659 * locked, thus, no I/O completion operations can occur while we
9660 * are manipulating its associated dependencies.
9661 */
9662static void
9663initiate_write_inodeblock_ufs1(inodedep, bp)
9664	struct inodedep *inodedep;
9665	struct buf *bp;			/* The inode block */
9666{
9667	struct allocdirect *adp, *lastadp;
9668	struct ufs1_dinode *dp;
9669	struct ufs1_dinode *sip;
9670	struct inoref *inoref;
9671	struct fs *fs;
9672	ufs_lbn_t i;
9673#ifdef INVARIANTS
9674	ufs_lbn_t prevlbn = 0;
9675#endif
9676	int deplist;
9677
9678	if (inodedep->id_state & IOSTARTED)
9679		panic("initiate_write_inodeblock_ufs1: already started");
9680	inodedep->id_state |= IOSTARTED;
9681	fs = inodedep->id_fs;
9682	dp = (struct ufs1_dinode *)bp->b_data +
9683	    ino_to_fsbo(fs, inodedep->id_ino);
9684
9685	/*
9686	 * If we're on the unlinked list but have not yet written our
9687	 * next pointer, initialize it here.
9688	 */
9689	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9690		struct inodedep *inon;
9691
9692		inon = TAILQ_NEXT(inodedep, id_unlinked);
9693		dp->di_freelink = inon ? inon->id_ino : 0;
9694	}
9695	/*
9696	 * If the bitmap is not yet written, then the allocated
9697	 * inode cannot be written to disk.
9698	 */
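	/*
	 * Stash the real dinode in id_savedino1 and write out a zeroed
	 * copy, preserving only di_gen and di_freelink, so the inode does
	 * not appear allocated on disk before its bitmap block does.
	 */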
9699	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9700		if (inodedep->id_savedino1 != NULL)
9701			panic("initiate_write_inodeblock_ufs1: I/O underway");
9702		FREE_LOCK(&lk);
9703		sip = malloc(sizeof(struct ufs1_dinode),
9704		    M_SAVEDINO, M_SOFTDEP_FLAGS);
9705		ACQUIRE_LOCK(&lk);
9706		inodedep->id_savedino1 = sip;
9707		*inodedep->id_savedino1 = *dp;
9708		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
9709		dp->di_gen = inodedep->id_savedino1->di_gen;
9710		dp->di_freelink = inodedep->id_savedino1->di_freelink;
9711		return;
9712	}
9713	/*
9714	 * If no dependencies, then there is nothing to roll back.
9715	 */
9716	inodedep->id_savedsize = dp->di_size;
9717	inodedep->id_savedextsize = 0;
9718	inodedep->id_savednlink = dp->di_nlink;
9719	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9720	    TAILQ_EMPTY(&inodedep->id_inoreflst))
9721		return;
9722	/*
9723	 * Revert the link count to that of the first unwritten journal entry.
9724	 */
9725	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9726	if (inoref)
9727		dp->di_nlink = inoref->if_nlink;
9728	/*
9729	 * Set the dependencies to busy.
9730	 */
9731	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9732	     adp = TAILQ_NEXT(adp, ad_next)) {
9733#ifdef INVARIANTS
9734		if (deplist != 0 && prevlbn >= adp->ad_offset)
9735			panic("softdep_write_inodeblock: lbn order");
9736		prevlbn = adp->ad_offset;
9737		if (adp->ad_offset < NDADDR &&
9738		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
9739			panic("%s: direct pointer #%jd mismatch %d != %jd",
9740			    "softdep_write_inodeblock",
9741			    (intmax_t)adp->ad_offset,
9742			    dp->di_db[adp->ad_offset],
9743			    (intmax_t)adp->ad_newblkno);
9744		if (adp->ad_offset >= NDADDR &&
9745		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
9746			panic("%s: indirect pointer #%jd mismatch %d != %jd",
9747			    "softdep_write_inodeblock",
9748			    (intmax_t)adp->ad_offset - NDADDR,
9749			    dp->di_ib[adp->ad_offset - NDADDR],
9750			    (intmax_t)adp->ad_newblkno);
9751		deplist |= 1 << adp->ad_offset;
9752		if ((adp->ad_state & ATTACHED) == 0)
9753			panic("softdep_write_inodeblock: Unknown state 0x%x",
9754			    adp->ad_state);
9755#endif /* INVARIANTS */
9756		adp->ad_state &= ~ATTACHED;
9757		adp->ad_state |= UNDONE;
9758	}
9759	/*
9760	 * The on-disk inode cannot claim to be any larger than the last
9761	 * fragment that has been written. Otherwise, the on-disk inode
9762	 * might have fragments that were not the last block in the file
9763	 * which would corrupt the filesystem.
9764	 */
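	/*
	 * For example (illustrative numbers only): with an 8K fs_bsize,
	 * if the allocdirect at direct offset 2 rolls back to a 2K old
	 * fragment, di_size is clamped to 2 * 8192 + 2048 = 18432 and
	 * every later direct and indirect block pointer is zeroed by
	 * the loop below.
	 */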
9765	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9766	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9767		if (adp->ad_offset >= NDADDR)
9768			break;
9769		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
9770		/* keep going until hitting a rollback to a frag */
9771		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9772			continue;
9773		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
9774		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
9775#ifdef INVARIANTS
9776			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
9777				panic("softdep_write_inodeblock: lost dep1");
9778#endif /* INVARIANTS */
9779			dp->di_db[i] = 0;
9780		}
9781		for (i = 0; i < NIADDR; i++) {
9782#ifdef INVARIANTS
9783			if (dp->di_ib[i] != 0 &&
9784			    (deplist & ((1 << NDADDR) << i)) == 0)
9785				panic("softdep_write_inodeblock: lost dep2");
9786#endif /* INVARIANTS */
9787			dp->di_ib[i] = 0;
9788		}
9789		return;
9790	}
9791	/*
9792	 * If we have zero'ed out the last allocated block of the file,
9793	 * roll back the size to the last currently allocated block.
9794	 * We know that this last allocated block is full-sized as
9795	 * we already checked for fragments in the loop above.
9796	 */
9797	if (lastadp != NULL &&
9798	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
9799		for (i = lastadp->ad_offset; i >= 0; i--)
9800			if (dp->di_db[i] != 0)
9801				break;
9802		dp->di_size = (i + 1) * fs->fs_bsize;
9803	}
9804	/*
9805	 * The only dependencies are for indirect blocks.
9806	 *
9807	 * The file size for indirect block additions is not guaranteed.
9808	 * Such a guarantee would be non-trivial to achieve. The conventional
9809	 * synchronous write implementation also does not make this guarantee.
9810	 * Fsck should catch and fix discrepancies. Arguably, the file size
9811	 * can be over-estimated without destroying integrity when the file
9812	 * moves into the indirect blocks (i.e., is large). If we want to
9813	 * postpone fsck, we are stuck with this argument.
9814	 */
9815	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
9816		dp->di_ib[adp->ad_offset - NDADDR] = 0;
9817}
9818
9819/*
9820 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
9821 * Note that any bug fixes made to this routine must be done in the
9822 * version found above.
9823 *
9824 * Called from within the procedure above to deal with unsatisfied
9825 * allocation dependencies in an inodeblock. The buffer must be
9826 * locked, thus, no I/O completion operations can occur while we
9827 * are manipulating its associated dependencies.
9828 */
9829static void
9830initiate_write_inodeblock_ufs2(inodedep, bp)
9831	struct inodedep *inodedep;
9832	struct buf *bp;			/* The inode block */
9833{
9834	struct allocdirect *adp, *lastadp;
9835	struct ufs2_dinode *dp;
9836	struct ufs2_dinode *sip;
9837	struct inoref *inoref;
9838	struct fs *fs;
9839	ufs_lbn_t i;
9840#ifdef INVARIANTS
9841	ufs_lbn_t prevlbn = 0;
9842#endif
9843	int deplist;
9844
9845	if (inodedep->id_state & IOSTARTED)
9846		panic("initiate_write_inodeblock_ufs2: already started");
9847	inodedep->id_state |= IOSTARTED;
9848	fs = inodedep->id_fs;
9849	dp = (struct ufs2_dinode *)bp->b_data +
9850	    ino_to_fsbo(fs, inodedep->id_ino);
9851
9852	/*
9853	 * If we're on the unlinked list but have not yet written our
9854	 * next pointer, initialize it here.
9855	 */
9856	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9857		struct inodedep *inon;
9858
9859		inon = TAILQ_NEXT(inodedep, id_unlinked);
9860		dp->di_freelink = inon ? inon->id_ino : 0;
9861	}
9862	/*
9863	 * If the bitmap is not yet written, then the allocated
9864	 * inode cannot be written to disk.
9865	 */
9866	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9867		if (inodedep->id_savedino2 != NULL)
9868			panic("initiate_write_inodeblock_ufs2: I/O underway");
9869		FREE_LOCK(&lk);
9870		sip = malloc(sizeof(struct ufs2_dinode),
9871		    M_SAVEDINO, M_SOFTDEP_FLAGS);
9872		ACQUIRE_LOCK(&lk);
9873		inodedep->id_savedino2 = sip;
9874		*inodedep->id_savedino2 = *dp;
9875		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
9876		dp->di_gen = inodedep->id_savedino2->di_gen;
9877		dp->di_freelink = inodedep->id_savedino2->di_freelink;
9878		return;
9879	}
9880	/*
9881	 * If no dependencies, then there is nothing to roll back.
9882	 */
9883	inodedep->id_savedsize = dp->di_size;
9884	inodedep->id_savedextsize = dp->di_extsize;
9885	inodedep->id_savednlink = dp->di_nlink;
9886	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9887	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
9888	    TAILQ_EMPTY(&inodedep->id_inoreflst))
9889		return;
9890	/*
9891	 * Revert the link count to that of the first unwritten journal entry.
9892	 */
9893	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9894	if (inoref)
9895		dp->di_nlink = inoref->if_nlink;
9896
9897	/*
9898	 * Set the ext data dependencies to busy.
9899	 */
9900	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
9901	     adp = TAILQ_NEXT(adp, ad_next)) {
9902#ifdef INVARIANTS
9903		if (deplist != 0 && prevlbn >= adp->ad_offset)
9904			panic("softdep_write_inodeblock: lbn order");
9905		prevlbn = adp->ad_offset;
9906		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
9907			panic("%s: direct pointer #%jd mismatch %jd != %jd",
9908			    "softdep_write_inodeblock",
9909			    (intmax_t)adp->ad_offset,
9910			    (intmax_t)dp->di_extb[adp->ad_offset],
9911			    (intmax_t)adp->ad_newblkno);
9912		deplist |= 1 << adp->ad_offset;
9913		if ((adp->ad_state & ATTACHED) == 0)
9914			panic("softdep_write_inodeblock: Unknown state 0x%x",
9915			    adp->ad_state);
9916#endif /* INVARIANTS */
9917		adp->ad_state &= ~ATTACHED;
9918		adp->ad_state |= UNDONE;
9919	}
9920	/*
9921	 * The on-disk inode cannot claim to be any larger than the last
9922	 * fragment that has been written. Otherwise, the on-disk inode
9923	 * might have fragments that were not the last block in the ext
9924	 * data which would corrupt the filesystem.
9925	 */
9926	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
9927	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9928		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
9929		/* keep going until hitting a rollback to a frag */
9930		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9931			continue;
9932		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
9933		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
9934#ifdef INVARIANTS
9935			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
9936				panic("softdep_write_inodeblock: lost dep1");
9937#endif /* INVARIANTS */
9938			dp->di_extb[i] = 0;
9939		}
9940		lastadp = NULL;
9941		break;
9942	}
9943	/*
9944	 * If we have zero'ed out the last allocated block of the ext
9945	 * data, roll back the size to the last currently allocated block.
9946	 * We know that this last allocated block is full-sized as
9947	 * we already checked for fragments in the loop above.
9948	 */
9949	if (lastadp != NULL &&
9950	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
9951		for (i = lastadp->ad_offset; i >= 0; i--)
9952			if (dp->di_extb[i] != 0)
9953				break;
9954		dp->di_extsize = (i + 1) * fs->fs_bsize;
9955	}
9956	/*
9957	 * Set the file data dependencies to busy.
9958	 */
9959	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9960	     adp = TAILQ_NEXT(adp, ad_next)) {
9961#ifdef INVARIANTS
9962		if (deplist != 0 && prevlbn >= adp->ad_offset)
9963			panic("softdep_write_inodeblock: lbn order");
9964		if ((adp->ad_state & ATTACHED) == 0)
9965			panic("inodedep %p and adp %p not attached", inodedep, adp);
9966		prevlbn = adp->ad_offset;
9967		if (adp->ad_offset < NDADDR &&
9968		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
9969			panic("%s: direct pointer #%jd mismatch %jd != %jd",
9970			    "softdep_write_inodeblock",
9971			    (intmax_t)adp->ad_offset,
9972			    (intmax_t)dp->di_db[adp->ad_offset],
9973			    (intmax_t)adp->ad_newblkno);
9974		if (adp->ad_offset >= NDADDR &&
9975		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
9976			panic("%s: indirect pointer #%jd mismatch %jd != %jd",
9977			    "softdep_write_inodeblock",
9978			    (intmax_t)adp->ad_offset - NDADDR,
9979			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
9980			    (intmax_t)adp->ad_newblkno);
9981		deplist |= 1 << adp->ad_offset;
9982		if ((adp->ad_state & ATTACHED) == 0)
9983			panic("softdep_write_inodeblock: Unknown state 0x%x",
9984			    adp->ad_state);
9985#endif /* INVARIANTS */
9986		adp->ad_state &= ~ATTACHED;
9987		adp->ad_state |= UNDONE;
9988	}
9989	/*
9990	 * The on-disk inode cannot claim to be any larger than the last
9991	 * fragment that has been written. Otherwise, the on-disk inode
9992	 * might have fragments that were not the last block in the file
9993	 * which would corrupt the filesystem.
9994	 */
9995	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9996	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9997		if (adp->ad_offset >= NDADDR)
9998			break;
9999		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10000		/* keep going until hitting a rollback to a frag */
10001		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10002			continue;
10003		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10004		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10005#ifdef INVARIANTS
10006			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10007				panic("softdep_write_inodeblock: lost dep2");
10008#endif /* INVARIANTS */
10009			dp->di_db[i] = 0;
10010		}
10011		for (i = 0; i < NIADDR; i++) {
10012#ifdef INVARIANTS
10013			if (dp->di_ib[i] != 0 &&
10014			    (deplist & ((1 << NDADDR) << i)) == 0)
10015				panic("softdep_write_inodeblock: lost dep3");
10016#endif /* INVARIANTS */
10017			dp->di_ib[i] = 0;
10018		}
10019		return;
10020	}
10021	/*
10022	 * If we have zero'ed out the last allocated block of the file,
10023	 * roll back the size to the last currently allocated block.
10024	 * We know that this last allocated block is full-sized as
10025	 * we already checked for fragments in the loop above.
10026	 */
10027	if (lastadp != NULL &&
10028	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10029		for (i = lastadp->ad_offset; i >= 0; i--)
10030			if (dp->di_db[i] != 0)
10031				break;
10032		dp->di_size = (i + 1) * fs->fs_bsize;
10033	}
10034	/*
10035	 * The only dependencies are for indirect blocks.
10036	 *
10037	 * The file size for indirect block additions is not guaranteed.
10038	 * Such a guarantee would be non-trivial to achieve. The conventional
10039	 * synchronous write implementation also does not make this guarantee.
10040	 * Fsck should catch and fix discrepancies. Arguably, the file size
10041	 * can be over-estimated without destroying integrity when the file
10042	 * moves into the indirect blocks (i.e., is large). If we want to
10043	 * postpone fsck, we are stuck with this argument.
10044	 */
10045	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10046		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10047}
10048
10049/*
10050 * Cancel an indirdep as a result of truncation.  Release all of the
10051 * children allocindirs and place their journal work on the appropriate
10052 * list.
10053 */
10054static void
10055cancel_indirdep(indirdep, bp, freeblks)
10056	struct indirdep *indirdep;
10057	struct buf *bp;
10058	struct freeblks *freeblks;
10059{
10060	struct allocindir *aip;
10061
10062	/*
10063	 * None of the indirect pointers will ever be visible,
10064	 * so they can simply be tossed. GOINGAWAY ensures
10065	 * that allocated pointers will be saved in the buffer
10066	 * cache until they are freed. Note that they will
10067	 * only be able to be found by their physical address
10068	 * since the inode mapping the logical address will
10069	 * be gone. The save buffer used for the safe copy
10070	 * was allocated in setup_allocindir_phase2 using
10071	 * the physical address so it could be used for this
10072	 * purpose. Hence we swap the safe copy with the real
10073	 * copy, allowing the safe copy to be freed and holding
10074	 * on to the real copy for later use in indir_trunc.
10075	 */
10076	if (indirdep->ir_state & GOINGAWAY)
10077		panic("cancel_indirdep: already gone");
10078	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10079		indirdep->ir_state |= DEPCOMPLETE;
10080		LIST_REMOVE(indirdep, ir_next);
10081	}
10082	indirdep->ir_state |= GOINGAWAY;
10083	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
10084	/*
10085	 * Pass in bp for blocks that still have journal writes
10086	 * pending so we can cancel them on their own.
10087	 */
10088	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10089		cancel_allocindir(aip, bp, freeblks, 0);
10090	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10091		cancel_allocindir(aip, NULL, freeblks, 0);
10092	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10093		cancel_allocindir(aip, NULL, freeblks, 0);
10094	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10095		cancel_allocindir(aip, NULL, freeblks, 0);
10096	/*
10097	 * If there are pending partial truncations we need to keep the
10098	 * old block copy around until they complete.  This is because
10099	 * the current b_data is not a perfect superset of the available
10100	 * blocks.
10101	 */
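	/*
	 * With no pending truncations the current contents become the new
	 * safe copy in ir_savebp; otherwise they are stashed in
	 * ir_saveddata so the existing safe copy remains available to the
	 * truncation.
	 */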
10102	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10103		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10104	else
10105		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10106	WORKLIST_REMOVE(&indirdep->ir_list);
10107	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10108	indirdep->ir_bp = NULL;
10109	indirdep->ir_freeblks = freeblks;
10110}
10111
10112/*
10113 * Free an indirdep once it no longer has new pointers to track.
10114 */
10115static void
10116free_indirdep(indirdep)
10117	struct indirdep *indirdep;
10118{
10119
10120	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10121	    ("free_indirdep: Indir trunc list not empty."));
10122	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10123	    ("free_indirdep: Complete head not empty."));
10124	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10125	    ("free_indirdep: write head not empty."));
10126	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10127	    ("free_indirdep: done head not empty."));
10128	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10129	    ("free_indirdep: deplist head not empty."));
10130	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10131	    ("free_indirdep: %p still on newblk list.", indirdep));
10132	KASSERT(indirdep->ir_saveddata == NULL,
10133	    ("free_indirdep: %p still has saved data.", indirdep));
10134	if (indirdep->ir_state & ONWORKLIST)
10135		WORKLIST_REMOVE(&indirdep->ir_list);
10136	WORKITEM_FREE(indirdep, D_INDIRDEP);
10137}
10138
10139/*
10140 * Called before a write to an indirdep.  This routine is responsible for
10141 * rolling back pointers to a safe state which includes only those
10142 * allocindirs which have been completed.
10143 */
10144static void
10145initiate_write_indirdep(indirdep, bp)
10146	struct indirdep *indirdep;
10147	struct buf *bp;
10148{
10149
10150	indirdep->ir_state |= IOSTARTED;
10151	if (indirdep->ir_state & GOINGAWAY)
10152		panic("disk_io_initiation: indirdep gone");
10153	/*
10154	 * If there are no remaining dependencies, this will be writing
10155	 * the real pointers.
10156	 */
10157	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10158	    TAILQ_EMPTY(&indirdep->ir_trunc))
10159		return;
10160	/*
10161	 * Replace up-to-date version with safe version.
10162	 */
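	/*
	 * The up-to-date pointers are stashed in ir_saveddata while the
	 * safe copy from ir_savebp, which holds only pointers whose
	 * allocindirs have completed, goes to disk.  They are restored by
	 * handle_written_indirdep() when the write finishes.
	 */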
10163	if (indirdep->ir_saveddata == NULL) {
10164		FREE_LOCK(&lk);
10165		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10166		    M_SOFTDEP_FLAGS);
10167		ACQUIRE_LOCK(&lk);
10168	}
10169	indirdep->ir_state &= ~ATTACHED;
10170	indirdep->ir_state |= UNDONE;
10171	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10172	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10173	    bp->b_bcount);
10174}
10175
10176/*
10177 * Called when an inode has been cleared in a cg bitmap.  This finally
10178 * eliminates any canceled jaddrefs.
10179 */
10180void
10181softdep_setup_inofree(mp, bp, ino, wkhd)
10182	struct mount *mp;
10183	struct buf *bp;
10184	ino_t ino;
10185	struct workhead *wkhd;
10186{
10187	struct worklist *wk, *wkn;
10188	struct inodedep *inodedep;
10189	uint8_t *inosused;
10190	struct cg *cgp;
10191	struct fs *fs;
10192
10193	ACQUIRE_LOCK(&lk);
10194	fs = VFSTOUFS(mp)->um_fs;
10195	cgp = (struct cg *)bp->b_data;
10196	inosused = cg_inosused(cgp);
10197	if (isset(inosused, ino % fs->fs_ipg))
10198		panic("softdep_setup_inofree: inode %ju not freed.",
10199		    (uintmax_t)ino);
10200	if (inodedep_lookup(mp, ino, 0, &inodedep))
10201		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10202		    (uintmax_t)ino, inodedep);
10203	if (wkhd) {
10204		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10205			if (wk->wk_type != D_JADDREF)
10206				continue;
10207			WORKLIST_REMOVE(wk);
10208			/*
10209			 * We can free immediately even if the jaddref
10210			 * isn't attached in a background write, as the
10211			 * bitmaps are now reconciled.
10212			 */
10213			wk->wk_state |= COMPLETE | ATTACHED;
10214			free_jaddref(WK_JADDREF(wk));
10215		}
10216		jwork_move(&bp->b_dep, wkhd);
10217	}
10218	FREE_LOCK(&lk);
10219}
10220
10221
10222/*
10223 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10224 * map.  Any dependencies waiting for the write to clear are added to the
10225 * buf's list and any jnewblks that are being canceled are discarded
10226 * immediately.
10227 */
10228void
10229softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10230	struct mount *mp;
10231	struct buf *bp;
10232	ufs2_daddr_t blkno;
10233	int frags;
10234	struct workhead *wkhd;
10235{
10236	struct bmsafemap *bmsafemap;
10237	struct jnewblk *jnewblk;
10238	struct worklist *wk;
10239	struct fs *fs;
10240#ifdef SUJ_DEBUG
10241	uint8_t *blksfree;
10242	struct cg *cgp;
10243	ufs2_daddr_t jstart;
10244	ufs2_daddr_t jend;
10245	ufs2_daddr_t end;
10246	long bno;
10247	int i;
10248#endif
10249
10250	ACQUIRE_LOCK(&lk);
10251	/* Lookup the bmsafemap so we track when it is dirty. */
10252	fs = VFSTOUFS(mp)->um_fs;
10253	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10254	/*
10255	 * Detach any jnewblks which have been canceled.  They must linger
10256	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10257	 * an unjournaled allocation from hitting the disk.
10258	 */
10259	if (wkhd) {
10260		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10261			WORKLIST_REMOVE(wk);
10262			if (wk->wk_type != D_JNEWBLK) {
10263				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10264				continue;
10265			}
10266			jnewblk = WK_JNEWBLK(wk);
10267			KASSERT(jnewblk->jn_state & GOINGAWAY,
10268			    ("softdep_setup_blkfree: jnewblk not canceled."));
10269#ifdef SUJ_DEBUG
10270			/*
10271			 * Assert that this block is free in the bitmap
10272			 * before we discard the jnewblk.
10273			 */
10274			cgp = (struct cg *)bp->b_data;
10275			blksfree = cg_blksfree(cgp);
10276			bno = dtogd(fs, jnewblk->jn_blkno);
10277			for (i = jnewblk->jn_oldfrags;
10278			    i < jnewblk->jn_frags; i++) {
10279				if (isset(blksfree, bno + i))
10280					continue;
10281				panic("softdep_setup_blkfree: not free");
10282			}
10283#endif
10284			/*
10285			 * Even if it's not attached we can free immediately
10286			 * as the new bitmap is correct.
10287			 */
10288			wk->wk_state |= COMPLETE | ATTACHED;
10289			free_jnewblk(jnewblk);
10290		}
10291	}
10292
10293#ifdef SUJ_DEBUG
10294	/*
10295	 * Assert that we are not freeing a block which has an outstanding
10296	 * allocation dependency.
10297	 */
10298	fs = VFSTOUFS(mp)->um_fs;
10299	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10300	end = blkno + frags;
10301	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10302		/*
10303		 * Don't match against blocks that will be freed when the
10304		 * background write is done.
10305		 */
10306		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10307		    (COMPLETE | DEPCOMPLETE))
10308			continue;
10309		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10310		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10311		if ((blkno >= jstart && blkno < jend) ||
10312		    (end > jstart && end <= jend)) {
10313			printf("state 0x%X %jd - %d %d dep %p\n",
10314			    jnewblk->jn_state, jnewblk->jn_blkno,
10315			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10316			    jnewblk->jn_dep);
10317			panic("softdep_setup_blkfree: "
10318			    "%jd-%jd(%d) overlaps with %jd-%jd",
10319			    blkno, end, frags, jstart, jend);
10320		}
10321	}
10322#endif
10323	FREE_LOCK(&lk);
10324}
10325
10326/*
10327 * Revert a block allocation when the journal record that describes it
10328 * is not yet written.
10329 */
10330int
10331jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10332	struct jnewblk *jnewblk;
10333	struct fs *fs;
10334	struct cg *cgp;
10335	uint8_t *blksfree;
10336{
10337	ufs1_daddr_t fragno;
10338	long cgbno, bbase;
10339	int frags, blk;
10340	int i;
10341
10342	frags = 0;
10343	cgbno = dtogd(fs, jnewblk->jn_blkno);
10344	/*
10345	 * We have to test which frags need to be rolled back.  We may
10346	 * be operating on a stale copy when doing background writes.
10347	 */
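	/*
	 * For example (illustrative numbers only): with jn_oldfrags == 2
	 * and jn_frags == 6, only those of fragments 2 through 5 that are
	 * still marked allocated in blksfree are counted and returned to
	 * the map below.
	 */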
10348	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10349		if (isclr(blksfree, cgbno + i))
10350			frags++;
10351	if (frags == 0)
10352		return (0);
10353	/*
10354	 * This is mostly ffs_blkfree() sans some validation and
10355	 * superblock updates.
10356	 */
10357	if (frags == fs->fs_frag) {
10358		fragno = fragstoblks(fs, cgbno);
10359		ffs_setblock(fs, blksfree, fragno);
10360		ffs_clusteracct(fs, cgp, fragno, 1);
10361		cgp->cg_cs.cs_nbfree++;
10362	} else {
10363		cgbno += jnewblk->jn_oldfrags;
10364		bbase = cgbno - fragnum(fs, cgbno);
10365		/* Decrement the old frags.  */
10366		blk = blkmap(fs, blksfree, bbase);
10367		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10368		/* Deallocate the fragment */
10369		for (i = 0; i < frags; i++)
10370			setbit(blksfree, cgbno + i);
10371		cgp->cg_cs.cs_nffree += frags;
10372		/* Add back in counts associated with the new frags */
10373		blk = blkmap(fs, blksfree, bbase);
10374		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10375		/* If a complete block has been reassembled, account for it. */
10376		fragno = fragstoblks(fs, bbase);
10377		if (ffs_isblock(fs, blksfree, fragno)) {
10378			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10379			ffs_clusteracct(fs, cgp, fragno, 1);
10380			cgp->cg_cs.cs_nbfree++;
10381		}
10382	}
10383	stat_jnewblk++;
10384	jnewblk->jn_state &= ~ATTACHED;
10385	jnewblk->jn_state |= UNDONE;
10386
10387	return (frags);
10388}
10389
10390static void
10391initiate_write_bmsafemap(bmsafemap, bp)
10392	struct bmsafemap *bmsafemap;
10393	struct buf *bp;			/* The cg block. */
10394{
10395	struct jaddref *jaddref;
10396	struct jnewblk *jnewblk;
10397	uint8_t *inosused;
10398	uint8_t *blksfree;
10399	struct cg *cgp;
10400	struct fs *fs;
10401	ino_t ino;
10402
10403	if (bmsafemap->sm_state & IOSTARTED)
10404		panic("initiate_write_bmsafemap: Already started\n");
10405	bmsafemap->sm_state |= IOSTARTED;
10406	/*
10407	 * Clear any inode allocations which are pending journal writes.
10408	 */
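	/*
	 * Each such inode is returned to the free map in this copy of the
	 * cg so that the allocation never reaches the disk ahead of its
	 * journal record.  handle_written_bmsafemap() rolls the allocation
	 * forward again in the in-memory copy once this write completes.
	 */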
10409	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10410		cgp = (struct cg *)bp->b_data;
10411		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10412		inosused = cg_inosused(cgp);
10413		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10414			ino = jaddref->ja_ino % fs->fs_ipg;
10415			/*
10416			 * If this is a background copy the inode may not
10417			 * be marked used yet.
10418			 */
10419			if (isset(inosused, ino)) {
10420				if ((jaddref->ja_mode & IFMT) == IFDIR)
10421					cgp->cg_cs.cs_ndir--;
10422				cgp->cg_cs.cs_nifree++;
10423				clrbit(inosused, ino);
10424				jaddref->ja_state &= ~ATTACHED;
10425				jaddref->ja_state |= UNDONE;
10426				stat_jaddref++;
10427			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
10428				panic("initiate_write_bmsafemap: inode %ju "
10429				    "marked free", (uintmax_t)jaddref->ja_ino);
10430		}
10431	}
10432	/*
10433	 * Clear any block allocations which are pending journal writes.
10434	 */
10435	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10436		cgp = (struct cg *)bp->b_data;
10437		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10438		blksfree = cg_blksfree(cgp);
10439		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10440			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10441				continue;
10442			if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
10443				panic("initiate_write_bmsafemap: block %jd "
10444				    "marked free", (intmax_t)jnewblk->jn_blkno);
10445		}
10446	}
10447	/*
10448	 * Move allocation lists to the written lists so they can be
10449	 * cleared once the block write is complete.
10450	 */
10451	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10452	    inodedep, id_deps);
10453	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10454	    newblk, nb_deps);
10455	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10456	    wk_list);
10457}
10458
10459/*
10460 * This routine is called during the completion interrupt
10461 * service routine for a disk write (from the procedure called
10462 * by the device driver to inform the filesystem caches of
10463 * a request completion).  It should be called early in this
10464 * procedure, before the block is made available to other
10465 * processes or other routines are called.
10467 */
10468static void
10469softdep_disk_write_complete(bp)
10470	struct buf *bp;		/* describes the completed disk write */
10471{
10472	struct worklist *wk;
10473	struct worklist *owk;
10474	struct workhead reattach;
10475	struct freeblks *freeblks;
10476	struct buf *sbp;
10477
10478	/*
10479	 * If an error occurred while doing the write, then the data
10480	 * has not hit the disk and the dependencies cannot be unrolled.
10481	 */
10482	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10483		return;
10484	LIST_INIT(&reattach);
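	/*
	 * Handlers that report remaining work (rollbacks that left the
	 * buffer dirty) have their worklist items collected on the
	 * reattach list and hung back off bp->b_dep at the end of this
	 * routine.
	 */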
10485	/*
10486	 * This lock must not be released anywhere in this code segment.
10487	 */
10488	sbp = NULL;
10489	owk = NULL;
10490	ACQUIRE_LOCK(&lk);
10491	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10492		WORKLIST_REMOVE(wk);
10493		dep_write[wk->wk_type]++;
10494		if (wk == owk)
10495			panic("duplicate worklist: %p\n", wk);
10496		owk = wk;
10497		switch (wk->wk_type) {
10498
10499		case D_PAGEDEP:
10500			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10501				WORKLIST_INSERT(&reattach, wk);
10502			continue;
10503
10504		case D_INODEDEP:
10505			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10506				WORKLIST_INSERT(&reattach, wk);
10507			continue;
10508
10509		case D_BMSAFEMAP:
10510			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10511				WORKLIST_INSERT(&reattach, wk);
10512			continue;
10513
10514		case D_MKDIR:
10515			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10516			continue;
10517
10518		case D_ALLOCDIRECT:
10519			wk->wk_state |= COMPLETE;
10520			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10521			continue;
10522
10523		case D_ALLOCINDIR:
10524			wk->wk_state |= COMPLETE;
10525			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10526			continue;
10527
10528		case D_INDIRDEP:
10529			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10530				WORKLIST_INSERT(&reattach, wk);
10531			continue;
10532
10533		case D_FREEBLKS:
10534			wk->wk_state |= COMPLETE;
10535			freeblks = WK_FREEBLKS(wk);
10536			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
10537			    LIST_EMPTY(&freeblks->fb_jblkdephd))
10538				add_to_worklist(wk, WK_NODELAY);
10539			continue;
10540
10541		case D_FREEWORK:
10542			handle_written_freework(WK_FREEWORK(wk));
10543			break;
10544
10545		case D_JSEGDEP:
10546			free_jsegdep(WK_JSEGDEP(wk));
10547			continue;
10548
10549		case D_JSEG:
10550			handle_written_jseg(WK_JSEG(wk), bp);
10551			continue;
10552
10553		case D_SBDEP:
10554			if (handle_written_sbdep(WK_SBDEP(wk), bp))
10555				WORKLIST_INSERT(&reattach, wk);
10556			continue;
10557
10558		case D_FREEDEP:
10559			free_freedep(WK_FREEDEP(wk));
10560			continue;
10561
10562		default:
10563			panic("handle_disk_write_complete: Unknown type %s",
10564			    TYPENAME(wk->wk_type));
10565			/* NOTREACHED */
10566		}
10567	}
10568	/*
10569	 * Reattach any requests that must be redone.
10570	 */
10571	while ((wk = LIST_FIRST(&reattach)) != NULL) {
10572		WORKLIST_REMOVE(wk);
10573		WORKLIST_INSERT(&bp->b_dep, wk);
10574	}
10575	FREE_LOCK(&lk);
10576	if (sbp)
10577		brelse(sbp);
10578}
10579
10580/*
10581 * Called from within softdep_disk_write_complete above. Note that
10582 * this routine is always called from interrupt level with further
10583 * splbio interrupts blocked.
10584 */
10585static void
10586handle_allocdirect_partdone(adp, wkhd)
10587	struct allocdirect *adp;	/* the completed allocdirect */
10588	struct workhead *wkhd;		/* Work to do when inode is written. */
10589{
10590	struct allocdirectlst *listhead;
10591	struct allocdirect *listadp;
10592	struct inodedep *inodedep;
10593	long bsize;
10594
10595	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10596		return;
10597	/*
10598	 * The on-disk inode cannot claim to be any larger than the last
10599	 * fragment that has been written. Otherwise, the on-disk inode
10600	 * might have fragments that were not the last block in the file
10601	 * which would corrupt the filesystem. Thus, we cannot free any
10602	 * allocdirects after one whose ad_oldblkno claims a fragment as
10603	 * these blocks must be rolled back to zero before writing the inode.
10604	 * We check the currently active set of allocdirects in id_inoupdt
10605	 * or id_extupdt as appropriate.
10606	 */
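	/*
	 * For example: if an earlier allocdirect on the active list still
	 * records a fragment-sized ad_oldsize, the scan below returns
	 * without freeing adp; adp is picked up again when that earlier
	 * dependency completes.
	 */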
10607	inodedep = adp->ad_inodedep;
10608	bsize = inodedep->id_fs->fs_bsize;
10609	if (adp->ad_state & EXTDATA)
10610		listhead = &inodedep->id_extupdt;
10611	else
10612		listhead = &inodedep->id_inoupdt;
10613	TAILQ_FOREACH(listadp, listhead, ad_next) {
10614		/* found our block */
10615		if (listadp == adp)
10616			break;
10617		/* continue if the old block is not a fragment */
10618		if (listadp->ad_oldsize == 0 ||
10619		    listadp->ad_oldsize == bsize)
10620			continue;
10621		/* hit a fragment */
10622		return;
10623	}
10624	/*
10625	 * If we have reached the end of the current list without
10626	 * finding the just finished dependency, then it must be
10627	 * on the future dependency list. Future dependencies cannot
10628	 * be freed until they are moved to the current list.
10629	 */
10630	if (listadp == NULL) {
10631#ifdef DEBUG
10632		if (adp->ad_state & EXTDATA)
10633			listhead = &inodedep->id_newextupdt;
10634		else
10635			listhead = &inodedep->id_newinoupdt;
10636		TAILQ_FOREACH(listadp, listhead, ad_next)
10637			/* found our block */
10638			if (listadp == adp)
10639				break;
10640		if (listadp == NULL)
10641			panic("handle_allocdirect_partdone: lost dep");
10642#endif /* DEBUG */
10643		return;
10644	}
10645	/*
10646	 * If we have found the just finished dependency, then queue
10647	 * it along with anything that follows it that is complete.
10648	 * Since the pointer has not yet been written in the inode
10649	 * as the dependency prevents it, place the allocdirect on the
10650	 * bufwait list where it will be freed once the pointer is
10651	 * valid.
10652	 */
10653	if (wkhd == NULL)
10654		wkhd = &inodedep->id_bufwait;
10655	for (; adp; adp = listadp) {
10656		listadp = TAILQ_NEXT(adp, ad_next);
10657		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10658			return;
10659		TAILQ_REMOVE(listhead, adp, ad_next);
10660		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
10661	}
10662}
10663
10664/*
10665 * Called from within softdep_disk_write_complete above.  This routine
10666 * completes successfully written allocindirs.
10667 */
10668static void
10669handle_allocindir_partdone(aip)
10670	struct allocindir *aip;		/* the completed allocindir */
10671{
10672	struct indirdep *indirdep;
10673
10674	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
10675		return;
10676	indirdep = aip->ai_indirdep;
10677	LIST_REMOVE(aip, ai_next);
10678	/*
10679	 * Don't set a pointer while the buffer is undergoing IO or while
10680	 * we have active truncations.
10681	 */
10682	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
10683		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
10684		return;
10685	}
10686	if (indirdep->ir_state & UFS1FMT)
10687		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10688		    aip->ai_newblkno;
10689	else
10690		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10691		    aip->ai_newblkno;
10692	/*
10693	 * Await the pointer write before freeing the allocindir.
10694	 */
10695	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
10696}
10697
10698/*
10699 * Release segments held on a jwork list.
10700 */
10701static void
10702handle_jwork(wkhd)
10703	struct workhead *wkhd;
10704{
10705	struct worklist *wk;
10706
10707	while ((wk = LIST_FIRST(wkhd)) != NULL) {
10708		WORKLIST_REMOVE(wk);
10709		switch (wk->wk_type) {
10710		case D_JSEGDEP:
10711			free_jsegdep(WK_JSEGDEP(wk));
10712			continue;
10713		case D_FREEDEP:
10714			free_freedep(WK_FREEDEP(wk));
10715			continue;
10716		case D_FREEFRAG:
10717			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
10718			WORKITEM_FREE(wk, D_FREEFRAG);
10719			continue;
10720		case D_FREEWORK:
10721			handle_written_freework(WK_FREEWORK(wk));
10722			continue;
10723		default:
10724			panic("handle_jwork: Unknown type %s\n",
10725			    TYPENAME(wk->wk_type));
10726		}
10727	}
10728}
10729
10730/*
10731 * Handle the bufwait list on an inode when it is safe to release items
10732 * held there.  This normally happens after an inode block is written but
10733 * may be delayed and handled later if there are pending journal items that
10734 * are not yet safe to be released.
10735 */
10736static struct freefile *
10737handle_bufwait(inodedep, refhd)
10738	struct inodedep *inodedep;
10739	struct workhead *refhd;
10740{
10741	struct jaddref *jaddref;
10742	struct freefile *freefile;
10743	struct worklist *wk;
10744
10745	freefile = NULL;
10746	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
10747		WORKLIST_REMOVE(wk);
10748		switch (wk->wk_type) {
10749		case D_FREEFILE:
10750			/*
10751			 * We defer adding freefile to the worklist
10752			 * until all other additions have been made to
10753			 * ensure that it will be done after all the
10754			 * old blocks have been freed.
10755			 */
10756			if (freefile != NULL)
10757				panic("handle_bufwait: freefile");
10758			freefile = WK_FREEFILE(wk);
10759			continue;
10760
10761		case D_MKDIR:
10762			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
10763			continue;
10764
10765		case D_DIRADD:
10766			diradd_inode_written(WK_DIRADD(wk), inodedep);
10767			continue;
10768
10769		case D_FREEFRAG:
10770			wk->wk_state |= COMPLETE;
10771			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
10772				add_to_worklist(wk, 0);
10773			continue;
10774
10775		case D_DIRREM:
10776			wk->wk_state |= COMPLETE;
10777			add_to_worklist(wk, 0);
10778			continue;
10779
10780		case D_ALLOCDIRECT:
10781		case D_ALLOCINDIR:
10782			free_newblk(WK_NEWBLK(wk));
10783			continue;
10784
10785		case D_JNEWBLK:
10786			wk->wk_state |= COMPLETE;
10787			free_jnewblk(WK_JNEWBLK(wk));
10788			continue;
10789
10790		/*
10791		 * Save freed journal segments and add references on
10792		 * the supplied list which will delay their release
10793		 * until the cg bitmap is cleared on disk.
10794		 */
10795		case D_JSEGDEP:
10796			if (refhd == NULL)
10797				free_jsegdep(WK_JSEGDEP(wk));
10798			else
10799				WORKLIST_INSERT(refhd, wk);
10800			continue;
10801
10802		case D_JADDREF:
10803			jaddref = WK_JADDREF(wk);
10804			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
10805			    if_deps);
10806			/*
10807			 * Transfer any jaddrefs to the list to be freed with
10808			 * the bitmap if we're handling a removed file.
10809			 */
10810			if (refhd == NULL) {
10811				wk->wk_state |= COMPLETE;
10812				free_jaddref(jaddref);
10813			} else
10814				WORKLIST_INSERT(refhd, wk);
10815			continue;
10816
10817		default:
10818			panic("handle_bufwait: Unknown type %p(%s)",
10819			    wk, TYPENAME(wk->wk_type));
10820			/* NOTREACHED */
10821		}
10822	}
10823	return (freefile);
10824}
10825/*
10826 * Called from within softdep_disk_write_complete above to restore
10827 * in-memory inode block contents to their most up-to-date state. Note
10828 * that this routine is always called from interrupt level with further
10829 * splbio interrupts blocked.
10830 */
10831static int
10832handle_written_inodeblock(inodedep, bp)
10833	struct inodedep *inodedep;
10834	struct buf *bp;		/* buffer containing the inode block */
10835{
10836	struct freefile *freefile;
10837	struct allocdirect *adp, *nextadp;
10838	struct ufs1_dinode *dp1 = NULL;
10839	struct ufs2_dinode *dp2 = NULL;
10840	struct workhead wkhd;
10841	int hadchanges, fstype;
10842	ino_t freelink;
10843
10844	LIST_INIT(&wkhd);
10845	hadchanges = 0;
10846	freefile = NULL;
10847	if ((inodedep->id_state & IOSTARTED) == 0)
10848		panic("handle_written_inodeblock: not started");
10849	inodedep->id_state &= ~IOSTARTED;
10850	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
10851		fstype = UFS1;
10852		dp1 = (struct ufs1_dinode *)bp->b_data +
10853		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
10854		freelink = dp1->di_freelink;
10855	} else {
10856		fstype = UFS2;
10857		dp2 = (struct ufs2_dinode *)bp->b_data +
10858		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
10859		freelink = dp2->di_freelink;
10860	}
10861	/*
10862	 * Leave this inodeblock dirty until it's in the list.
10863	 */
10864	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
10865		struct inodedep *inon;
10866
10867		inon = TAILQ_NEXT(inodedep, id_unlinked);
10868		if ((inon == NULL && freelink == 0) ||
10869		    (inon && inon->id_ino == freelink)) {
10870			if (inon)
10871				inon->id_state |= UNLINKPREV;
10872			inodedep->id_state |= UNLINKNEXT;
10873		}
10874		hadchanges = 1;
10875	}
10876	/*
10877	 * If we had to rollback the inode allocation because of
10878	 * bitmaps being incomplete, then simply restore it.
10879	 * Keep the block dirty so that it will not be reclaimed until
10880	 * all associated dependencies have been cleared and the
10881	 * corresponding updates written to disk.
10882	 */
10883	if (inodedep->id_savedino1 != NULL) {
10884		hadchanges = 1;
10885		if (fstype == UFS1)
10886			*dp1 = *inodedep->id_savedino1;
10887		else
10888			*dp2 = *inodedep->id_savedino2;
10889		free(inodedep->id_savedino1, M_SAVEDINO);
10890		inodedep->id_savedino1 = NULL;
10891		if ((bp->b_flags & B_DELWRI) == 0)
10892			stat_inode_bitmap++;
10893		bdirty(bp);
10894		/*
10895		 * If the inode is clear here and GOINGAWAY it will never
10896		 * be written.  Process the bufwait and clear any pending
10897		 * work which may include the freefile.
10898		 */
10899		if (inodedep->id_state & GOINGAWAY)
10900			goto bufwait;
10901		return (1);
10902	}
10903	inodedep->id_state |= COMPLETE;
10904	/*
10905	 * Roll forward anything that had to be rolled back before
10906	 * the inode could be updated.
10907	 */
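	/*
	 * This is the inverse of the rollback done in
	 * initiate_write_inodeblock_ufs1/ufs2: each UNDONE allocdirect has
	 * its new block number written into the in-memory inode and is
	 * marked ATTACHED again.
	 */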
10908	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
10909		nextadp = TAILQ_NEXT(adp, ad_next);
10910		if (adp->ad_state & ATTACHED)
10911			panic("handle_written_inodeblock: new entry");
10912		if (fstype == UFS1) {
10913			if (adp->ad_offset < NDADDR) {
10914				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
10915					panic("%s %s #%jd mismatch %d != %jd",
10916					    "handle_written_inodeblock:",
10917					    "direct pointer",
10918					    (intmax_t)adp->ad_offset,
10919					    dp1->di_db[adp->ad_offset],
10920					    (intmax_t)adp->ad_oldblkno);
10921				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
10922			} else {
10923				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
10924					panic("%s: %s #%jd allocated as %d",
10925					    "handle_written_inodeblock",
10926					    "indirect pointer",
10927					    (intmax_t)adp->ad_offset - NDADDR,
10928					    dp1->di_ib[adp->ad_offset - NDADDR]);
10929				dp1->di_ib[adp->ad_offset - NDADDR] =
10930				    adp->ad_newblkno;
10931			}
10932		} else {
10933			if (adp->ad_offset < NDADDR) {
10934				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
10935					panic("%s: %s #%jd %s %jd != %jd",
10936					    "handle_written_inodeblock",
10937					    "direct pointer",
10938					    (intmax_t)adp->ad_offset, "mismatch",
10939					    (intmax_t)dp2->di_db[adp->ad_offset],
10940					    (intmax_t)adp->ad_oldblkno);
10941				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
10942			} else {
10943				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
10944					panic("%s: %s #%jd allocated as %jd",
10945					    "handle_written_inodeblock",
10946					    "indirect pointer",
10947					    (intmax_t)adp->ad_offset - NDADDR,
10948					    (intmax_t)
10949					    dp2->di_ib[adp->ad_offset - NDADDR]);
10950				dp2->di_ib[adp->ad_offset - NDADDR] =
10951				    adp->ad_newblkno;
10952			}
10953		}
10954		adp->ad_state &= ~UNDONE;
10955		adp->ad_state |= ATTACHED;
10956		hadchanges = 1;
10957	}
10958	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
10959		nextadp = TAILQ_NEXT(adp, ad_next);
10960		if (adp->ad_state & ATTACHED)
10961			panic("handle_written_inodeblock: new entry");
10962		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
10963			panic("%s: direct pointers #%jd %s %jd != %jd",
10964			    "handle_written_inodeblock",
10965			    (intmax_t)adp->ad_offset, "mismatch",
10966			    (intmax_t)dp2->di_extb[adp->ad_offset],
10967			    (intmax_t)adp->ad_oldblkno);
10968		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
10969		adp->ad_state &= ~UNDONE;
10970		adp->ad_state |= ATTACHED;
10971		hadchanges = 1;
10972	}
10973	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
10974		stat_direct_blk_ptrs++;
10975	/*
10976	 * Reset the file size to its most up-to-date value.
10977	 */
10978	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
10979		panic("handle_written_inodeblock: bad size");
10980	if (inodedep->id_savednlink > LINK_MAX)
10981		panic("handle_written_inodeblock: Invalid link count "
10982		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
10983	if (fstype == UFS1) {
10984		if (dp1->di_nlink != inodedep->id_savednlink) {
10985			dp1->di_nlink = inodedep->id_savednlink;
10986			hadchanges = 1;
10987		}
10988		if (dp1->di_size != inodedep->id_savedsize) {
10989			dp1->di_size = inodedep->id_savedsize;
10990			hadchanges = 1;
10991		}
10992	} else {
10993		if (dp2->di_nlink != inodedep->id_savednlink) {
10994			dp2->di_nlink = inodedep->id_savednlink;
10995			hadchanges = 1;
10996		}
10997		if (dp2->di_size != inodedep->id_savedsize) {
10998			dp2->di_size = inodedep->id_savedsize;
10999			hadchanges = 1;
11000		}
11001		if (dp2->di_extsize != inodedep->id_savedextsize) {
11002			dp2->di_extsize = inodedep->id_savedextsize;
11003			hadchanges = 1;
11004		}
11005	}
11006	inodedep->id_savedsize = -1;
11007	inodedep->id_savedextsize = -1;
11008	inodedep->id_savednlink = -1;
11009	/*
11010	 * If there were any rollbacks in the inode block, then it must be
11011	 * marked dirty so that it will eventually get written back in
11012	 * its correct form.
11013	 */
11014	if (hadchanges)
11015		bdirty(bp);
11016bufwait:
11017	/*
11018	 * Process any allocdirects that completed during the update.
11019	 */
11020	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11021		handle_allocdirect_partdone(adp, &wkhd);
11022	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11023		handle_allocdirect_partdone(adp, &wkhd);
11024	/*
11025	 * Process deallocations that were held pending until the
11026	 * inode had been written to disk. Freeing of the inode
11027	 * is delayed until after all blocks have been freed to
11028	 * avoid creation of new <vfsid, inum, lbn> triples
11029	 * before the old ones have been deleted.  Completely
11030	 * unlinked inodes are not processed until the unlinked
11031	 * inode list is written or the last reference is removed.
11032	 */
11033	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11034		freefile = handle_bufwait(inodedep, NULL);
11035		if (freefile && !LIST_EMPTY(&wkhd)) {
11036			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11037			freefile = NULL;
11038		}
11039	}
11040	/*
11041	 * Move rolled forward dependency completions to the bufwait list
11042	 * now that those that were already written have been processed.
11043	 */
11044	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11045		panic("handle_written_inodeblock: bufwait but no changes");
11046	jwork_move(&inodedep->id_bufwait, &wkhd);
11047
11048	if (freefile != NULL) {
11049		/*
11050		 * If the inode is goingaway it was never written.  Fake up
11051		 * the state here so free_inodedep() can succeed.
11052		 */
11053		if (inodedep->id_state & GOINGAWAY)
11054			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11055		if (free_inodedep(inodedep) == 0)
11056			panic("handle_written_inodeblock: live inodedep %p",
11057			    inodedep);
11058		add_to_worklist(&freefile->fx_list, 0);
11059		return (0);
11060	}
11061
11062	/*
11063	 * If no outstanding dependencies, free it.
11064	 */
11065	if (free_inodedep(inodedep) ||
11066	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11067	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11068	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11069	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11070		return (0);
11071	return (hadchanges);
11072}
11073
11074static int
11075handle_written_indirdep(indirdep, bp, bpp)
11076	struct indirdep *indirdep;
11077	struct buf *bp;
11078	struct buf **bpp;
11079{
11080	struct allocindir *aip;
11081	struct buf *sbp;
11082	int chgs;
11083
11084	if (indirdep->ir_state & GOINGAWAY)
11085		panic("handle_written_indirdep: indirdep gone");
11086	if ((indirdep->ir_state & IOSTARTED) == 0)
11087		panic("handle_written_indirdep: IO not started");
11088	chgs = 0;
11089	/*
11090	 * If there were rollbacks revert them here.
11091	 */
11092	if (indirdep->ir_saveddata) {
11093		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11094		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11095			free(indirdep->ir_saveddata, M_INDIRDEP);
11096			indirdep->ir_saveddata = NULL;
11097		}
11098		chgs = 1;
11099	}
11100	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11101	indirdep->ir_state |= ATTACHED;
11102	/*
11103	 * Move allocindirs with written pointers to the completehd if
11104	 * the indirdep's pointer is not yet written.  Otherwise
11105	 * free them here.
11106	 */
11107	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11108		LIST_REMOVE(aip, ai_next);
11109		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11110			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11111			    ai_next);
11112			newblk_freefrag(&aip->ai_block);
11113			continue;
11114		}
11115		free_newblk(&aip->ai_block);
11116	}
11117	/*
11118	 * Move allocindirs that have finished dependency processing from
11119	 * the done list to the write list after updating the pointers.
11120	 */
11121	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11122		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11123			handle_allocindir_partdone(aip);
11124			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11125				panic("disk_write_complete: not gone");
11126			chgs = 1;
11127		}
11128	}
11129	/*
11130	 * Preserve the indirdep if there were any changes or if it is not
11131	 * yet valid on disk.
11132	 */
11133	if (chgs) {
11134		stat_indir_blk_ptrs++;
11135		bdirty(bp);
11136		return (1);
11137	}
11138	/*
11139	 * If there were no changes we can discard the savedbp and detach
11140	 * ourselves from the buf.  We are only carrying completed pointers
11141	 * in this case.
11142	 */
11143	sbp = indirdep->ir_savebp;
11144	sbp->b_flags |= B_INVAL | B_NOCACHE;
11145	indirdep->ir_savebp = NULL;
11146	indirdep->ir_bp = NULL;
11147	if (*bpp != NULL)
11148		panic("handle_written_indirdep: bp already exists.");
11149	*bpp = sbp;
11150	/*
11151	 * The indirdep may not be freed until its parent points at it.
11152	 */
11153	if (indirdep->ir_state & DEPCOMPLETE)
11154		free_indirdep(indirdep);
11155
11156	return (0);
11157}
11158
11159/*
11160 * Process a diradd entry after its dependent inode has been written.
11161 * This routine must be called with splbio interrupts blocked.
11162 */
11163static void
11164diradd_inode_written(dap, inodedep)
11165	struct diradd *dap;
11166	struct inodedep *inodedep;
11167{
11168
11169	dap->da_state |= COMPLETE;
11170	complete_diradd(dap);
11171	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11172}
11173
11174/*
11175 * Returns true if the bmsafemap will have rollbacks when written.  Must
11176 * only be called with lk and the buf lock on the cg held.
11177 */
11178static int
11179bmsafemap_rollbacks(bmsafemap)
11180	struct bmsafemap *bmsafemap;
11181{
11182
11183	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11184	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
11185}
11186
11187/*
11188 * Re-apply an allocation when a cg write is complete.
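 *
 * This is the inverse of jnewblk_rollback(): fragments that were returned
 * to the map before the write are marked allocated again and the fragment
 * and cluster summaries are adjusted to match.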
11189 */
11190static int
11191jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11192	struct jnewblk *jnewblk;
11193	struct fs *fs;
11194	struct cg *cgp;
11195	uint8_t *blksfree;
11196{
11197	ufs1_daddr_t fragno;
11198	ufs2_daddr_t blkno;
11199	long cgbno, bbase;
11200	int frags, blk;
11201	int i;
11202
11203	frags = 0;
11204	cgbno = dtogd(fs, jnewblk->jn_blkno);
11205	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11206		if (isclr(blksfree, cgbno + i))
11207			panic("jnewblk_rollforward: re-allocated fragment");
11208		frags++;
11209	}
11210	if (frags == fs->fs_frag) {
11211		blkno = fragstoblks(fs, cgbno);
11212		ffs_clrblock(fs, blksfree, (long)blkno);
11213		ffs_clusteracct(fs, cgp, blkno, -1);
11214		cgp->cg_cs.cs_nbfree--;
11215	} else {
11216		bbase = cgbno - fragnum(fs, cgbno);
11217		cgbno += jnewblk->jn_oldfrags;
11218		/* If a complete block had been reassembled, account for it. */
11219		fragno = fragstoblks(fs, bbase);
11220		if (ffs_isblock(fs, blksfree, fragno)) {
11221			cgp->cg_cs.cs_nffree += fs->fs_frag;
11222			ffs_clusteracct(fs, cgp, fragno, -1);
11223			cgp->cg_cs.cs_nbfree--;
11224		}
11225		/* Decrement the old frags.  */
11226		blk = blkmap(fs, blksfree, bbase);
11227		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11228		/* Allocate the fragment */
11229		for (i = 0; i < frags; i++)
11230			clrbit(blksfree, cgbno + i);
11231		cgp->cg_cs.cs_nffree -= frags;
11232		/* Add back in counts associated with the new frags */
11233		blk = blkmap(fs, blksfree, bbase);
11234		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11235	}
11236	return (frags);
11237}
11238
11239/*
11240 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11241 * changes if it's not a background write.  Set all written dependencies
11242 * to DEPCOMPLETE and free the structure if possible.
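 *
 * Note that for a background (BX_BKGRDMARKER) write the rollback in
 * initiate_write_bmsafemap() touched only that copy, so the bitmap bits
 * are not re-applied here; the journal dependencies are still marked
 * ATTACHED and released.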
11243 */
11244static int
11245handle_written_bmsafemap(bmsafemap, bp)
11246	struct bmsafemap *bmsafemap;
11247	struct buf *bp;
11248{
11249	struct newblk *newblk;
11250	struct inodedep *inodedep;
11251	struct jaddref *jaddref, *jatmp;
11252	struct jnewblk *jnewblk, *jntmp;
11253	struct ufsmount *ump;
11254	uint8_t *inosused;
11255	uint8_t *blksfree;
11256	struct cg *cgp;
11257	struct fs *fs;
11258	ino_t ino;
11259	int chgs;
11260
11261	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11262		panic("handle_written_bmsafemap: Not started\n");
11263	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11264	chgs = 0;
11265	bmsafemap->sm_state &= ~IOSTARTED;
11266	/*
11267	 * Release journal work that was waiting on the write.
11268	 */
11269	handle_jwork(&bmsafemap->sm_freewr);
11270
11271	/*
11272	 * Restore unwritten inode allocation pending jaddref writes.
11273	 */
11274	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11275		cgp = (struct cg *)bp->b_data;
11276		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11277		inosused = cg_inosused(cgp);
11278		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11279		    ja_bmdeps, jatmp) {
11280			if ((jaddref->ja_state & UNDONE) == 0)
11281				continue;
11282			ino = jaddref->ja_ino % fs->fs_ipg;
11283			if (isset(inosused, ino))
11284				panic("handle_written_bmsafemap: "
11285				    "re-allocated inode");
11286			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
11287				if ((jaddref->ja_mode & IFMT) == IFDIR)
11288					cgp->cg_cs.cs_ndir++;
11289				cgp->cg_cs.cs_nifree--;
11290				setbit(inosused, ino);
11291				chgs = 1;
11292			}
11293			jaddref->ja_state &= ~UNDONE;
11294			jaddref->ja_state |= ATTACHED;
11295			free_jaddref(jaddref);
11296		}
11297	}
11298	/*
11299	 * Restore any block allocations which are pending journal writes.
11300	 */
11301	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11302		cgp = (struct cg *)bp->b_data;
11303		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11304		blksfree = cg_blksfree(cgp);
11305		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11306		    jntmp) {
11307			if ((jnewblk->jn_state & UNDONE) == 0)
11308				continue;
11309			if ((bp->b_xflags & BX_BKGRDMARKER) == 0 &&
11310			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11311				chgs = 1;
11312			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11313			jnewblk->jn_state |= ATTACHED;
11314			free_jnewblk(jnewblk);
11315		}
11316	}
11317	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11318		newblk->nb_state |= DEPCOMPLETE;
11319		newblk->nb_state &= ~ONDEPLIST;
11320		newblk->nb_bmsafemap = NULL;
11321		LIST_REMOVE(newblk, nb_deps);
11322		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11323			handle_allocdirect_partdone(
11324			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11325		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11326			handle_allocindir_partdone(
11327			    WK_ALLOCINDIR(&newblk->nb_list));
11328		else if (newblk->nb_list.wk_type != D_NEWBLK)
11329			panic("handle_written_bmsafemap: Unexpected type: %s",
11330			    TYPENAME(newblk->nb_list.wk_type));
11331	}
11332	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11333		inodedep->id_state |= DEPCOMPLETE;
11334		inodedep->id_state &= ~ONDEPLIST;
11335		LIST_REMOVE(inodedep, id_deps);
11336		inodedep->id_bmsafemap = NULL;
11337	}
11338	LIST_REMOVE(bmsafemap, sm_next);
11339	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11340	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11341	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11342	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11343	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11344		LIST_REMOVE(bmsafemap, sm_hash);
11345		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11346		return (0);
11347	}
11348	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11349	bdirty(bp);
11350	return (1);
11351}
11352
11353/*
11354 * Try to free a mkdir dependency.
11355 */
11356static void
11357complete_mkdir(mkdir)
11358	struct mkdir *mkdir;
11359{
11360	struct diradd *dap;
11361
11362	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11363		return;
11364	LIST_REMOVE(mkdir, md_mkdirs);
11365	dap = mkdir->md_diradd;
11366	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11367	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11368		dap->da_state |= DEPCOMPLETE;
11369		complete_diradd(dap);
11370	}
11371	WORKITEM_FREE(mkdir, D_MKDIR);
11372}
11373
11374/*
11375 * Handle the completion of a mkdir dependency.
11376 */
11377static void
11378handle_written_mkdir(mkdir, type)
11379	struct mkdir *mkdir;
11380	int type;
11381{
11382
11383	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11384		panic("handle_written_mkdir: bad type");
11385	mkdir->md_state |= COMPLETE;
11386	complete_mkdir(mkdir);
11387}
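
/*
 * In outline, a directory created by mkdir hangs two struct mkdir work
 * items off the same diradd: one tagged MKDIR_BODY, cleared when the block
 * holding the new "." and ".." entries is written, and one tagged
 * MKDIR_PARENT, cleared when the parent inode with its bumped link count
 * is written.  Each completion arrives through its own call (the two
 * pointer names below are hypothetical), and only after both have cleared
 * their bit in da_state does complete_mkdir() mark the diradd DEPCOMPLETE:
 *
 *	handle_written_mkdir(mkdir_body, MKDIR_BODY);
 *	handle_written_mkdir(mkdir_parent, MKDIR_PARENT);
 */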
11388
11389static int
11390free_pagedep(pagedep)
11391	struct pagedep *pagedep;
11392{
11393	int i;
11394
11395	if (pagedep->pd_state & NEWBLOCK)
11396		return (0);
11397	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11398		return (0);
11399	for (i = 0; i < DAHASHSZ; i++)
11400		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11401			return (0);
11402	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11403		return (0);
11404	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11405		return (0);
11406	if (pagedep->pd_state & ONWORKLIST)
11407		WORKLIST_REMOVE(&pagedep->pd_list);
11408	LIST_REMOVE(pagedep, pd_hash);
11409	WORKITEM_FREE(pagedep, D_PAGEDEP);
11410
11411	return (1);
11412}
11413
11414/*
11415 * Called from within softdep_disk_write_complete above.
11416 * A write operation was just completed. Removed inodes can
11417 * now be freed and associated block pointers may be committed.
11418 * Note that this routine is always called from interrupt level
11419 * with further splbio interrupts blocked.
11420 */
11421static int
11422handle_written_filepage(pagedep, bp)
11423	struct pagedep *pagedep;
11424	struct buf *bp;		/* buffer containing the written page */
11425{
11426	struct dirrem *dirrem;
11427	struct diradd *dap, *nextdap;
11428	struct direct *ep;
11429	int i, chgs;
11430
11431	if ((pagedep->pd_state & IOSTARTED) == 0)
11432		panic("handle_written_filepage: not started");
11433	pagedep->pd_state &= ~IOSTARTED;
11434	/*
11435	 * Process any directory removals that have been committed.
11436	 */
11437	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11438		LIST_REMOVE(dirrem, dm_next);
11439		dirrem->dm_state |= COMPLETE;
11440		dirrem->dm_dirinum = pagedep->pd_ino;
11441		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11442		    ("handle_written_filepage: Journal entries not written."));
11443		add_to_worklist(&dirrem->dm_list, 0);
11444	}
11445	/*
11446	 * Free any directory additions that have been committed.
11447	 * If it is a newly allocated block, we have to wait until
11448	 * the on-disk directory inode claims the new block.
11449	 */
11450	if ((pagedep->pd_state & NEWBLOCK) == 0)
11451		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11452			free_diradd(dap, NULL);
11453	/*
11454	 * Uncommitted directory entries must be restored.
11455	 */
11456	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11457		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11458		     dap = nextdap) {
11459			nextdap = LIST_NEXT(dap, da_pdlist);
11460			if (dap->da_state & ATTACHED)
11461				panic("handle_written_filepage: attached");
11462			ep = (struct direct *)
11463			    ((char *)bp->b_data + dap->da_offset);
11464			ep->d_ino = dap->da_newinum;
11465			dap->da_state &= ~UNDONE;
11466			dap->da_state |= ATTACHED;
11467			chgs = 1;
11468			/*
11469			 * If the inode referenced by the directory has
11470			 * been written out, then the dependency can be
11471			 * moved to the pending list.
11472			 */
11473			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11474				LIST_REMOVE(dap, da_pdlist);
11475				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11476				    da_pdlist);
11477			}
11478		}
11479	}
11480	/*
11481	 * If there were any rollbacks in the directory, then it must be
11482	 * marked dirty so that it will eventually get written back in
11483	 * its correct form.
11484	 */
11485	if (chgs) {
11486		if ((bp->b_flags & B_DELWRI) == 0)
11487			stat_dir_entry++;
11488		bdirty(bp);
11489		return (1);
11490	}
11491	/*
11492	 * If we are not waiting for a new directory block to be
11493	 * claimed by its inode, then the pagedep will be freed.
11494	 * Otherwise it will remain to track any new entries on
11495	 * the page in case they are fsync'ed.
11496	 */
11497	free_pagedep(pagedep);
11498	return (0);
11499}
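
/*
 * In outline, the rollback round trip for a single uncommitted entry in
 * the page looks like this (the zeroing on the write side is done by
 * initiate_write_filepage() earlier in this file):
 *
 *	write initiation:  ep->d_ino = 0;                dap gains UNDONE
 *	write completion:  ep->d_ino = dap->da_newinum;  UNDONE -> ATTACHED
 *
 * so the on-disk directory never names an inode whose allocation has not
 * itself reached the disk.
 */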
11500
11501/*
11502 * Writing back in-core inode structures.
11503 *
11504 * The filesystem only accesses an inode's contents when it occupies an
11505 * "in-core" inode structure.  These "in-core" structures are separate from
11506 * the page frames used to cache inode blocks.  Only the latter are
11507 * transferred to/from the disk.  So, when the updated contents of the
11508 * "in-core" inode structure are copied to the corresponding in-memory inode
11509 * block, the dependencies are also transferred.  The following procedure is
11510 * called when copying a dirty "in-core" inode to a cached inode block.
11511 */
11512
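/*
 * A minimal sketch of the caller side, assuming the ffs_update() flow and
 * simplifying its details (UFS2 case shown, error handling trimmed): the
 * inode block is read, this module is notified before the copy is made,
 * the in-core dinode is copied into the buffer, and the buffer is written
 * or delayed-written.
 *
 *	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 *	    (int)fs->fs_bsize, NOCRED, &bp);
 *	if (DOINGSOFTDEP(vp))
 *		softdep_update_inodeblock(ip, bp, waitfor);
 *	*((struct ufs2_dinode *)bp->b_data +
 *	    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 *	if (waitfor)
 *		error = bwrite(bp);
 *	else
 *		bdwrite(bp);
 */
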
11513/*
11514 * Called when an inode is loaded from disk. If the effective link count
11515 * differed from the actual link count when it was last flushed, then we
11516 * need to ensure that the correct effective link count is put back.
11517 */
11518void
11519softdep_load_inodeblock(ip)
11520	struct inode *ip;	/* the "in_core" copy of the inode */
11521{
11522	struct inodedep *inodedep;
11523
11524	/*
11525	 * Check for alternate nlink count.
11526	 */
11527	ip->i_effnlink = ip->i_nlink;
11528	ACQUIRE_LOCK(&lk);
11529	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
11530	    &inodedep) == 0) {
11531		FREE_LOCK(&lk);
11532		return;
11533	}
11534	ip->i_effnlink -= inodedep->id_nlinkdelta;
11535	FREE_LOCK(&lk);
11536}
11537
11538/*
11539 * This routine is called just before the "in-core" inode
11540 * information is to be copied to the in-memory inode block.
11541 * Recall that an inode block contains several inodes. If
11542 * the force flag is set, then the dependencies will be
11543 * cleared so that the update can always be made. Note that
11544 * the buffer is locked when this routine is called, so we
11545 * will never be in the middle of writing the inode block
11546 * to disk.
11547 */
11548void
11549softdep_update_inodeblock(ip, bp, waitfor)
11550	struct inode *ip;	/* the "in_core" copy of the inode */
11551	struct buf *bp;		/* the buffer containing the inode block */
11552	int waitfor;		/* nonzero => update must be allowed */
11553{
11554	struct inodedep *inodedep;
11555	struct inoref *inoref;
11556	struct worklist *wk;
11557	struct mount *mp;
11558	struct buf *ibp;
11559	struct fs *fs;
11560	int error;
11561
11562	mp = UFSTOVFS(ip->i_ump);
11563	fs = ip->i_fs;
11564	/*
11565	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
11566	 * does not have access to the in-core ip so must write directly into
11567	 * the inode block buffer when setting freelink.
11568	 */
11569	if (fs->fs_magic == FS_UFS1_MAGIC)
11570		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
11571		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11572	else
11573		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
11574		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11575	/*
11576	 * If the effective link count is not equal to the actual link
11577	 * count, then we must track the difference in an inodedep while
11578	 * the inode is (potentially) tossed out of the cache. Otherwise,
11579	 * if there is no existing inodedep, then there are no dependencies
11580	 * to track.
11581	 */
11582	ACQUIRE_LOCK(&lk);
11583again:
11584	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11585		FREE_LOCK(&lk);
11586		if (ip->i_effnlink != ip->i_nlink)
11587			panic("softdep_update_inodeblock: bad link count");
11588		return;
11589	}
11590	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
11591		panic("softdep_update_inodeblock: bad delta");
11592	/*
11593	 * If we're flushing all dependencies we must also move any waiting
11594	 * for journal writes onto the bufwait list prior to I/O.
11595	 */
11596	if (waitfor) {
11597		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11598			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11599			    == DEPCOMPLETE) {
11600				jwait(&inoref->if_list, MNT_WAIT);
11601				goto again;
11602			}
11603		}
11604	}
11605	/*
11606	 * Changes have been initiated. Anything depending on these
11607	 * changes cannot occur until this inode has been written.
11608	 */
11609	inodedep->id_state &= ~COMPLETE;
11610	if ((inodedep->id_state & ONWORKLIST) == 0)
11611		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
11612	/*
11613	 * Any new dependencies associated with the incore inode must
11614	 * now be moved to the list associated with the buffer holding
11615	 * the in-memory copy of the inode. Once merged, process any
11616	 * allocdirects that are completed by the merger.
11617	 */
11618	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
11619	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
11620		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
11621		    NULL);
11622	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
11623	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
11624		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
11625		    NULL);
11626	/*
11627	 * Now that the inode has been pushed into the buffer, the
11628	 * operations dependent on the inode being written to disk
11629	 * can be moved to the id_bufwait so that they will be
11630	 * processed when the buffer I/O completes.
11631	 */
11632	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
11633		WORKLIST_REMOVE(wk);
11634		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
11635	}
11636	/*
11637	 * Newly allocated inodes cannot be written until the bitmap
11638	 * that allocates them has been written (indicated by
11639	 * DEPCOMPLETE being set in id_state). If we are doing a
11640	 * forced sync (e.g., an fsync on a file), we force the bitmap
11641	 * to be written so that the update can be done.
11642	 */
11643	if (waitfor == 0) {
11644		FREE_LOCK(&lk);
11645		return;
11646	}
11647retry:
11648	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
11649		FREE_LOCK(&lk);
11650		return;
11651	}
11652	ibp = inodedep->id_bmsafemap->sm_buf;
11653	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
11654	if (ibp == NULL) {
11655		/*
11656		 * If ibp came back as NULL, the dependency could have been
11657		 * freed while we slept.  Look it up again, and check to see
11658		 * that it has completed.
11659		 */
11660		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
11661			goto retry;
11662		FREE_LOCK(&lk);
11663		return;
11664	}
11665	FREE_LOCK(&lk);
11666	if ((error = bwrite(ibp)) != 0)
11667		softdep_error("softdep_update_inodeblock: bwrite", error);
11668}
11669
11670/*
11671 * Merge a new inode dependency list (such as id_newinoupdt) into an
11672 * old inode dependency list (such as id_inoupdt). This routine must be
11673 * called with splbio interrupts blocked.
11674 */
11675static void
11676merge_inode_lists(newlisthead, oldlisthead)
11677	struct allocdirectlst *newlisthead;
11678	struct allocdirectlst *oldlisthead;
11679{
11680	struct allocdirect *listadp, *newadp;
11681
11682	newadp = TAILQ_FIRST(newlisthead);
11683	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
11684		if (listadp->ad_offset < newadp->ad_offset) {
11685			listadp = TAILQ_NEXT(listadp, ad_next);
11686			continue;
11687		}
11688		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11689		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
11690		if (listadp->ad_offset == newadp->ad_offset) {
11691			allocdirect_merge(oldlisthead, newadp,
11692			    listadp);
11693			listadp = newadp;
11694		}
11695		newadp = TAILQ_FIRST(newlisthead);
11696	}
11697	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
11698		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11699		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
11700	}
11701}
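
/*
 * A small worked example of the merge above, assuming allocdirects at
 * offsets 0, 2 and 5 on the old list and at offsets 2' and 7' on the new
 * list:
 *
 *	old: [0] -> [2] -> [5]           new: [2'] -> [7']
 *	2' is inserted before 2 and the pair is collapsed by
 *	allocdirect_merge(), leaving
 *	old: [0] -> [2'] -> [5]          new: [7']
 *	7' has no match, so the trailing loop appends it:
 *	old: [0] -> [2'] -> [5] -> [7']  new: (empty)
 */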
11702
11703/*
11704 * If we are doing an fsync, then we must ensure that any directory
11705 * entries for the inode have been written after the inode gets to disk.
11706 */
11707int
11708softdep_fsync(vp)
11709	struct vnode *vp;	/* the "in_core" copy of the inode */
11710{
11711	struct inodedep *inodedep;
11712	struct pagedep *pagedep;
11713	struct inoref *inoref;
11714	struct worklist *wk;
11715	struct diradd *dap;
11716	struct mount *mp;
11717	struct vnode *pvp;
11718	struct inode *ip;
11719	struct buf *bp;
11720	struct fs *fs;
11721	struct thread *td = curthread;
11722	int error, flushparent, pagedep_new_block;
11723	ino_t parentino;
11724	ufs_lbn_t lbn;
11725
11726	ip = VTOI(vp);
11727	fs = ip->i_fs;
11728	mp = vp->v_mount;
11729	ACQUIRE_LOCK(&lk);
11730restart:
11731	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11732		FREE_LOCK(&lk);
11733		return (0);
11734	}
11735	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11736		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11737		    == DEPCOMPLETE) {
11738			jwait(&inoref->if_list, MNT_WAIT);
11739			goto restart;
11740		}
11741	}
11742	if (!LIST_EMPTY(&inodedep->id_inowait) ||
11743	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
11744	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
11745	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
11746	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
11747		panic("softdep_fsync: pending ops %p", inodedep);
11748	for (error = 0, flushparent = 0; ; ) {
11749		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
11750			break;
11751		if (wk->wk_type != D_DIRADD)
11752			panic("softdep_fsync: Unexpected type %s",
11753			    TYPENAME(wk->wk_type));
11754		dap = WK_DIRADD(wk);
11755		/*
11756		 * Flush our parent if this directory entry has a MKDIR_PARENT
11757		 * dependency or is contained in a newly allocated block.
11758		 */
11759		if (dap->da_state & DIRCHG)
11760			pagedep = dap->da_previous->dm_pagedep;
11761		else
11762			pagedep = dap->da_pagedep;
11763		parentino = pagedep->pd_ino;
11764		lbn = pagedep->pd_lbn;
11765		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
11766			panic("softdep_fsync: dirty");
11767		if ((dap->da_state & MKDIR_PARENT) ||
11768		    (pagedep->pd_state & NEWBLOCK))
11769			flushparent = 1;
11770		else
11771			flushparent = 0;
11772		/*
11773		 * If we are being fsync'ed as part of vgone'ing this vnode,
11774		 * then we will not be able to release and recover the
11775		 * vnode below, so we just have to give up on writing its
11776		 * directory entry out. It will eventually be written, just
11777		 * not now, but then the user was not asking to have it
11778		 * written, so we are not breaking any promises.
11779		 */
11780		if (vp->v_iflag & VI_DOOMED)
11781			break;
11782		/*
11783		 * We prevent deadlock by always fetching inodes from the
11784		 * root, moving down the directory tree. Thus, when fetching
11785		 * our parent directory, we first try to get the lock. If
11786		 * that fails, we must unlock ourselves before requesting
11787		 * the lock on our parent. See the comment in ufs_lookup
11788		 * for details on possible races.
11789		 */
11790		FREE_LOCK(&lk);
11791		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
11792		    FFSV_FORCEINSMQ)) {
11793			error = vfs_busy(mp, MBF_NOWAIT);
11794			if (error != 0) {
11795				vfs_ref(mp);
11796				VOP_UNLOCK(vp, 0);
11797				error = vfs_busy(mp, 0);
11798				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
11799				vfs_rel(mp);
11800				if (error != 0)
11801					return (ENOENT);
11802				if (vp->v_iflag & VI_DOOMED) {
11803					vfs_unbusy(mp);
11804					return (ENOENT);
11805				}
11806			}
11807			VOP_UNLOCK(vp, 0);
11808			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
11809			    &pvp, FFSV_FORCEINSMQ);
11810			vfs_unbusy(mp);
11811			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
11812			if (vp->v_iflag & VI_DOOMED) {
11813				if (error == 0)
11814					vput(pvp);
11815				error = ENOENT;
11816			}
11817			if (error != 0)
11818				return (error);
11819		}
11820		/*
11821		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
11822		 * that are contained in direct blocks will be resolved by
11823		 * doing a ffs_update. Pagedeps contained in indirect blocks
11824		 * may require a complete sync'ing of the directory. So, we
11825		 * try the cheap and fast ffs_update first, and if that fails,
11826		 * then we do the slower ffs_syncvnode of the directory.
11827		 */
11828		if (flushparent) {
11829			int locked;
11830
11831			if ((error = ffs_update(pvp, 1)) != 0) {
11832				vput(pvp);
11833				return (error);
11834			}
11835			ACQUIRE_LOCK(&lk);
11836			locked = 1;
11837			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
11838				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
11839					if (wk->wk_type != D_DIRADD)
11840						panic("softdep_fsync: Unexpected type %s",
11841						      TYPENAME(wk->wk_type));
11842					dap = WK_DIRADD(wk);
11843					if (dap->da_state & DIRCHG)
11844						pagedep = dap->da_previous->dm_pagedep;
11845					else
11846						pagedep = dap->da_pagedep;
11847					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
11848					FREE_LOCK(&lk);
11849					locked = 0;
11850					if (pagedep_new_block && (error =
11851					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
11852						vput(pvp);
11853						return (error);
11854					}
11855				}
11856			}
11857			if (locked)
11858				FREE_LOCK(&lk);
11859		}
11860		/*
11861		 * Flush directory page containing the inode's name.
11862		 */
11863		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
11864		    &bp);
11865		if (error == 0)
11866			error = bwrite(bp);
11867		else
11868			brelse(bp);
11869		vput(pvp);
11870		if (error != 0)
11871			return (error);
11872		ACQUIRE_LOCK(&lk);
11873		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
11874			break;
11875	}
11876	FREE_LOCK(&lk);
11877	return (0);
11878}
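
/*
 * A concrete scenario for the routine above, assuming a freshly created
 * file "d/a" that is then fsync'ed: the diradd for "a" sits on this
 * inode's id_pendinghd, its pagedep identifies the block of directory "d"
 * holding the name, the parent is locked (dropping our own lock first to
 * preserve the root-to-leaf ordering), ffs_update() or ffs_syncvnode()
 * pushes "d"'s inode or block claims as needed, and finally the directory
 * page itself is read and synchronously rewritten so that the name is
 * durable before the fsync returns.
 */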
11879
11880/*
11881 * Flush all the dirty bitmaps associated with the block device
11882 * before flushing the rest of the dirty blocks so as to reduce
11883 * the number of dependencies that will have to be rolled back.
11884 *
11885 * XXX Unused?
11886 */
11887void
11888softdep_fsync_mountdev(vp)
11889	struct vnode *vp;
11890{
11891	struct buf *bp, *nbp;
11892	struct worklist *wk;
11893	struct bufobj *bo;
11894
11895	if (!vn_isdisk(vp, NULL))
11896		panic("softdep_fsync_mountdev: vnode not a disk");
11897	bo = &vp->v_bufobj;
11898restart:
11899	BO_LOCK(bo);
11900	ACQUIRE_LOCK(&lk);
11901	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
11902		/*
11903		 * If it is already scheduled, skip to the next buffer.
11904		 */
11905		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
11906			continue;
11907
11908		if ((bp->b_flags & B_DELWRI) == 0)
11909			panic("softdep_fsync_mountdev: not dirty");
11910		/*
11911		 * We are only interested in bitmaps with outstanding
11912		 * dependencies.
11913		 */
11914		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
11915		    wk->wk_type != D_BMSAFEMAP ||
11916		    (bp->b_vflags & BV_BKGRDINPROG)) {
11917			BUF_UNLOCK(bp);
11918			continue;
11919		}
11920		FREE_LOCK(&lk);
11921		BO_UNLOCK(bo);
11922		bremfree(bp);
11923		(void) bawrite(bp);
11924		goto restart;
11925	}
11926	FREE_LOCK(&lk);
11927	drain_output(vp);
11928	BO_UNLOCK(bo);
11929}
11930
11931/*
11932 * Sync all cylinder groups that were dirty at the time this function is
11933 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
11934 * is used to flush freedep activity that may be holding up writes to an
11935 * indirect block.
11936 */
11937static int
11938sync_cgs(mp, waitfor)
11939	struct mount *mp;
11940	int waitfor;
11941{
11942	struct bmsafemap *bmsafemap;
11943	struct bmsafemap *sentinel;
11944	struct ufsmount *ump;
11945	struct buf *bp;
11946	int error;
11947
11948	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
11949	sentinel->sm_cg = -1;
11950	ump = VFSTOUFS(mp);
11951	error = 0;
11952	ACQUIRE_LOCK(&lk);
11953	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
11954	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
11955	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
11956		/* Skip sentinels and cgs with no work to release. */
11957		if (bmsafemap->sm_cg == -1 ||
11958		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
11959		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
11960			LIST_REMOVE(sentinel, sm_next);
11961			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
11962			continue;
11963		}
11964		/*
11965		 * If we don't get the lock and we're waiting, try again; if
11966		 * not, move on to the next buf and try to sync it.
11967		 */
11968		bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor);
11969		if (bp == NULL && waitfor == MNT_WAIT)
11970			continue;
11971		LIST_REMOVE(sentinel, sm_next);
11972		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
11973		if (bp == NULL)
11974			continue;
11975		FREE_LOCK(&lk);
11976		if (waitfor == MNT_NOWAIT)
11977			bawrite(bp);
11978		else
11979			error = bwrite(bp);
11980		ACQUIRE_LOCK(&lk);
11981		if (error)
11982			break;
11983	}
11984	LIST_REMOVE(sentinel, sm_next);
11985	FREE_LOCK(&lk);
11986	free(sentinel, M_BMSAFEMAP);
11987	return (error);
11988}
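
/*
 * The sentinel idiom above, in outline and under the assumption that any
 * insertions made while the lock is dropped land at the head of the list
 * (as handle_written_bmsafemap() above does with LIST_INSERT_HEAD()):
 *
 *	LIST_INSERT_HEAD(head, sentinel, link);
 *	while ((item = LIST_NEXT(sentinel, link)) != NULL) {
 *		LIST_REMOVE(sentinel, link);
 *		LIST_INSERT_AFTER(item, sentinel, link);
 *		FREE_LOCK(&lk);
 *		... write the item ...
 *		ACQUIRE_LOCK(&lk);
 *	}
 *	LIST_REMOVE(sentinel, link);
 *
 * Anything queued at the head while the lock was dropped sits before the
 * sentinel and is skipped, so the walk terminates even though the dirty
 * cg list keeps changing underneath it.
 */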
11989
11990/*
11991 * This routine is called when we are trying to synchronously flush a
11992 * file. This routine must eliminate any filesystem metadata dependencies
11993 * so that the syncing routine can succeed.
11994 */
11995int
11996softdep_sync_metadata(struct vnode *vp)
11997{
11998	int error;
11999
12000	/*
12001	 * Ensure that any direct block dependencies have been cleared,
12002	 * truncations are started, and inode references are journaled.
12003	 */
12004	ACQUIRE_LOCK(&lk);
12005	/*
12006	 * Write all journal records to prevent rollbacks on devvp.
12007	 */
12008	if (vp->v_type == VCHR)
12009		softdep_flushjournal(vp->v_mount);
12010	error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number);
12011	/*
12012	 * Ensure that all truncates are written so we won't find deps on
12013	 * indirect blocks.
12014	 */
12015	process_truncates(vp);
12016	FREE_LOCK(&lk);
12017
12018	return (error);
12019}
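
/*
 * A minimal sketch of the expected calling pattern, with the details of
 * the fsync path simplified: the vnode's buffers are flushed first, the
 * remaining metadata dependencies are eliminated here, and the inode
 * itself is pushed last.
 *
 *	if (DOINGSOFTDEP(vp))
 *		error = softdep_sync_metadata(vp);
 *	if (error == 0)
 *		error = ffs_update(vp, 1);
 */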
12020
12021/*
12022 * This routine is called when we are attempting to sync a buf with
12023 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12024 * other IO it can but returns EBUSY if the buffer is not yet able to
12025 * be written.  Dependencies which will not cause rollbacks will always
12026 * return 0.
12027 */
12028int
12029softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12030{
12031	struct indirdep *indirdep;
12032	struct pagedep *pagedep;
12033	struct allocindir *aip;
12034	struct newblk *newblk;
12035	struct buf *nbp;
12036	struct worklist *wk;
12037	int i, error;
12038
12039	/*
12040	 * For VCHR we just don't want to force flush any dependencies that
12041	 * will cause rollbacks.
12042	 */
12043	if (vp->v_type == VCHR) {
12044		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12045			return (EBUSY);
12046		return (0);
12047	}
12048	ACQUIRE_LOCK(&lk);
12049	/*
12050	 * As we hold the buffer locked, none of its dependencies
12051	 * will disappear.
12052	 */
12053	error = 0;
12054top:
12055	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12056		switch (wk->wk_type) {
12057
12058		case D_ALLOCDIRECT:
12059		case D_ALLOCINDIR:
12060			newblk = WK_NEWBLK(wk);
12061			if (newblk->nb_jnewblk != NULL) {
12062				if (waitfor == MNT_NOWAIT) {
12063					error = EBUSY;
12064					goto out_unlock;
12065				}
12066				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12067				goto top;
12068			}
12069			if (newblk->nb_state & DEPCOMPLETE ||
12070			    waitfor == MNT_NOWAIT)
12071				continue;
12072			nbp = newblk->nb_bmsafemap->sm_buf;
12073			nbp = getdirtybuf(nbp, &lk, waitfor);
12074			if (nbp == NULL)
12075				goto top;
12076			FREE_LOCK(&lk);
12077			if ((error = bwrite(nbp)) != 0)
12078				goto out;
12079			ACQUIRE_LOCK(&lk);
12080			continue;
12081
12082		case D_INDIRDEP:
12083			indirdep = WK_INDIRDEP(wk);
12084			if (waitfor == MNT_NOWAIT) {
12085				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12086				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12087					error = EBUSY;
12088					goto out_unlock;
12089				}
12090			}
12091			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12092				panic("softdep_sync_buf: truncation pending.");
12093		restart:
12094			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12095				newblk = (struct newblk *)aip;
12096				if (newblk->nb_jnewblk != NULL) {
12097					jwait(&newblk->nb_jnewblk->jn_list,
12098					    waitfor);
12099					goto restart;
12100				}
12101				if (newblk->nb_state & DEPCOMPLETE)
12102					continue;
12103				nbp = newblk->nb_bmsafemap->sm_buf;
12104				nbp = getdirtybuf(nbp, &lk, waitfor);
12105				if (nbp == NULL)
12106					goto restart;
12107				FREE_LOCK(&lk);
12108				if ((error = bwrite(nbp)) != 0)
12109					goto out;
12110				ACQUIRE_LOCK(&lk);
12111				goto restart;
12112			}
12113			continue;
12114
12115		case D_PAGEDEP:
12116			/*
12117			 * Only flush directory entries in synchronous passes.
12118			 */
12119			if (waitfor != MNT_WAIT) {
12120				error = EBUSY;
12121				goto out_unlock;
12122			}
12123			/*
12124			 * While syncing snapshots, we must allow recursive
12125			 * lookups.
12126			 */
12127			BUF_AREC(bp);
12128			/*
12129			 * We are trying to sync a directory that may
12130			 * have dependencies on both its own metadata
12131			 * and/or dependencies on the inodes of any
12132			 * recently allocated files. We walk its diradd
12133			 * lists pushing out the associated inode.
12134			 */
12135			pagedep = WK_PAGEDEP(wk);
12136			for (i = 0; i < DAHASHSZ; i++) {
12137				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12138					continue;
12139				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12140				    &pagedep->pd_diraddhd[i]))) {
12141					BUF_NOREC(bp);
12142					goto out_unlock;
12143				}
12144			}
12145			BUF_NOREC(bp);
12146			continue;
12147
12148		case D_FREEWORK:
12149		case D_FREEDEP:
12150		case D_JSEGDEP:
12151		case D_JNEWBLK:
12152			continue;
12153
12154		default:
12155			panic("softdep_sync_buf: Unknown type %s",
12156			    TYPENAME(wk->wk_type));
12157			/* NOTREACHED */
12158		}
12159	}
12160out_unlock:
12161	FREE_LOCK(&lk);
12162out:
12163	return (error);
12164}
12165
12166/*
12167 * Flush the dependencies associated with an inodedep.
12168 * Called with splbio blocked.
12169 */
12170static int
12171flush_inodedep_deps(vp, mp, ino)
12172	struct vnode *vp;
12173	struct mount *mp;
12174	ino_t ino;
12175{
12176	struct inodedep *inodedep;
12177	struct inoref *inoref;
12178	int error, waitfor;
12179
12180	/*
12181	 * This work is done in two passes. The first pass grabs most
12182	 * of the buffers and begins asynchronously writing them. The
12183	 * only way to wait for these asynchronous writes is to sleep
12184	 * on the filesystem vnode which may stay busy for a long time
12185	 * if the filesystem is active. So, instead, we make a second
12186	 * pass over the dependencies blocking on each write. In the
12187	 * usual case we will be blocking against a write that we
12188	 * initiated, so when it is done the dependency will have been
12189	 * resolved. Thus the second pass is expected to end quickly.
12190	 * We give a brief window at the top of the loop to allow
12191	 * any pending I/O to complete.
12192	 */
12193	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12194		if (error)
12195			return (error);
12196		FREE_LOCK(&lk);
12197		ACQUIRE_LOCK(&lk);
12198restart:
12199		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12200			return (0);
12201		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12202			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12203			    == DEPCOMPLETE) {
12204				jwait(&inoref->if_list, MNT_WAIT);
12205				goto restart;
12206			}
12207		}
12208		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12209		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12210		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12211		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12212			continue;
12213		/*
12214		 * If this was pass 2, we are done; otherwise do pass 2.
12215		 */
12216		if (waitfor == MNT_WAIT)
12217			break;
12218		waitfor = MNT_WAIT;
12219	}
12220	/*
12221	 * Try freeing inodedep in case all dependencies have been removed.
12222	 */
12223	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12224		(void) free_inodedep(inodedep);
12225	return (0);
12226}
12227
12228/*
12229 * Flush an inode dependency list.
12230 * Called with splbio blocked.
12231 */
12232static int
12233flush_deplist(listhead, waitfor, errorp)
12234	struct allocdirectlst *listhead;
12235	int waitfor;
12236	int *errorp;
12237{
12238	struct allocdirect *adp;
12239	struct newblk *newblk;
12240	struct buf *bp;
12241
12242	mtx_assert(&lk, MA_OWNED);
12243	TAILQ_FOREACH(adp, listhead, ad_next) {
12244		newblk = (struct newblk *)adp;
12245		if (newblk->nb_jnewblk != NULL) {
12246			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12247			return (1);
12248		}
12249		if (newblk->nb_state & DEPCOMPLETE)
12250			continue;
12251		bp = newblk->nb_bmsafemap->sm_buf;
12252		bp = getdirtybuf(bp, &lk, waitfor);
12253		if (bp == NULL) {
12254			if (waitfor == MNT_NOWAIT)
12255				continue;
12256			return (1);
12257		}
12258		FREE_LOCK(&lk);
12259		if (waitfor == MNT_NOWAIT)
12260			bawrite(bp);
12261		else
12262			*errorp = bwrite(bp);
12263		ACQUIRE_LOCK(&lk);
12264		return (1);
12265	}
12266	return (0);
12267}
12268
12269/*
12270 * Flush dependencies associated with an allocdirect block.
12271 */
12272static int
12273flush_newblk_dep(vp, mp, lbn)
12274	struct vnode *vp;
12275	struct mount *mp;
12276	ufs_lbn_t lbn;
12277{
12278	struct newblk *newblk;
12279	struct bufobj *bo;
12280	struct inode *ip;
12281	struct buf *bp;
12282	ufs2_daddr_t blkno;
12283	int error;
12284
12285	error = 0;
12286	bo = &vp->v_bufobj;
12287	ip = VTOI(vp);
12288	blkno = DIP(ip, i_db[lbn]);
12289	if (blkno == 0)
12290		panic("flush_newblk_dep: Missing block");
12291	ACQUIRE_LOCK(&lk);
12292	/*
12293	 * Loop until all dependencies related to this block are satisfied.
12294	 * We must be careful to restart after each sleep in case a write
12295	 * completes some part of this process for us.
12296	 */
12297	for (;;) {
12298		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12299			FREE_LOCK(&lk);
12300			break;
12301		}
12302		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12303			panic("flush_newblk_dep: Bad newblk %p", newblk);
12304		/*
12305		 * Flush the journal.
12306		 */
12307		if (newblk->nb_jnewblk != NULL) {
12308			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12309			continue;
12310		}
12311		/*
12312		 * Write the bitmap dependency.
12313		 */
12314		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12315			bp = newblk->nb_bmsafemap->sm_buf;
12316			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12317			if (bp == NULL)
12318				continue;
12319			FREE_LOCK(&lk);
12320			error = bwrite(bp);
12321			if (error)
12322				break;
12323			ACQUIRE_LOCK(&lk);
12324			continue;
12325		}
12326		/*
12327		 * Write the buffer.
12328		 */
12329		FREE_LOCK(&lk);
12330		BO_LOCK(bo);
12331		bp = gbincore(bo, lbn);
12332		if (bp != NULL) {
12333			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12334			    LK_INTERLOCK, BO_MTX(bo));
12335			if (error == ENOLCK) {
12336				ACQUIRE_LOCK(&lk);
12337				continue; /* Slept, retry */
12338			}
12339			if (error != 0)
12340				break;	/* Failed */
12341			if (bp->b_flags & B_DELWRI) {
12342				bremfree(bp);
12343				error = bwrite(bp);
12344				if (error)
12345					break;
12346			} else
12347				BUF_UNLOCK(bp);
12348		} else
12349			BO_UNLOCK(bo);
12350		/*
12351		 * We have to wait for the direct pointers to
12352		 * point at the newdirblk before the dependency
12353		 * will go away.
12354		 */
12355		error = ffs_update(vp, 1);
12356		if (error)
12357			break;
12358		ACQUIRE_LOCK(&lk);
12359	}
12360	return (error);
12361}
12362
12363/*
12364 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12365 * Called with splbio blocked.
12366 */
12367static int
12368flush_pagedep_deps(pvp, mp, diraddhdp)
12369	struct vnode *pvp;
12370	struct mount *mp;
12371	struct diraddhd *diraddhdp;
12372{
12373	struct inodedep *inodedep;
12374	struct inoref *inoref;
12375	struct ufsmount *ump;
12376	struct diradd *dap;
12377	struct vnode *vp;
12378	int error = 0;
12379	struct buf *bp;
12380	ino_t inum;
12381
12382	ump = VFSTOUFS(mp);
12383restart:
12384	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12385		/*
12386		 * Flush ourselves if this directory entry
12387		 * has a MKDIR_PARENT dependency.
12388		 */
12389		if (dap->da_state & MKDIR_PARENT) {
12390			FREE_LOCK(&lk);
12391			if ((error = ffs_update(pvp, 1)) != 0)
12392				break;
12393			ACQUIRE_LOCK(&lk);
12394			/*
12395			 * If that cleared dependencies, go on to next.
12396			 */
12397			if (dap != LIST_FIRST(diraddhdp))
12398				continue;
12399			if (dap->da_state & MKDIR_PARENT)
12400				panic("flush_pagedep_deps: MKDIR_PARENT");
12401		}
12402		/*
12403		 * A newly allocated directory must have its "." and
12404		 * ".." entries written out before its name can be
12405		 * committed in its parent.
12406		 */
12407		inum = dap->da_newinum;
12408		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12409			panic("flush_pagedep_deps: lost inode1");
12410		/*
12411		 * Wait for any pending journal adds to complete so we don't
12412		 * cause rollbacks while syncing.
12413		 */
12414		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12415			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12416			    == DEPCOMPLETE) {
12417				jwait(&inoref->if_list, MNT_WAIT);
12418				goto restart;
12419			}
12420		}
12421		if (dap->da_state & MKDIR_BODY) {
12422			FREE_LOCK(&lk);
12423			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12424			    FFSV_FORCEINSMQ)))
12425				break;
12426			error = flush_newblk_dep(vp, mp, 0);
12427			/*
12428			 * If we still have the dependency we might need to
12429			 * update the vnode to sync the new link count to
12430			 * disk.
12431			 */
12432			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12433				error = ffs_update(vp, 1);
12434			vput(vp);
12435			if (error != 0)
12436				break;
12437			ACQUIRE_LOCK(&lk);
12438			/*
12439			 * If that cleared dependencies, go on to next.
12440			 */
12441			if (dap != LIST_FIRST(diraddhdp))
12442				continue;
12443			if (dap->da_state & MKDIR_BODY) {
12444				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12445				    &inodedep);
12446				panic("flush_pagedep_deps: MKDIR_BODY "
12447				    "inodedep %p dap %p vp %p",
12448				    inodedep, dap, vp);
12449			}
12450		}
12451		/*
12452		 * Flush the inode on which the directory entry depends.
12453		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12454		 * the only remaining dependency is that the updated inode
12455		 * count must get pushed to disk. The inode has already
12456		 * been pushed into its inode buffer (via VOP_UPDATE) at
12457		 * the time of the reference count change. So we need only
12458		 * locate that buffer, ensure that there will be no rollback
12459		 * caused by a bitmap dependency, then write the inode buffer.
12460		 */
12461retry:
12462		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12463			panic("flush_pagedep_deps: lost inode");
12464		/*
12465		 * If the inode still has bitmap dependencies,
12466		 * push them to disk.
12467		 */
12468		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12469			bp = inodedep->id_bmsafemap->sm_buf;
12470			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12471			if (bp == NULL)
12472				goto retry;
12473			FREE_LOCK(&lk);
12474			if ((error = bwrite(bp)) != 0)
12475				break;
12476			ACQUIRE_LOCK(&lk);
12477			if (dap != LIST_FIRST(diraddhdp))
12478				continue;
12479		}
12480		/*
12481		 * If the inode is still sitting in a buffer waiting
12482		 * to be written or waiting for the link count to be
12483	 * adjusted, update it here to flush it to disk.
12484		 */
12485		if (dap == LIST_FIRST(diraddhdp)) {
12486			FREE_LOCK(&lk);
12487			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12488			    FFSV_FORCEINSMQ)))
12489				break;
12490			error = ffs_update(vp, 1);
12491			vput(vp);
12492			if (error)
12493				break;
12494			ACQUIRE_LOCK(&lk);
12495		}
12496		/*
12497		 * If we have failed to get rid of all the dependencies
12498		 * then something is seriously wrong.
12499		 */
12500		if (dap == LIST_FIRST(diraddhdp)) {
12501			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
12502			panic("flush_pagedep_deps: failed to flush "
12503			    "inodedep %p ino %ju dap %p",
12504			    inodedep, (uintmax_t)inum, dap);
12505		}
12506	}
12507	if (error)
12508		ACQUIRE_LOCK(&lk);
12509	return (error);
12510}
12511
12512/*
12513 * A large burst of file addition or deletion activity can drive the
12514 * memory load excessively high. First attempt to slow things down
12515 * using the techniques below. If that fails, this routine requests
12516 * the offending operations to fall back to running synchronously
12517 * until the memory load returns to a reasonable level.
12518 */
12519int
12520softdep_slowdown(vp)
12521	struct vnode *vp;
12522{
12523	struct ufsmount *ump;
12524	int jlow;
12525	int max_softdeps_hard;
12526
12527	ACQUIRE_LOCK(&lk);
12528	jlow = 0;
12529	/*
12530	 * Check for journal space if needed.
12531	 */
12532	if (DOINGSUJ(vp)) {
12533		ump = VFSTOUFS(vp->v_mount);
12534		if (journal_space(ump, 0) == 0)
12535			jlow = 1;
12536	}
12537	max_softdeps_hard = max_softdeps * 11 / 10;
12538	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
12539	    dep_current[D_INODEDEP] < max_softdeps_hard &&
12540	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
12541	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
12542		FREE_LOCK(&lk);
12543		return (0);
12544	}
12545	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
12546		softdep_speedup();
12547	stat_sync_limit_hit += 1;
12548	FREE_LOCK(&lk);
12549	if (DOINGSUJ(vp))
12550		return (0);
12551	return (1);
12552}
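
/*
 * A worked example of the threshold above, assuming a hypothetical
 * max_softdeps of 10000: max_softdeps_hard = 10000 * 11 / 10 = 11000, so a
 * writer is forced synchronous once dirrem dependencies reach 5500, or
 * inodedep or freeblks dependencies reach 11000, or the indirdep count
 * reaches maxindirdeps, or (for SUJ) the journal runs out of space.  SUJ
 * mounts only poke the flusher and still return 0.
 */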
12553
12554/*
12555 * Called by the allocation routines when they are about to fail
12556 * in the hope that we can free up the requested resource (inodes
12557 * or disk space).
12558 *
12559 * First check to see if the work list has anything on it. If it has,
12560 * clean up entries until we successfully free the requested resource.
12561 * Because this process holds inodes locked, we cannot handle any remove
12562 * requests that might block on a locked inode as that could lead to
12563 * deadlock. If the worklist yields none of the requested resource,
12564 * start syncing out vnodes to free up the needed space.
12565 */
12566int
12567softdep_request_cleanup(fs, vp, cred, resource)
12568	struct fs *fs;
12569	struct vnode *vp;
12570	struct ucred *cred;
12571	int resource;
12572{
12573	struct ufsmount *ump;
12574	struct mount *mp;
12575	struct vnode *lvp, *mvp;
12576	long starttime;
12577	ufs2_daddr_t needed;
12578	int error;
12579
12580	/*
12581	 * If we are being called because of a process doing a
12582	 * copy-on-write, then it is not safe to process any
12583	 * worklist items as we will recurse into the copyonwrite
12584	 * routine.  This will result in an incoherent snapshot.
12585	 * If the vnode that we hold is a snapshot, we must avoid
12586	 * handling other resources that could cause deadlock.
12587	 */
12588	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
12589		return (0);
12590
12591	if (resource == FLUSH_BLOCKS_WAIT)
12592		stat_cleanup_blkrequests += 1;
12593	else
12594		stat_cleanup_inorequests += 1;
12595
12596	mp = vp->v_mount;
12597	ump = VFSTOUFS(mp);
12598	mtx_assert(UFS_MTX(ump), MA_OWNED);
12599	UFS_UNLOCK(ump);
12600	error = ffs_update(vp, 1);
12601	if (error != 0) {
12602		UFS_LOCK(ump);
12603		return (0);
12604	}
12605	/*
12606	 * If we are in need of resources, consider pausing for
12607	 * tickdelay to give ourselves some breathing room.
12608	 */
12609	ACQUIRE_LOCK(&lk);
12610	process_removes(vp);
12611	process_truncates(vp);
12612	request_cleanup(UFSTOVFS(ump), resource);
12613	FREE_LOCK(&lk);
12614	/*
12615	 * Now clean up at least as many resources as we will need.
12616	 *
12617	 * When requested to clean up inodes, the number that are needed
12618	 * is set by the number of simultaneous writers (mnt_writeopcount)
12619	 * plus a bit of slop (2) in case some more writers show up while
12620	 * we are cleaning.
12621	 *
12622	 * When requested to free up space, the amount of space that
12623	 * we need is enough blocks to allocate a full-sized segment
12624	 * (fs_contigsumsize). The number of such segments that will
12625	 * be needed is set by the number of simultaneous writers
12626	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
12627	 * writers show up while we are cleaning.
12628	 *
12629	 * Additionally, if we are unprivileged and allocating space,
12630	 * we need to ensure that we clean up enough blocks to get the
12631	 * needed number of blocks over the threshold of the minimum
12632	 * number of blocks required to be kept free by the filesystem
12633	 * (fs_minfree).
12634	 */
12635	if (resource == FLUSH_INODES_WAIT) {
12636		needed = vp->v_mount->mnt_writeopcount + 2;
12637	} else if (resource == FLUSH_BLOCKS_WAIT) {
12638		needed = (vp->v_mount->mnt_writeopcount + 2) *
12639		    fs->fs_contigsumsize;
12640		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
12641			needed += fragstoblks(fs,
12642			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
12643			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
12644	} else {
12645		UFS_LOCK(ump);
12646		printf("softdep_request_cleanup: Unknown resource type %d\n",
12647		    resource);
12648		return (0);
12649	}
12650	starttime = time_second;
12651retry:
12652	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
12653	    fs->fs_cstotal.cs_nbfree <= needed) ||
12654	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12655	    fs->fs_cstotal.cs_nifree <= needed)) {
12656		ACQUIRE_LOCK(&lk);
12657		if (ump->softdep_on_worklist > 0 &&
12658		    process_worklist_item(UFSTOVFS(ump),
12659		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
12660			stat_worklist_push += 1;
12661		FREE_LOCK(&lk);
12662	}
12663	/*
12664	 * If we still need resources and there are no more worklist
12665	 * entries to process to obtain them, we have to start flushing
12666	 * the dirty vnodes to force the release of additional requests
12667	 * to the worklist that we can then process to reap additional
12668	 * resources. We walk the vnodes associated with the mount point
12669	 * until we get the needed worklist requests that we can reap.
12670	 */
12671	if ((resource == FLUSH_BLOCKS_WAIT &&
12672	     fs->fs_cstotal.cs_nbfree <= needed) ||
12673	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12674	     fs->fs_cstotal.cs_nifree <= needed)) {
12675		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
12676			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
12677				VI_UNLOCK(lvp);
12678				continue;
12679			}
12680			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
12681			    curthread))
12682				continue;
12683			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
12684				vput(lvp);
12685				continue;
12686			}
12687			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
12688			vput(lvp);
12689		}
12690		lvp = ump->um_devvp;
12691		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
12692			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
12693			VOP_UNLOCK(lvp, 0);
12694		}
12695		if (ump->softdep_on_worklist > 0) {
12696			stat_cleanup_retries += 1;
12697			goto retry;
12698		}
12699		stat_cleanup_failures += 1;
12700	}
12701	if (time_second - starttime > stat_cleanup_high_delay)
12702		stat_cleanup_high_delay = time_second - starttime;
12703	UFS_LOCK(ump);
12704	return (1);
12705}
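
/*
 * A worked example of the "needed" computation above, using assumed
 * numbers: with two concurrent writers (mnt_writeopcount == 2) and
 * fs_contigsumsize == 16, FLUSH_BLOCKS_WAIT asks for (2 + 2) * 16 = 64
 * blocks, plus, for an unprivileged caller, however many blocks the
 * filesystem currently sits below its fs_minfree reserve.  The same writer
 * count makes FLUSH_INODES_WAIT ask for 2 + 2 = 4 inodes.
 */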
12706
12707/*
12708 * If memory utilization has gotten too high, deliberately slow things
12709 * down and speed up the I/O processing.
12710 */
12711extern struct thread *syncertd;
12712static int
12713request_cleanup(mp, resource)
12714	struct mount *mp;
12715	int resource;
12716{
12717	struct thread *td = curthread;
12718	struct ufsmount *ump;
12719
12720	mtx_assert(&lk, MA_OWNED);
12721	/*
12722	 * We never hold up the filesystem syncer or buf daemon.
12723	 */
12724	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
12725		return (0);
12726	ump = VFSTOUFS(mp);
12727	/*
12728	 * First check to see if the work list has gotten backlogged.
12729	 * If it has, co-opt this process to help clean up two entries.
12730	 * Because this process may hold inodes locked, we cannot
12731	 * handle any remove requests that might block on a locked
12732	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
12733	 * to avoid recursively processing the worklist.
12734	 */
12735	if (ump->softdep_on_worklist > max_softdeps / 10) {
12736		td->td_pflags |= TDP_SOFTDEP;
12737		process_worklist_item(mp, 2, LK_NOWAIT);
12738		td->td_pflags &= ~TDP_SOFTDEP;
12739		stat_worklist_push += 2;
12740		return(1);
12741	}
12742	/*
12743	 * Next, we attempt to speed up the syncer process. If that
12744	 * is successful, then we allow the process to continue.
12745	 */
12746	if (softdep_speedup() &&
12747	    resource != FLUSH_BLOCKS_WAIT &&
12748	    resource != FLUSH_INODES_WAIT)
12749		return(0);
12750	/*
12751	 * If we are resource constrained on inode dependencies, try
12752	 * flushing some dirty inodes. Otherwise, we are constrained
12753	 * by file deletions, so try accelerating flushes of directories
12754	 * with removal dependencies. We would like to do the cleanup
12755	 * here, but we probably hold an inode locked at this point and
12756	 * that might deadlock against one that we try to clean. So,
12757	 * the best that we can do is request the syncer daemon to do
12758	 * the cleanup for us.
12759	 */
12760	switch (resource) {
12761
12762	case FLUSH_INODES:
12763	case FLUSH_INODES_WAIT:
12764		stat_ino_limit_push += 1;
12765		req_clear_inodedeps += 1;
12766		stat_countp = &stat_ino_limit_hit;
12767		break;
12768
12769	case FLUSH_BLOCKS:
12770	case FLUSH_BLOCKS_WAIT:
12771		stat_blk_limit_push += 1;
12772		req_clear_remove += 1;
12773		stat_countp = &stat_blk_limit_hit;
12774		break;
12775
12776	default:
12777		panic("request_cleanup: unknown type");
12778	}
12779	/*
12780	 * Hopefully the syncer daemon will catch up and awaken us.
12781	 * We wait at most tickdelay before proceeding in any case.
12782	 */
12783	proc_waiting += 1;
12784	if (callout_pending(&softdep_callout) == FALSE)
12785		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
12786		    pause_timer, 0);
12787
12788	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
12789	proc_waiting -= 1;
12790	return (1);
12791}
12792
12793/*
12794 * Awaken processes pausing in request_cleanup and clear proc_waiting
12795 * to indicate that there is no longer a timer running.
12796 */
12797static void
12798pause_timer(arg)
12799	void *arg;
12800{
12801
12802	/*
12803	 * The callout_ API has acquired mtx and will hold it around this
12804	 * function call.
12805	 */
12806	*stat_countp += 1;
12807	wakeup_one(&proc_waiting);
12808	if (proc_waiting > 0)
12809		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
12810		    pause_timer, 0);
12811}
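
/*
 * A worked example of the pause above, assuming the default tickdelay of 2
 * ticks and hz == 1000: a throttled process sleeps on proc_waiting, the
 * callout fires after max(tickdelay, 2) = 2 ticks (about 2 ms), bumps the
 * stat counter selected in request_cleanup(), and wakes exactly one
 * waiter; if other processes are still paused, the callout re-arms itself
 * for another 2 ticks.
 */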
12812
12813/*
12814 * Flush out a directory with at least one removal dependency in an effort to
12815 * reduce the number of dirrem, freefile, and freeblks dependency structures.
12816 */
12817static void
12818clear_remove(void)
12819{
12820	struct pagedep_hashhead *pagedephd;
12821	struct pagedep *pagedep;
12822	static int next = 0;
12823	struct mount *mp;
12824	struct vnode *vp;
12825	struct bufobj *bo;
12826	int error, cnt;
12827	ino_t ino;
12828
12829	mtx_assert(&lk, MA_OWNED);
12830
12831	for (cnt = 0; cnt < pagedep_hash; cnt++) {
12832		pagedephd = &pagedep_hashtbl[next++];
12833		if (next >= pagedep_hash)
12834			next = 0;
12835		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
12836			if (LIST_EMPTY(&pagedep->pd_dirremhd))
12837				continue;
12838			mp = pagedep->pd_list.wk_mp;
12839			ino = pagedep->pd_ino;
12840			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
12841				continue;
12842			FREE_LOCK(&lk);
12843
12844			/*
12845			 * Let unmount clear deps
12846			 */
12847			error = vfs_busy(mp, MBF_NOWAIT);
12848			if (error != 0)
12849				goto finish_write;
12850			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
12851			     FFSV_FORCEINSMQ);
12852			vfs_unbusy(mp);
12853			if (error != 0) {
12854				softdep_error("clear_remove: vget", error);
12855				goto finish_write;
12856			}
12857			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
12858				softdep_error("clear_remove: fsync", error);
12859			bo = &vp->v_bufobj;
12860			BO_LOCK(bo);
12861			drain_output(vp);
12862			BO_UNLOCK(bo);
12863			vput(vp);
12864		finish_write:
12865			vn_finished_write(mp);
12866			ACQUIRE_LOCK(&lk);
12867			return;
12868		}
12869	}
12870}
12871
12872/*
12873 * Clear out a block of dirty inodes in an effort to reduce
12874 * the number of inodedep dependency structures.
12875 */
12876static void
12877clear_inodedeps(void)
12878{
12879	struct inodedep_hashhead *inodedephd;
12880	struct inodedep *inodedep;
12881	static int next = 0;
12882	struct mount *mp;
12883	struct vnode *vp;
12884	struct fs *fs;
12885	int error, cnt;
12886	ino_t firstino, lastino, ino;
12887
12888	mtx_assert(&lk, MA_OWNED);
12889	/*
12890	 * Pick a random inode dependency to be cleared.
12891	 * We will then gather up all the inodes in its block
12892	 * that have dependencies and flush them out.
12893	 */
12894	for (cnt = 0; cnt < inodedep_hash; cnt++) {
12895		inodedephd = &inodedep_hashtbl[next++];
12896		if (next >= inodedep_hash)
12897			next = 0;
12898		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
12899			break;
12900	}
12901	if (inodedep == NULL)
12902		return;
12903	fs = inodedep->id_fs;
12904	mp = inodedep->id_list.wk_mp;
12905	/*
12906	 * Find the last inode in the block with dependencies.
12907	 */
12908	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
12909	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
12910		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
12911			break;
12912	/*
12913	 * Asynchronously push all but the last inode with dependencies.
12914	 * Synchronously push the last inode with dependencies to ensure
12915	 * that the inode block gets written to free up the inodedeps.
12916	 */
12917	for (ino = firstino; ino <= lastino; ino++) {
12918		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12919			continue;
12920		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
12921			continue;
12922		FREE_LOCK(&lk);
12923		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
12924		if (error != 0) {
12925			vn_finished_write(mp);
12926			ACQUIRE_LOCK(&lk);
12927			return;
12928		}
12929		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
12930		    FFSV_FORCEINSMQ)) != 0) {
12931			softdep_error("clear_inodedeps: vget", error);
12932			vfs_unbusy(mp);
12933			vn_finished_write(mp);
12934			ACQUIRE_LOCK(&lk);
12935			return;
12936		}
12937		vfs_unbusy(mp);
12938		if (ino == lastino) {
12939			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
12940				softdep_error("clear_inodedeps: fsync1", error);
12941		} else {
12942			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
12943				softdep_error("clear_inodedeps: fsync2", error);
12944			BO_LOCK(&vp->v_bufobj);
12945			drain_output(vp);
12946			BO_UNLOCK(&vp->v_bufobj);
12947		}
12948		vput(vp);
12949		vn_finished_write(mp);
12950		ACQUIRE_LOCK(&lk);
12951	}
12952}
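
/*
 * A worked example of the rounding above, assuming INOPB(fs) == 64 and a
 * starting inodedep with id_ino == 1000: firstino = 1000 & ~63 = 960, the
 * candidate range is inodes 960..1023, lastino walks down from 1023 to the
 * highest inode in that block that still has dependencies, and every inode
 * with dependencies from firstino up to lastino is then flushed,
 * asynchronously except for the final one.
 */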
12953
12954void
12955softdep_buf_append(bp, wkhd)
12956	struct buf *bp;
12957	struct workhead *wkhd;
12958{
12959	struct worklist *wk;
12960
12961	ACQUIRE_LOCK(&lk);
12962	while ((wk = LIST_FIRST(wkhd)) != NULL) {
12963		WORKLIST_REMOVE(wk);
12964		WORKLIST_INSERT(&bp->b_dep, wk);
12965	}
12966	FREE_LOCK(&lk);
12968}
12969
12970void
12971softdep_inode_append(ip, cred, wkhd)
12972	struct inode *ip;
12973	struct ucred *cred;
12974	struct workhead *wkhd;
12975{
12976	struct buf *bp;
12977	struct fs *fs;
12978	int error;
12979
12980	fs = ip->i_fs;
12981	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
12982	    (int)fs->fs_bsize, cred, &bp);
12983	if (error) {
12984		softdep_freework(wkhd);
12985		return;
12986	}
12987	softdep_buf_append(bp, wkhd);
12988	bqrelse(bp);
12989}
12990
12991void
12992softdep_freework(wkhd)
12993	struct workhead *wkhd;
12994{
12995
12996	ACQUIRE_LOCK(&lk);
12997	handle_jwork(wkhd);
12998	FREE_LOCK(&lk);
12999}
13000
13001/*
13002 * Function to determine if the buffer has outstanding dependencies
13003 * that will cause a roll-back if the buffer is written. If wantcount
13004 * is set, return number of dependencies, otherwise just yes or no.
13005 */
13006static int
13007softdep_count_dependencies(bp, wantcount)
13008	struct buf *bp;
13009	int wantcount;
13010{
13011	struct worklist *wk;
13012	struct bmsafemap *bmsafemap;
13013	struct freework *freework;
13014	struct inodedep *inodedep;
13015	struct indirdep *indirdep;
13016	struct freeblks *freeblks;
13017	struct allocindir *aip;
13018	struct pagedep *pagedep;
13019	struct dirrem *dirrem;
13020	struct newblk *newblk;
13021	struct mkdir *mkdir;
13022	struct diradd *dap;
13023	int i, retval;
13024
13025	retval = 0;
13026	ACQUIRE_LOCK(&lk);
13027	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13028		switch (wk->wk_type) {
13029
13030		case D_INODEDEP:
13031			inodedep = WK_INODEDEP(wk);
13032			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13033				/* bitmap allocation dependency */
13034				retval += 1;
13035				if (!wantcount)
13036					goto out;
13037			}
13038			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13039				/* direct block pointer dependency */
13040				retval += 1;
13041				if (!wantcount)
13042					goto out;
13043			}
13044			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13045				/* direct block pointer dependency */
13046				retval += 1;
13047				if (!wantcount)
13048					goto out;
13049			}
13050			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13051				/* Add reference dependency. */
13052				retval += 1;
13053				if (!wantcount)
13054					goto out;
13055			}
13056			continue;
13057
13058		case D_INDIRDEP:
13059			indirdep = WK_INDIRDEP(wk);
13060
13061			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13062				/* indirect truncation dependency */
13063				retval += 1;
13064				if (!wantcount)
13065					goto out;
13066			}
13067
13068			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13069				/* indirect block pointer dependency */
13070				retval += 1;
13071				if (!wantcount)
13072					goto out;
13073			}
13074			continue;
13075
13076		case D_PAGEDEP:
13077			pagedep = WK_PAGEDEP(wk);
13078			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13079				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13080					/* Journal remove ref dependency. */
13081					retval += 1;
13082					if (!wantcount)
13083						goto out;
13084				}
13085			}
13086			for (i = 0; i < DAHASHSZ; i++) {
13087
13088				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13089					/* directory entry dependency */
13090					retval += 1;
13091					if (!wantcount)
13092						goto out;
13093				}
13094			}
13095			continue;
13096
13097		case D_BMSAFEMAP:
13098			bmsafemap = WK_BMSAFEMAP(wk);
13099			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13100				/* Add reference dependency. */
13101				retval += 1;
13102				if (!wantcount)
13103					goto out;
13104			}
13105			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13106				/* Allocate block dependency. */
13107				retval += 1;
13108				if (!wantcount)
13109					goto out;
13110			}
13111			continue;
13112
13113		case D_FREEBLKS:
13114			freeblks = WK_FREEBLKS(wk);
13115			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13116				/* Freeblk journal dependency. */
13117				retval += 1;
13118				if (!wantcount)
13119					goto out;
13120			}
13121			continue;
13122
13123		case D_ALLOCDIRECT:
13124		case D_ALLOCINDIR:
13125			newblk = WK_NEWBLK(wk);
13126			if (newblk->nb_jnewblk) {
13127				/* Journal allocate dependency. */
13128				retval += 1;
13129				if (!wantcount)
13130					goto out;
13131			}
13132			continue;
13133
13134		case D_MKDIR:
13135			mkdir = WK_MKDIR(wk);
13136			if (mkdir->md_jaddref) {
13137				/* Journal reference dependency. */
13138				retval += 1;
13139				if (!wantcount)
13140					goto out;
13141			}
13142			continue;
13143
13144		case D_FREEWORK:
13145		case D_FREEDEP:
13146		case D_JSEGDEP:
13147		case D_JSEG:
13148		case D_SBDEP:
13149			/* never a dependency on these blocks */
13150			continue;
13151
13152		default:
13153			panic("softdep_count_dependencies: Unexpected type %s",
13154			    TYPENAME(wk->wk_type));
13155			/* NOTREACHED */
13156		}
13157	}
13158out:
13159	FREE_LOCK(&lk);
13160	return retval;
13161}
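/*
 * Illustrative sketch (not part of the compiled code): this routine is not
 * called directly by file system consumers.  softdep_initialize() publishes
 * it through the bioops vector as io_countdeps, and generic code reaches it
 * via the buf_countdeps() inline from <sys/buf.h>, roughly:
 *
 *	if (LIST_EMPTY(&bp->b_dep) || buf_countdeps(bp, 0) == 0)
 *		vfs_bio_awrite(bp);	(no roll-back; safe to write now)
 *	else
 *		(defer the buffer until its dependencies are flushed)
 *
 * Passing wantcount == 0, as above, lets the routine return as soon as the
 * first outstanding dependency is found rather than counting them all.
 */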
13162
13163/*
13164 * Acquire exclusive access to a buffer.
13165 * Must be called with the mtx parameter locked.
13166 * Return acquired buffer or NULL on failure.
13167 */
13168static struct buf *
13169getdirtybuf(bp, mtx, waitfor)
13170	struct buf *bp;
13171	struct mtx *mtx;
13172	int waitfor;
13173{
13174	int error;
13175
13176	mtx_assert(mtx, MA_OWNED);
13177	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13178		if (waitfor != MNT_WAIT)
13179			return (NULL);
13180		error = BUF_LOCK(bp,
13181		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
13182		/*
13183		 * Even if we successfully acquire bp here, we have dropped
13184		 * mtx, which may violate our guarantee.
13185		 */
13186		if (error == 0)
13187			BUF_UNLOCK(bp);
13188		else if (error != ENOLCK)
13189			panic("getdirtybuf: inconsistent lock: %d", error);
13190		mtx_lock(mtx);
13191		return (NULL);
13192	}
13193	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13194		if (mtx == &lk && waitfor == MNT_WAIT) {
13195			mtx_unlock(mtx);
13196			BO_LOCK(bp->b_bufobj);
13197			BUF_UNLOCK(bp);
13198			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13199				bp->b_vflags |= BV_BKGRDWAIT;
13200				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
13201				       PRIBIO | PDROP, "getbuf", 0);
13202			} else
13203				BO_UNLOCK(bp->b_bufobj);
13204			mtx_lock(mtx);
13205			return (NULL);
13206		}
13207		BUF_UNLOCK(bp);
13208		if (waitfor != MNT_WAIT)
13209			return (NULL);
13210		/*
13211		 * The mtx argument must be bp->b_vp's mutex in
13212		 * this case.
13213		 */
13214#ifdef	DEBUG_VFS_LOCKS
13215		if (bp->b_vp->v_type != VCHR)
13216			ASSERT_BO_LOCKED(bp->b_bufobj);
13217#endif
13218		bp->b_vflags |= BV_BKGRDWAIT;
13219		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
13220		return (NULL);
13221	}
13222	if ((bp->b_flags & B_DELWRI) == 0) {
13223		BUF_UNLOCK(bp);
13224		return (NULL);
13225	}
13226	bremfree(bp);
13227	return (bp);
13228}
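/*
 * Usage sketch (an outline, with placeholder names): callers in this file
 * typically hold the soft dependency lock, locate a buffer that still
 * carries an unflushed dependency, and hand it to getdirtybuf() before
 * writing it.  A representative pattern looks roughly like:
 *
 *	ACQUIRE_LOCK(&lk);
 *	bp = getdirtybuf(somedep_buf, &lk, waitfor);
 *	if (bp != NULL) {
 *		FREE_LOCK(&lk);
 *		if (waitfor == MNT_NOWAIT)
 *			bawrite(bp);
 *		else
 *			(void) bwrite(bp);	(error handling elided)
 *		ACQUIRE_LOCK(&lk);
 *	}
 *	FREE_LOCK(&lk);
 *
 * Here "somedep_buf" stands for whatever buffer the dependency points at.
 * A NULL return means the buffer could not be obtained without violating
 * the caller's locking guarantee, or is no longer dirty, and the caller
 * must re-examine its dependency lists.
 */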
13229
13230
13231/*
13232 * Check if it is safe to suspend the file system now.  On entry,
13233 * the vnode interlock for devvp should be held.  Return 0 with
13234 * the mount interlock held if the file system can be suspended now,
13235 * otherwise return EAGAIN with the mount interlock held.
13236 */
13237int
13238softdep_check_suspend(struct mount *mp,
13239		      struct vnode *devvp,
13240		      int softdep_deps,
13241		      int softdep_accdeps,
13242		      int secondary_writes,
13243		      int secondary_accwrites)
13244{
13245	struct bufobj *bo;
13246	struct ufsmount *ump;
13247	int error;
13248
13249	ump = VFSTOUFS(mp);
13250	bo = &devvp->v_bufobj;
13251	ASSERT_BO_LOCKED(bo);
13252
13253	for (;;) {
13254		if (!TRY_ACQUIRE_LOCK(&lk)) {
13255			BO_UNLOCK(bo);
13256			ACQUIRE_LOCK(&lk);
13257			FREE_LOCK(&lk);
13258			BO_LOCK(bo);
13259			continue;
13260		}
13261		MNT_ILOCK(mp);
13262		if (mp->mnt_secondary_writes != 0) {
13263			FREE_LOCK(&lk);
13264			BO_UNLOCK(bo);
13265			msleep(&mp->mnt_secondary_writes,
13266			       MNT_MTX(mp),
13267			       (PUSER - 1) | PDROP, "secwr", 0);
13268			BO_LOCK(bo);
13269			continue;
13270		}
13271		break;
13272	}
13273
13274	/*
13275	 * Reasons for needing more work before suspend:
13276	 * - Dirty buffers on devvp.
13277	 * - Softdep activity occurred after start of vnode sync loop.
13278	 * - Secondary writes occurred after start of vnode sync loop.
13279	 */
13280	error = 0;
13281	if (bo->bo_numoutput > 0 ||
13282	    bo->bo_dirty.bv_cnt > 0 ||
13283	    softdep_deps != 0 ||
13284	    ump->softdep_deps != 0 ||
13285	    softdep_accdeps != ump->softdep_accdeps ||
13286	    secondary_writes != 0 ||
13287	    mp->mnt_secondary_writes != 0 ||
13288	    secondary_accwrites != mp->mnt_secondary_accwrites)
13289		error = EAGAIN;
13290	FREE_LOCK(&lk);
13291	BO_UNLOCK(bo);
13292	return (error);
13293}
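/*
 * The retry loop above is a standard lock-order dance: with the bufobj lock
 * held we may only try for the soft dependency lock; if the attempt fails
 * we back out, block until the contended lock is free, and start over.  In
 * generic mutex terms the same technique looks roughly like this (a sketch;
 * "outer" and "inner" are placeholder names):
 *
 *	mtx_lock(&outer);
 *	while (!mtx_trylock(&inner)) {
 *		mtx_unlock(&outer);
 *		mtx_lock(&inner);	(wait until it can be had)
 *		mtx_unlock(&inner);
 *		mtx_lock(&outer);
 *	}
 *	(both locks are now held, acquired in a deadlock-free order)
 */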
13294
13295
13296/*
13297 * Get the number of dependency structures for the file system, both
13298 * the current number and the total number allocated.  These will
13299 * later be used to detect that softdep processing has occurred.
13300 */
13301void
13302softdep_get_depcounts(struct mount *mp,
13303		      int *softdep_depsp,
13304		      int *softdep_accdepsp)
13305{
13306	struct ufsmount *ump;
13307
13308	ump = VFSTOUFS(mp);
13309	ACQUIRE_LOCK(&lk);
13310	*softdep_depsp = ump->softdep_deps;
13311	*softdep_accdepsp = ump->softdep_accdeps;
13312	FREE_LOCK(&lk);
13313}
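/*
 * Sketch of how the suspend handshake fits together (the actual caller
 * lives in ffs_vfsops.c and may differ in detail from this outline): the
 * file system sync routine samples the dependency counters, flushes the
 * dirty vnodes, and then asks softdep_check_suspend() whether anything
 * changed underneath it, retrying until the answer is no:
 *
 *	for (;;) {
 *		softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
 *		(record secondary write counters, flush all dirty vnodes)
 *		BO_LOCK(&devvp->v_bufobj);
 *		if (softdep_check_suspend(mp, devvp, softdep_deps,
 *		    softdep_accdeps, secondary_writes,
 *		    secondary_accwrites) == 0)
 *			break;		(mount interlock is held on return)
 *		MNT_IUNLOCK(mp);	(EAGAIN: work appeared; try again)
 *	}
 *
 * A zero return leaves the mount interlock held, so the caller can mark
 * the file system suspended without a window for new writes to sneak in.
 */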
13314
13315/*
13316 * Wait for pending output on a vnode to complete.
13317 * Must be called with vnode lock and interlock locked.
13318 *
13319 * XXX: Should just be a call to bufobj_wwait().
13320 */
13321static void
13322drain_output(vp)
13323	struct vnode *vp;
13324{
13325	struct bufobj *bo;
13326
13327	bo = &vp->v_bufobj;
13328	ASSERT_VOP_LOCKED(vp, "drain_output");
13329	ASSERT_BO_LOCKED(bo);
13330
13331	while (bo->bo_numoutput) {
13332		bo->bo_flag |= BO_WWAIT;
13333		msleep((caddr_t)&bo->bo_numoutput,
13334		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
13335	}
13336}
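/*
 * As the XXX above notes, the loop duplicates bufobj_wwait().  With the
 * bufobj lock already held, the whole routine is roughly equivalent to:
 *
 *	(void) bufobj_wwait(&vp->v_bufobj, 0, 0);
 *
 * where the zero slpflag and timeout arguments give the same untimed,
 * non-interruptible sleep performed here.
 */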
13337
13338/*
13339 * Called whenever a buffer that is being invalidated or reallocated
13340 * contains dependencies. This should only happen if an I/O error has
13341 * occurred. The routine is called with the buffer locked.
13342 */
13343static void
13344softdep_deallocate_dependencies(bp)
13345	struct buf *bp;
13346{
13347
13348	if ((bp->b_ioflags & BIO_ERROR) == 0)
13349		panic("softdep_deallocate_dependencies: dangling deps");
13350	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
13351	panic("softdep_deallocate_dependencies: unrecovered I/O error");
13352}
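/*
 * Note: like softdep_count_dependencies(), this handler is not called
 * directly.  softdep_initialize() installs it as bioops.io_deallocate, and
 * the buffer cache reaches it through the buf_deallocate() inline when it
 * is about to discard a buffer that still has work on its b_dep list,
 * roughly of the form:
 *
 *	if (!LIST_EMPTY(&bp->b_dep))
 *		buf_deallocate(bp);	(invalidation path, for illustration)
 *
 * which is only legitimate after an unrecoverable I/O error, hence the
 * panics above.
 */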
13353
13354/*
13355 * Function to handle asynchronous write errors in the filesystem.
13356 */
13357static void
13358softdep_error(func, error)
13359	char *func;
13360	int error;
13361{
13362
13363	/* XXX should do something better! */
13364	printf("%s: got error %d while accessing filesystem\n", func, error);
13365}
13366
13367#ifdef DDB
13368
13369static void
13370inodedep_print(struct inodedep *inodedep, int verbose)
13371{
13372	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
13373	    " saveino %p\n",
13374	    inodedep, inodedep->id_fs, inodedep->id_state,
13375	    (intmax_t)inodedep->id_ino,
13376	    (intmax_t)fsbtodb(inodedep->id_fs,
13377	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
13378	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
13379	    inodedep->id_savedino1);
13380
13381	if (verbose == 0)
13382		return;
13383
13384	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
13385	    "mkdiradd %p\n",
13386	    LIST_FIRST(&inodedep->id_pendinghd),
13387	    LIST_FIRST(&inodedep->id_bufwait),
13388	    LIST_FIRST(&inodedep->id_inowait),
13389	    TAILQ_FIRST(&inodedep->id_inoreflst),
13390	    inodedep->id_mkdiradd);
13391	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
13392	    TAILQ_FIRST(&inodedep->id_inoupdt),
13393	    TAILQ_FIRST(&inodedep->id_newinoupdt),
13394	    TAILQ_FIRST(&inodedep->id_extupdt),
13395	    TAILQ_FIRST(&inodedep->id_newextupdt));
13396}
13397
13398DB_SHOW_COMMAND(inodedep, db_show_inodedep)
13399{
13400
13401	if (have_addr == 0) {
13402		db_printf("Address required\n");
13403		return;
13404	}
13405	inodedep_print((struct inodedep *)addr, 1);
13406}
13407
13408DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
13409{
13410	struct inodedep_hashhead *inodedephd;
13411	struct inodedep *inodedep;
13412	struct fs *fs;
13413	int cnt;
13414
13415	fs = have_addr ? (struct fs *)addr : NULL;
13416	for (cnt = 0; cnt <= inodedep_hash; cnt++) {
13417		inodedephd = &inodedep_hashtbl[cnt];
13418		LIST_FOREACH(inodedep, inodedephd, id_hash) {
13419			if (fs != NULL && fs != inodedep->id_fs)
13420				continue;
13421			inodedep_print(inodedep, 0);
13422		}
13423	}
13424}
13425
13426DB_SHOW_COMMAND(worklist, db_show_worklist)
13427{
13428	struct worklist *wk;
13429
13430	if (have_addr == 0) {
13431		db_printf("Address required\n");
13432		return;
13433	}
13434	wk = (struct worklist *)addr;
13435	db_printf("worklist: %p type %s state 0x%X\n",
13436	    wk, TYPENAME(wk->wk_type), wk->wk_state);
13437}
13438
13439DB_SHOW_COMMAND(workhead, db_show_workhead)
13440{
13441	struct workhead *wkhd;
13442	struct worklist *wk;
13443	int i;
13444
13445	if (have_addr == 0) {
13446		db_printf("Address required\n");
13447		return;
13448	}
13449	wkhd = (struct workhead *)addr;
13450	wk = LIST_FIRST(wkhd);
13451	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
13452		db_printf("worklist: %p type %s state 0x%X\n",
13453		    wk, TYPENAME(wk->wk_type), wk->wk_state);
13454	if (i == 100)
13455		db_printf("workhead overflow");
13456	db_printf("\n");
13457}
13458
13459
13460DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
13461{
13462	struct jaddref *jaddref;
13463	struct diradd *diradd;
13464	struct mkdir *mkdir;
13465
13466	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
13467		diradd = mkdir->md_diradd;
13468		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
13469		    mkdir, mkdir->md_state, diradd, diradd->da_state);
13470		if ((jaddref = mkdir->md_jaddref) != NULL)
13471			db_printf(" jaddref %p jaddref state 0x%X",
13472			    jaddref, jaddref->ja_state);
13473		db_printf("\n");
13474	}
13475}
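/*
 * The DB_SHOW_COMMAND() definitions above register the following commands
 * with the kernel debugger; from the db> prompt they are invoked as, for
 * example:
 *
 *	show inodedep <inodedep address>
 *	show inodedeps [<struct fs address>]
 *	show worklist <worklist address>
 *	show workhead <workhead address>
 *	show mkdirs
 *
 * "show inodedeps" with no address walks every hash bucket; with an fs
 * address it restricts the dump to inodedeps belonging to that file system.
 * "show mkdirs" takes no address and walks the global mkdirlisthd list.
 */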
13476
13477#endif /* DDB */
13478
13479#endif /* SOFTUPDATES */
13480