ffs_softdep.c: revision 76354 (deleted lines) vs. revision 76357 (added lines)
1/*
2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * The soft updates code is derived from the appendix of a University
5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6 * "Soft Updates: A Solution to the Metadata Update Problem in File
7 * Systems", CSE-TR-254-95, August 1995).
8 *
9 * Further information about soft updates can be obtained from:
10 *
11 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
12 * 1614 Oxford Street mckusick@mckusick.com
13 * Berkeley, CA 94709-1608 +1-510-843-9542
14 * USA
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 *
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
39 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 76354 2001-05-08 07:13:00Z mckusick $
39 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 76357 2001-05-08 07:42:20Z mckusick $
40 */
41
42/*
43 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
44 */
45#ifndef DIAGNOSTIC
46#define DIAGNOSTIC
47#endif
48#ifndef DEBUG
49#define DEBUG
50#endif
51
52#include <sys/param.h>
53#include <sys/kernel.h>
54#include <sys/systm.h>
55#include <sys/bio.h>
56#include <sys/buf.h>
57#include <sys/malloc.h>
58#include <sys/mount.h>
59#include <sys/proc.h>
60#include <sys/stat.h>
60#include <sys/syslog.h>
61#include <sys/vnode.h>
62#include <sys/conf.h>
63#include <ufs/ufs/dir.h>
64#include <ufs/ufs/extattr.h>
65#include <ufs/ufs/quota.h>
66#include <ufs/ufs/inode.h>
67#include <ufs/ufs/ufsmount.h>
68#include <ufs/ffs/fs.h>
69#include <ufs/ffs/softdep.h>
70#include <ufs/ffs/ffs_extern.h>
71#include <ufs/ufs/ufs_extern.h>
72
73/*
74 * These definitions need to be adapted to the system to which
75 * this file is being ported.
76 */
77/*
78 * malloc types defined for the softdep system.
79 */
80static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
81static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
82static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
83static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
84static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
85static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
86static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
87static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
88static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
89static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
90static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
91static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
92static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
93
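/*
 * All softdep allocations use M_WAITOK (they may sleep) and M_USE_RESERVE
 * (they may dip into the reserve pool), so that dependency bookkeeping can
 * proceed even under memory pressure.
 */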
94#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
95
96#define D_PAGEDEP 0
97#define D_INODEDEP 1
98#define D_NEWBLK 2
99#define D_BMSAFEMAP 3
100#define D_ALLOCDIRECT 4
101#define D_INDIRDEP 5
102#define D_ALLOCINDIR 6
103#define D_FREEFRAG 7
104#define D_FREEBLKS 8
105#define D_FREEFILE 9
106#define D_DIRADD 10
107#define D_MKDIR 11
108#define D_DIRREM 12
109#define D_LAST D_DIRREM
110
111/*
112 * translate from workitem type to memory type
113 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
114 */
115static struct malloc_type *memtype[] = {
116 M_PAGEDEP,
117 M_INODEDEP,
118 M_NEWBLK,
119 M_BMSAFEMAP,
120 M_ALLOCDIRECT,
121 M_INDIRDEP,
122 M_ALLOCINDIR,
123 M_FREEFRAG,
124 M_FREEBLKS,
125 M_FREEFILE,
126 M_DIRADD,
127 M_MKDIR,
128 M_DIRREM
129};
130
131#define DtoM(type) (memtype[type])
132
133/*
134 * Names of malloc types.
135 */
136#define TYPENAME(type) \
137 ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
138/*
 139 * End system adaptation definitions.
140 */
141
142/*
143 * Internal function prototypes.
144 */
145static void softdep_error __P((char *, int));
146static void drain_output __P((struct vnode *, int));
147static int getdirtybuf __P((struct buf **, int));
148static void clear_remove __P((struct proc *));
149static void clear_inodedeps __P((struct proc *));
150static int flush_pagedep_deps __P((struct vnode *, struct mount *,
151 struct diraddhd *));
152static int flush_inodedep_deps __P((struct fs *, ino_t));
153static int handle_written_filepage __P((struct pagedep *, struct buf *));
154static void diradd_inode_written __P((struct diradd *, struct inodedep *));
155static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
156static void handle_allocdirect_partdone __P((struct allocdirect *));
157static void handle_allocindir_partdone __P((struct allocindir *));
158static void initiate_write_filepage __P((struct pagedep *, struct buf *));
159static void handle_written_mkdir __P((struct mkdir *, int));
160static void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
161static void handle_workitem_freefile __P((struct freefile *));
162static void handle_workitem_remove __P((struct dirrem *));
163static struct dirrem *newdirrem __P((struct buf *, struct inode *,
164 struct inode *, int, struct dirrem **));
165static void free_diradd __P((struct diradd *));
166static void free_allocindir __P((struct allocindir *, struct inodedep *));
167static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
168 long *));
169static void deallocate_dependencies __P((struct buf *, struct inodedep *));
170static void free_allocdirect __P((struct allocdirectlst *,
171 struct allocdirect *, int));
172static int check_inode_unwritten __P((struct inodedep *));
173static int free_inodedep __P((struct inodedep *));
174static void handle_workitem_freeblocks __P((struct freeblks *, int));
175static void merge_inode_lists __P((struct inodedep *));
176static void setup_allocindir_phase2 __P((struct buf *, struct inode *,
177 struct allocindir *));
178static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
179 ufs_daddr_t));
180static void handle_workitem_freefrag __P((struct freefrag *));
181static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
182static void allocdirect_merge __P((struct allocdirectlst *,
183 struct allocdirect *, struct allocdirect *));
184static struct bmsafemap *bmsafemap_lookup __P((struct buf *));
185static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
186 struct newblk **));
187static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
188static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
189 struct pagedep **));
190static void pause_timer __P((void *));
191static int request_cleanup __P((int, int));
192static int process_worklist_item __P((struct mount *, int));
193static void add_to_worklist __P((struct worklist *));
194
195/*
196 * Exported softdep operations.
197 */
198static void softdep_disk_io_initiation __P((struct buf *));
199static void softdep_disk_write_complete __P((struct buf *));
200static void softdep_deallocate_dependencies __P((struct buf *));
201static void softdep_move_dependencies __P((struct buf *, struct buf *));
202static int softdep_count_dependencies __P((struct buf *bp, int));
203
204struct bio_ops bioops = {
205 softdep_disk_io_initiation, /* io_start */
206 softdep_disk_write_complete, /* io_complete */
207 softdep_deallocate_dependencies, /* io_deallocate */
208 softdep_move_dependencies, /* io_movedeps */
209 softdep_count_dependencies, /* io_countdeps */
210};
211
212/*
213 * Locking primitives.
214 *
215 * For a uniprocessor, all we need to do is protect against disk
216 * interrupts. For a multiprocessor, this lock would have to be
217 * a mutex. A single mutex is used throughout this file, though
218 * finer grain locking could be used if contention warranted it.
219 *
220 * For a multiprocessor, the sleep call would accept a lock and
221 * release it after the sleep processing was complete. In a uniprocessor
 222 * implementation there is no such interlock, so we simply mark
223 * the places where it needs to be done with the `interlocked' form
224 * of the lock calls. Since the uniprocessor sleep already interlocks
225 * the spl, there is nothing that really needs to be done.
226 */
227#ifndef /* NOT */ DEBUG
228static struct lockit {
229 int lkt_spl;
230} lk = { 0 };
231#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
232#define FREE_LOCK(lk) splx((lk)->lkt_spl)
233#define ACQUIRE_LOCK_INTERLOCKED(lk)
234#define FREE_LOCK_INTERLOCKED(lk)
235
236#else /* DEBUG */
237static struct lockit {
238 int lkt_spl;
239 pid_t lkt_held;
240} lk = { 0, -1 };
241static int lockcnt;
242
243static void acquire_lock __P((struct lockit *));
244static void free_lock __P((struct lockit *));
245static void acquire_lock_interlocked __P((struct lockit *));
246static void free_lock_interlocked __P((struct lockit *));
247
248#define ACQUIRE_LOCK(lk) acquire_lock(lk)
249#define FREE_LOCK(lk) free_lock(lk)
250#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk)
251#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk)
252
253static void
254acquire_lock(lk)
255 struct lockit *lk;
256{
257 pid_t holder;
258
259 if (lk->lkt_held != -1) {
260 holder = lk->lkt_held;
261 FREE_LOCK(lk);
262 if (holder == CURPROC->p_pid)
263 panic("softdep_lock: locking against myself");
264 else
265 panic("softdep_lock: lock held by %d", holder);
266 }
267 lk->lkt_spl = splbio();
268 lk->lkt_held = CURPROC->p_pid;
269 lockcnt++;
270}
271
272static void
273free_lock(lk)
274 struct lockit *lk;
275{
276
277 if (lk->lkt_held == -1)
278 panic("softdep_unlock: lock not held");
279 lk->lkt_held = -1;
280 splx(lk->lkt_spl);
281}
282
283static void
284acquire_lock_interlocked(lk)
285 struct lockit *lk;
286{
287 pid_t holder;
288
289 if (lk->lkt_held != -1) {
290 holder = lk->lkt_held;
291 FREE_LOCK(lk);
292 if (holder == CURPROC->p_pid)
293 panic("softdep_lock_interlocked: locking against self");
294 else
295 panic("softdep_lock_interlocked: lock held by %d",
296 holder);
297 }
298 lk->lkt_held = CURPROC->p_pid;
299 lockcnt++;
300}
301
302static void
303free_lock_interlocked(lk)
304 struct lockit *lk;
305{
306
307 if (lk->lkt_held == -1)
308 panic("softdep_unlock_interlocked: lock not held");
309 lk->lkt_held = -1;
310}
311#endif /* DEBUG */
312
313/*
314 * Place holder for real semaphores.
315 */
316struct sema {
317 int value;
318 pid_t holder;
319 char *name;
320 int prio;
321 int timo;
322};
323static void sema_init __P((struct sema *, char *, int, int));
324static int sema_get __P((struct sema *, struct lockit *));
325static void sema_release __P((struct sema *));
326
327static void
328sema_init(semap, name, prio, timo)
329 struct sema *semap;
330 char *name;
331 int prio, timo;
332{
333
334 semap->holder = -1;
335 semap->value = 0;
336 semap->name = name;
337 semap->prio = prio;
338 semap->timo = timo;
339}
340
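/*
 * Acquire the semaphore. Returns 1 if the caller now holds it, or 0 if it
 * had to sleep, in which case the caller is expected to retry its lookup.
 * The value field counts contenders; sema_release wakes any sleepers.
 */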
341static int
342sema_get(semap, interlock)
343 struct sema *semap;
344 struct lockit *interlock;
345{
346
347 if (semap->value++ > 0) {
348 if (interlock != NULL)
349 FREE_LOCK_INTERLOCKED(interlock);
350 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
351 if (interlock != NULL) {
352 ACQUIRE_LOCK_INTERLOCKED(interlock);
353 FREE_LOCK(interlock);
354 }
355 return (0);
356 }
357 semap->holder = CURPROC->p_pid;
358 if (interlock != NULL)
359 FREE_LOCK(interlock);
360 return (1);
361}
362
363static void
364sema_release(semap)
365 struct sema *semap;
366{
367
368 if (semap->value <= 0 || semap->holder != CURPROC->p_pid) {
369 if (lk.lkt_held != -1)
370 FREE_LOCK(&lk);
371 panic("sema_release: not held");
372 }
373 if (--semap->value > 0) {
374 semap->value = 0;
375 wakeup(semap);
376 }
377 semap->holder = -1;
378}
379
380/*
381 * Worklist queue management.
382 * These routines require that the lock be held.
383 */
384#ifndef /* NOT */ DEBUG
385#define WORKLIST_INSERT(head, item) do { \
386 (item)->wk_state |= ONWORKLIST; \
387 LIST_INSERT_HEAD(head, item, wk_list); \
388} while (0)
389#define WORKLIST_REMOVE(item) do { \
390 (item)->wk_state &= ~ONWORKLIST; \
391 LIST_REMOVE(item, wk_list); \
392} while (0)
393#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
394
395#else /* DEBUG */
396static void worklist_insert __P((struct workhead *, struct worklist *));
397static void worklist_remove __P((struct worklist *));
398static void workitem_free __P((struct worklist *, int));
399
400#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
401#define WORKLIST_REMOVE(item) worklist_remove(item)
402#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
403
404static void
405worklist_insert(head, item)
406 struct workhead *head;
407 struct worklist *item;
408{
409
410 if (lk.lkt_held == -1)
411 panic("worklist_insert: lock not held");
412 if (item->wk_state & ONWORKLIST) {
413 FREE_LOCK(&lk);
414 panic("worklist_insert: already on list");
415 }
416 item->wk_state |= ONWORKLIST;
417 LIST_INSERT_HEAD(head, item, wk_list);
418}
419
420static void
421worklist_remove(item)
422 struct worklist *item;
423{
424
425 if (lk.lkt_held == -1)
426 panic("worklist_remove: lock not held");
427 if ((item->wk_state & ONWORKLIST) == 0) {
428 FREE_LOCK(&lk);
429 panic("worklist_remove: not on list");
430 }
431 item->wk_state &= ~ONWORKLIST;
432 LIST_REMOVE(item, wk_list);
433}
434
435static void
436workitem_free(item, type)
437 struct worklist *item;
438 int type;
439{
440
441 if (item->wk_state & ONWORKLIST) {
442 if (lk.lkt_held != -1)
443 FREE_LOCK(&lk);
444 panic("workitem_free: still on list");
445 }
446 if (item->wk_type != type) {
447 if (lk.lkt_held != -1)
448 FREE_LOCK(&lk);
449 panic("workitem_free: type mismatch");
450 }
451 FREE(item, DtoM(type));
452}
453#endif /* DEBUG */
454
455/*
456 * Workitem queue management
457 */
458static struct workhead softdep_workitem_pending;
459static int num_on_worklist; /* number of worklist items to be processed */
 460static int softdep_worklist_busy; /* > 0 => syncer at work, -1 => being flushed */
461static int softdep_worklist_req; /* serialized waiters */
462static int max_softdeps; /* maximum number of structs before slowdown */
463static int tickdelay = 2; /* number of ticks to pause during slowdown */
464static int proc_waiting; /* tracks whether we have a timeout posted */
465static int *stat_countp; /* statistic to count in proc_waiting timeout */
466static struct callout_handle handle; /* handle on posted proc_waiting timeout */
467static struct proc *filesys_syncer; /* proc of filesystem syncer process */
468static int req_clear_inodedeps; /* syncer process flush some inodedeps */
469#define FLUSH_INODES 1
470static int req_clear_remove; /* syncer process flush some freeblks */
471#define FLUSH_REMOVE 2
472/*
473 * runtime statistics
474 */
475static int stat_worklist_push; /* number of worklist cleanups */
476static int stat_blk_limit_push; /* number of times block limit neared */
477static int stat_ino_limit_push; /* number of times inode limit neared */
478static int stat_blk_limit_hit; /* number of times block slowdown imposed */
479static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
480static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
481static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
482static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
483static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
484static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
485#ifdef DEBUG
486#include <vm/vm.h>
487#include <sys/sysctl.h>
488SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
489SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
490SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
491SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
492SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
493SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
494SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
495SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
496SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
497SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
498SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
499SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
500#endif /* DEBUG */
501
502/*
503 * Add an item to the end of the work queue.
504 * This routine requires that the lock be held.
505 * This is the only routine that adds items to the list.
506 * The following routine is the only one that removes items
507 * and does so in order from first to last.
508 */
509static void
510add_to_worklist(wk)
511 struct worklist *wk;
512{
513 static struct worklist *worklist_tail;
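	/*
	 * worklist_tail caches the most recently appended item so that new
	 * work can be appended without walking the list.
	 */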
514
515 if (wk->wk_state & ONWORKLIST) {
516 if (lk.lkt_held != -1)
517 FREE_LOCK(&lk);
518 panic("add_to_worklist: already on list");
519 }
520 wk->wk_state |= ONWORKLIST;
521 if (LIST_FIRST(&softdep_workitem_pending) == NULL)
522 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
523 else
524 LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
525 worklist_tail = wk;
526 num_on_worklist += 1;
527}
528
529/*
530 * Process that runs once per second to handle items in the background queue.
531 *
 532 * Note that we ensure that everything is done in the order in which it
 533 * appears in the queue. The code below depends on this property to ensure
534 * that blocks of a file are freed before the inode itself is freed. This
535 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
536 * until all the old ones have been purged from the dependency lists.
537 */
538int
539softdep_process_worklist(matchmnt)
540 struct mount *matchmnt;
541{
542 struct proc *p = CURPROC;
543 int matchcnt, loopcount;
544 long starttime;
545
546 /*
547 * Record the process identifier of our caller so that we can give
548 * this process preferential treatment in request_cleanup below.
549 */
550 filesys_syncer = p;
551 matchcnt = 0;
552
553 /*
554 * There is no danger of having multiple processes run this
555 * code, but we have to single-thread it when softdep_flushfiles()
556 * is in operation to get an accurate count of the number of items
557 * related to its mount point that are in the list.
558 */
559 if (matchmnt == NULL) {
560 if (softdep_worklist_busy < 0)
561 return(-1);
562 softdep_worklist_busy += 1;
563 }
564
565 /*
566 * If requested, try removing inode or removal dependencies.
567 */
568 if (req_clear_inodedeps) {
569 clear_inodedeps(p);
570 req_clear_inodedeps -= 1;
571 wakeup_one(&proc_waiting);
572 }
573 if (req_clear_remove) {
574 clear_remove(p);
575 req_clear_remove -= 1;
576 wakeup_one(&proc_waiting);
577 }
578 loopcount = 1;
579 starttime = time_second;
580 while (num_on_worklist > 0) {
581 matchcnt += process_worklist_item(matchmnt, 0);
582
583 /*
584 * If a umount operation wants to run the worklist
585 * accurately, abort.
586 */
587 if (softdep_worklist_req && matchmnt == NULL) {
588 matchcnt = -1;
589 break;
590 }
591
592 /*
593 * If requested, try removing inode or removal dependencies.
594 */
595 if (req_clear_inodedeps) {
596 clear_inodedeps(p);
597 req_clear_inodedeps -= 1;
598 wakeup_one(&proc_waiting);
599 }
600 if (req_clear_remove) {
601 clear_remove(p);
602 req_clear_remove -= 1;
603 wakeup_one(&proc_waiting);
604 }
605 /*
606 * We do not generally want to stop for buffer space, but if
607 * we are really being a buffer hog, we will stop and wait.
608 */
609 if (loopcount++ % 128 == 0)
610 bwillwrite();
611 /*
612 * Never allow processing to run for more than one
613 * second. Otherwise the other syncer tasks may get
614 * excessively backlogged.
615 */
616 if (starttime != time_second && matchmnt == NULL) {
617 matchcnt = -1;
618 break;
619 }
620 }
621 if (matchmnt == NULL) {
622 softdep_worklist_busy -= 1;
623 if (softdep_worklist_req && softdep_worklist_busy == 0)
624 wakeup(&softdep_worklist_req);
625 }
626 return (matchcnt);
627}
628
629/*
630 * Process one item on the worklist.
631 */
632static int
633process_worklist_item(matchmnt, flags)
634 struct mount *matchmnt;
635 int flags;
636{
637 struct worklist *wk;
638 struct dirrem *dirrem;
639 struct mount *mp;
640 struct vnode *vp;
641 int matchcnt = 0;
642
643 ACQUIRE_LOCK(&lk);
644 /*
645 * Normally we just process each item on the worklist in order.
646 * However, if we are in a situation where we cannot lock any
647 * inodes, we have to skip over any dirrem requests whose
648 * vnodes are resident and locked.
649 */
650 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
651 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
652 break;
653 dirrem = WK_DIRREM(wk);
654 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
655 dirrem->dm_oldinum);
656 if (vp == NULL || !VOP_ISLOCKED(vp, CURPROC))
657 break;
658 }
659 if (wk == 0) {
660 FREE_LOCK(&lk);
661 return (0);
662 }
663 WORKLIST_REMOVE(wk);
664 num_on_worklist -= 1;
665 FREE_LOCK(&lk);
666 switch (wk->wk_type) {
667
668 case D_DIRREM:
669 /* removal of a directory entry */
670 mp = WK_DIRREM(wk)->dm_mnt;
671 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
672 panic("%s: dirrem on suspended filesystem",
673 "process_worklist_item");
674 if (mp == matchmnt)
675 matchcnt += 1;
676 handle_workitem_remove(WK_DIRREM(wk));
677 break;
678
679 case D_FREEBLKS:
680 /* releasing blocks and/or fragments from a file */
681 mp = WK_FREEBLKS(wk)->fb_mnt;
682 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
683 panic("%s: freeblks on suspended filesystem",
684 "process_worklist_item");
685 if (mp == matchmnt)
686 matchcnt += 1;
687 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
688 break;
689
690 case D_FREEFRAG:
691 /* releasing a fragment when replaced as a file grows */
692 mp = WK_FREEFRAG(wk)->ff_mnt;
693 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
694 panic("%s: freefrag on suspended filesystem",
695 "process_worklist_item");
696 if (mp == matchmnt)
697 matchcnt += 1;
698 handle_workitem_freefrag(WK_FREEFRAG(wk));
699 break;
700
701 case D_FREEFILE:
702 /* releasing an inode when its link count drops to 0 */
703 mp = WK_FREEFILE(wk)->fx_mnt;
704 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
705 panic("%s: freefile on suspended filesystem",
706 "process_worklist_item");
707 if (mp == matchmnt)
708 matchcnt += 1;
709 handle_workitem_freefile(WK_FREEFILE(wk));
710 break;
711
712 default:
713 panic("%s_process_worklist: Unknown type %s",
714 "softdep", TYPENAME(wk->wk_type));
715 /* NOTREACHED */
716 }
717 return (matchcnt);
718}
719
720/*
721 * Move dependencies from one buffer to another.
722 */
723static void
724softdep_move_dependencies(oldbp, newbp)
725 struct buf *oldbp;
726 struct buf *newbp;
727{
728 struct worklist *wk, *wktail;
729
730 if (LIST_FIRST(&newbp->b_dep) != NULL)
731 panic("softdep_move_dependencies: need merge code");
732 wktail = 0;
733 ACQUIRE_LOCK(&lk);
734 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
735 LIST_REMOVE(wk, wk_list);
736 if (wktail == 0)
737 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
738 else
739 LIST_INSERT_AFTER(wktail, wk, wk_list);
740 wktail = wk;
741 }
742 FREE_LOCK(&lk);
743}
744
745/*
746 * Purge the work list of all items associated with a particular mount point.
747 */
748int
749softdep_flushworklist(oldmnt, countp, p)
750 struct mount *oldmnt;
751 int *countp;
752 struct proc *p;
753{
754 struct vnode *devvp;
755 int count, error = 0;
756
757 /*
758 * Await our turn to clear out the queue, then serialize access.
759 */
760 while (softdep_worklist_busy) {
761 softdep_worklist_req += 1;
762 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
763 softdep_worklist_req -= 1;
764 }
765 softdep_worklist_busy = -1;
766 /*
767 * Alternately flush the block device associated with the mount
768 * point and process any dependencies that the flushing
769 * creates. We continue until no more worklist dependencies
770 * are found.
771 */
772 *countp = 0;
773 devvp = VFSTOUFS(oldmnt)->um_devvp;
774 while ((count = softdep_process_worklist(oldmnt)) > 0) {
775 *countp += count;
776 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
777 error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
778 VOP_UNLOCK(devvp, 0, p);
779 if (error)
780 break;
781 }
782 softdep_worklist_busy = 0;
783 if (softdep_worklist_req)
784 wakeup(&softdep_worklist_req);
785 return (error);
786}
787
788/*
789 * Flush all vnodes and worklist items associated with a specified mount point.
790 */
791int
792softdep_flushfiles(oldmnt, flags, p)
793 struct mount *oldmnt;
794 int flags;
795 struct proc *p;
796{
797 int error, count, loopcnt;
798
799 /*
800 * Alternately flush the vnodes associated with the mount
801 * point and process any dependencies that the flushing
802 * creates. In theory, this loop can happen at most twice,
803 * but we give it a few extra just to be sure.
804 */
805 for (loopcnt = 10; loopcnt > 0; loopcnt--) {
806 /*
807 * Do another flush in case any vnodes were brought in
808 * as part of the cleanup operations.
809 */
810 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
811 break;
812 if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
813 count == 0)
814 break;
815 }
816 /*
817 * If we are unmounting then it is an error to fail. If we
818 * are simply trying to downgrade to read-only, then filesystem
819 * activity can keep us busy forever, so we just fail with EBUSY.
820 */
821 if (loopcnt == 0) {
822 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
823 panic("softdep_flushfiles: looping");
824 error = EBUSY;
825 }
826 return (error);
827}
828
829/*
830 * Structure hashing.
831 *
832 * There are three types of structures that can be looked up:
833 * 1) pagedep structures identified by mount point, inode number,
834 * and logical block.
835 * 2) inodedep structures identified by mount point and inode number.
836 * 3) newblk structures identified by mount point and
837 * physical block number.
838 *
839 * The "pagedep" and "inodedep" dependency structures are hashed
840 * separately from the file blocks and inodes to which they correspond.
841 * This separation helps when the in-memory copy of an inode or
842 * file block must be replaced. It also obviates the need to access
843 * an inode or file page when simply updating (or de-allocating)
844 * dependency structures. Lookup of newblk structures is needed to
845 * find newly allocated blocks when trying to associate them with
846 * their allocdirect or allocindir structure.
847 *
848 * The lookup routines optionally create and hash a new instance when
849 * an existing entry is not found.
850 */
851#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
852#define NODELAY 0x0002 /* cannot do background work */
853
854/*
855 * Structures and routines associated with pagedep caching.
856 */
857LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
858u_long pagedep_hash; /* size of hash table - 1 */
859#define PAGEDEP_HASH(mp, inum, lbn) \
860 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
861 pagedep_hash])
862static struct sema pagedep_in_progress;
863
864/*
865 * Look up a pagedep. Return 1 if found, 0 if not found.
866 * If not found, allocate if DEPALLOC flag is passed.
867 * Found or allocated entry is returned in pagedeppp.
868 * This routine must be called with splbio interrupts blocked.
869 */
870static int
871pagedep_lookup(ip, lbn, flags, pagedeppp)
872 struct inode *ip;
873 ufs_lbn_t lbn;
874 int flags;
875 struct pagedep **pagedeppp;
876{
877 struct pagedep *pagedep;
878 struct pagedep_hashhead *pagedephd;
879 struct mount *mp;
880 int i;
881
882#ifdef DEBUG
883 if (lk.lkt_held == -1)
884 panic("pagedep_lookup: lock not held");
885#endif
886 mp = ITOV(ip)->v_mount;
887 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
888top:
889 LIST_FOREACH(pagedep, pagedephd, pd_hash)
890 if (ip->i_number == pagedep->pd_ino &&
891 lbn == pagedep->pd_lbn &&
892 mp == pagedep->pd_mnt)
893 break;
894 if (pagedep) {
895 *pagedeppp = pagedep;
896 return (1);
897 }
898 if ((flags & DEPALLOC) == 0) {
899 *pagedeppp = NULL;
900 return (0);
901 }
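	/*
	 * Serialize creation of a new pagedep. If sema_get() returns 0 we
	 * slept, and another process may have entered the pagedep in the
	 * meantime, so retry the lookup from the top.
	 */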
902 if (sema_get(&pagedep_in_progress, &lk) == 0) {
903 ACQUIRE_LOCK(&lk);
904 goto top;
905 }
906 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
907 M_SOFTDEP_FLAGS|M_ZERO);
908 pagedep->pd_list.wk_type = D_PAGEDEP;
909 pagedep->pd_mnt = mp;
910 pagedep->pd_ino = ip->i_number;
911 pagedep->pd_lbn = lbn;
912 LIST_INIT(&pagedep->pd_dirremhd);
913 LIST_INIT(&pagedep->pd_pendinghd);
914 for (i = 0; i < DAHASHSZ; i++)
915 LIST_INIT(&pagedep->pd_diraddhd[i]);
916 ACQUIRE_LOCK(&lk);
917 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
918 sema_release(&pagedep_in_progress);
919 *pagedeppp = pagedep;
920 return (0);
921}
922
923/*
924 * Structures and routines associated with inodedep caching.
925 */
926LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
927static u_long inodedep_hash; /* size of hash table - 1 */
928static long num_inodedep; /* number of inodedep allocated */
929#define INODEDEP_HASH(fs, inum) \
930 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
931static struct sema inodedep_in_progress;
932
933/*
 934 * Look up an inodedep. Return 1 if found, 0 if not found.
935 * If not found, allocate if DEPALLOC flag is passed.
936 * Found or allocated entry is returned in inodedeppp.
937 * This routine must be called with splbio interrupts blocked.
938 */
939static int
940inodedep_lookup(fs, inum, flags, inodedeppp)
941 struct fs *fs;
942 ino_t inum;
943 int flags;
944 struct inodedep **inodedeppp;
945{
946 struct inodedep *inodedep;
947 struct inodedep_hashhead *inodedephd;
948 int firsttry;
949
950#ifdef DEBUG
951 if (lk.lkt_held == -1)
952 panic("inodedep_lookup: lock not held");
953#endif
954 firsttry = 1;
955 inodedephd = INODEDEP_HASH(fs, inum);
956top:
957 LIST_FOREACH(inodedep, inodedephd, id_hash)
958 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
959 break;
960 if (inodedep) {
961 *inodedeppp = inodedep;
962 return (1);
963 }
964 if ((flags & DEPALLOC) == 0) {
965 *inodedeppp = NULL;
966 return (0);
967 }
968 /*
969 * If we are over our limit, try to improve the situation.
970 */
971 if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
972 request_cleanup(FLUSH_INODES, 1)) {
973 firsttry = 0;
974 goto top;
975 }
976 if (sema_get(&inodedep_in_progress, &lk) == 0) {
977 ACQUIRE_LOCK(&lk);
978 goto top;
979 }
980 num_inodedep += 1;
981 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
982 M_INODEDEP, M_SOFTDEP_FLAGS);
983 inodedep->id_list.wk_type = D_INODEDEP;
984 inodedep->id_fs = fs;
985 inodedep->id_ino = inum;
986 inodedep->id_state = ALLCOMPLETE;
987 inodedep->id_nlinkdelta = 0;
988 inodedep->id_savedino = NULL;
989 inodedep->id_savedsize = -1;
990 inodedep->id_buf = NULL;
991 LIST_INIT(&inodedep->id_pendinghd);
992 LIST_INIT(&inodedep->id_inowait);
993 LIST_INIT(&inodedep->id_bufwait);
994 TAILQ_INIT(&inodedep->id_inoupdt);
995 TAILQ_INIT(&inodedep->id_newinoupdt);
996 ACQUIRE_LOCK(&lk);
997 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
998 sema_release(&inodedep_in_progress);
999 *inodedeppp = inodedep;
1000 return (0);
1001}
1002
1003/*
1004 * Structures and routines associated with newblk caching.
1005 */
1006LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1007u_long newblk_hash; /* size of hash table - 1 */
1008#define NEWBLK_HASH(fs, inum) \
1009 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1010static struct sema newblk_in_progress;
1011
1012/*
1013 * Look up a newblk. Return 1 if found, 0 if not found.
1014 * If not found, allocate if DEPALLOC flag is passed.
1015 * Found or allocated entry is returned in newblkpp.
1016 */
1017static int
1018newblk_lookup(fs, newblkno, flags, newblkpp)
1019 struct fs *fs;
1020 ufs_daddr_t newblkno;
1021 int flags;
1022 struct newblk **newblkpp;
1023{
1024 struct newblk *newblk;
1025 struct newblk_hashhead *newblkhd;
1026
1027 newblkhd = NEWBLK_HASH(fs, newblkno);
1028top:
1029 LIST_FOREACH(newblk, newblkhd, nb_hash)
1030 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1031 break;
1032 if (newblk) {
1033 *newblkpp = newblk;
1034 return (1);
1035 }
1036 if ((flags & DEPALLOC) == 0) {
1037 *newblkpp = NULL;
1038 return (0);
1039 }
1040 if (sema_get(&newblk_in_progress, 0) == 0)
1041 goto top;
1042 MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1043 M_NEWBLK, M_SOFTDEP_FLAGS);
1044 newblk->nb_state = 0;
1045 newblk->nb_fs = fs;
1046 newblk->nb_newblkno = newblkno;
1047 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1048 sema_release(&newblk_in_progress);
1049 *newblkpp = newblk;
1050 return (0);
1051}
1052
1053/*
 1054 * Executed during filesystem initialization before
1055 * mounting any file systems.
1056 */
1057void
1058softdep_initialize()
1059{
1060
1061 LIST_INIT(&mkdirlisthd);
1062 LIST_INIT(&softdep_workitem_pending);
1063 max_softdeps = min(desiredvnodes * 8,
1064 M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
1065 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1066 &pagedep_hash);
1067 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1068 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1069 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1070 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1071 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1072}
1073
1074/*
1075 * Called at mount time to notify the dependency code that a
1076 * filesystem wishes to use it.
1077 */
1078int
1079softdep_mount(devvp, mp, fs, cred)
1080 struct vnode *devvp;
1081 struct mount *mp;
1082 struct fs *fs;
1083 struct ucred *cred;
1084{
1085 struct csum cstotal;
1086 struct cg *cgp;
1087 struct buf *bp;
1088 int error, cyl;
1089
1090 mp->mnt_flag &= ~MNT_ASYNC;
1091 mp->mnt_flag |= MNT_SOFTDEP;
1092 /*
1093 * When doing soft updates, the counters in the
1094 * superblock may have gotten out of sync, so we have
1095 * to scan the cylinder groups and recalculate them.
1096 */
1097 if (fs->fs_clean != 0)
1098 return (0);
1099 bzero(&cstotal, sizeof cstotal);
1100 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1101 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1102 fs->fs_cgsize, cred, &bp)) != 0) {
1103 brelse(bp);
1104 return (error);
1105 }
1106 cgp = (struct cg *)bp->b_data;
1107 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1108 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1109 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1110 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1111 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1112 brelse(bp);
1113 }
1114#ifdef DEBUG
1115 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1116 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1117#endif
1118 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1119 return (0);
1120}
1121
1122/*
1123 * Protecting the freemaps (or bitmaps).
1124 *
1125 * To eliminate the need to execute fsck before mounting a file system
1126 * after a power failure, one must (conservatively) guarantee that the
1127 * on-disk copy of the bitmaps never indicate that a live inode or block is
1128 * free. So, when a block or inode is allocated, the bitmap should be
1129 * updated (on disk) before any new pointers. When a block or inode is
1130 * freed, the bitmap should not be updated until all pointers have been
1131 * reset. The latter dependency is handled by the delayed de-allocation
1132 * approach described below for block and inode de-allocation. The former
1133 * dependency is handled by calling the following procedure when a block or
1134 * inode is allocated. When an inode is allocated an "inodedep" is created
1135 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1136 * Each "inodedep" is also inserted into the hash indexing structure so
1137 * that any additional link additions can be made dependent on the inode
1138 * allocation.
1139 *
1140 * The ufs file system maintains a number of free block counts (e.g., per
1141 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1142 * in addition to the bitmaps. These counts are used to improve efficiency
1143 * during allocation and therefore must be consistent with the bitmaps.
1144 * There is no convenient way to guarantee post-crash consistency of these
1145 * counts with simple update ordering, for two main reasons: (1) The counts
1146 * and bitmaps for a single cylinder group block are not in the same disk
1147 * sector. If a disk write is interrupted (e.g., by power failure), one may
1148 * be written and the other not. (2) Some of the counts are located in the
1149 * superblock rather than the cylinder group block. So, we focus our soft
1150 * updates implementation on protecting the bitmaps. When mounting a
1151 * filesystem, we recompute the auxiliary counts from the bitmaps.
1152 */
1153
1154/*
1155 * Called just after updating the cylinder group block to allocate an inode.
1156 */
1157void
1158softdep_setup_inomapdep(bp, ip, newinum)
1159 struct buf *bp; /* buffer for cylgroup block with inode map */
1160 struct inode *ip; /* inode related to allocation */
1161 ino_t newinum; /* new inode number being allocated */
1162{
1163 struct inodedep *inodedep;
1164 struct bmsafemap *bmsafemap;
1165
1166 /*
1167 * Create a dependency for the newly allocated inode.
1168 * Panic if it already exists as something is seriously wrong.
1169 * Otherwise add it to the dependency list for the buffer holding
1170 * the cylinder group map from which it was allocated.
1171 */
1172 ACQUIRE_LOCK(&lk);
1173 if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1174 FREE_LOCK(&lk);
1175 panic("softdep_setup_inomapdep: found inode");
1176 }
1177 inodedep->id_buf = bp;
1178 inodedep->id_state &= ~DEPCOMPLETE;
1179 bmsafemap = bmsafemap_lookup(bp);
1180 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1181 FREE_LOCK(&lk);
1182}
1183
1184/*
1185 * Called just after updating the cylinder group block to
1186 * allocate block or fragment.
1187 */
1188void
1189softdep_setup_blkmapdep(bp, fs, newblkno)
1190 struct buf *bp; /* buffer for cylgroup block with block map */
1191 struct fs *fs; /* filesystem doing allocation */
1192 ufs_daddr_t newblkno; /* number of newly allocated block */
1193{
1194 struct newblk *newblk;
1195 struct bmsafemap *bmsafemap;
1196
1197 /*
1198 * Create a dependency for the newly allocated block.
1199 * Add it to the dependency list for the buffer holding
1200 * the cylinder group map from which it was allocated.
1201 */
1202 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1203 panic("softdep_setup_blkmapdep: found block");
1204 ACQUIRE_LOCK(&lk);
1205 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1206 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1207 FREE_LOCK(&lk);
1208}
1209
1210/*
1211 * Find the bmsafemap associated with a cylinder group buffer.
1212 * If none exists, create one. The buffer must be locked when
1213 * this routine is called and this routine must be called with
1214 * splbio interrupts blocked.
1215 */
1216static struct bmsafemap *
1217bmsafemap_lookup(bp)
1218 struct buf *bp;
1219{
1220 struct bmsafemap *bmsafemap;
1221 struct worklist *wk;
1222
1223#ifdef DEBUG
1224 if (lk.lkt_held == -1)
1225 panic("bmsafemap_lookup: lock not held");
1226#endif
1227 LIST_FOREACH(wk, &bp->b_dep, wk_list)
1228 if (wk->wk_type == D_BMSAFEMAP)
1229 return (WK_BMSAFEMAP(wk));
1230 FREE_LOCK(&lk);
1231 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1232 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1233 bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1234 bmsafemap->sm_list.wk_state = 0;
1235 bmsafemap->sm_buf = bp;
1236 LIST_INIT(&bmsafemap->sm_allocdirecthd);
1237 LIST_INIT(&bmsafemap->sm_allocindirhd);
1238 LIST_INIT(&bmsafemap->sm_inodedephd);
1239 LIST_INIT(&bmsafemap->sm_newblkhd);
1240 ACQUIRE_LOCK(&lk);
1241 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1242 return (bmsafemap);
1243}
1244
1245/*
1246 * Direct block allocation dependencies.
1247 *
1248 * When a new block is allocated, the corresponding disk locations must be
1249 * initialized (with zeros or new data) before the on-disk inode points to
1250 * them. Also, the freemap from which the block was allocated must be
1251 * updated (on disk) before the inode's pointer. These two dependencies are
1252 * independent of each other and are needed for all file blocks and indirect
1253 * blocks that are pointed to directly by the inode. Just before the
1254 * "in-core" version of the inode is updated with a newly allocated block
1255 * number, a procedure (below) is called to setup allocation dependency
1256 * structures. These structures are removed when the corresponding
1257 * dependencies are satisfied or when the block allocation becomes obsolete
1258 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1259 * fragment that gets upgraded). All of these cases are handled in
1260 * procedures described later.
1261 *
1262 * When a file extension causes a fragment to be upgraded, either to a larger
1263 * fragment or to a full block, the on-disk location may change (if the
1264 * previous fragment could not simply be extended). In this case, the old
1265 * fragment must be de-allocated, but not until after the inode's pointer has
1266 * been updated. In most cases, this is handled by later procedures, which
1267 * will construct a "freefrag" structure to be added to the workitem queue
1268 * when the inode update is complete (or obsolete). The main exception to
1269 * this is when an allocation occurs while a pending allocation dependency
1270 * (for the same block pointer) remains. This case is handled in the main
1271 * allocation dependency setup procedure by immediately freeing the
1272 * unreferenced fragments.
1273 */
1274void
1275softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1276 struct inode *ip; /* inode to which block is being added */
1277 ufs_lbn_t lbn; /* block pointer within inode */
1278 ufs_daddr_t newblkno; /* disk block number being added */
1279 ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */
1280 long newsize; /* size of new block */
 1281	long oldsize;		/* size of old block */
1282 struct buf *bp; /* bp for allocated block */
1283{
1284 struct allocdirect *adp, *oldadp;
1285 struct allocdirectlst *adphead;
1286 struct bmsafemap *bmsafemap;
1287 struct inodedep *inodedep;
1288 struct pagedep *pagedep;
1289 struct newblk *newblk;
1290
1291 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1292 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1293 adp->ad_list.wk_type = D_ALLOCDIRECT;
1294 adp->ad_lbn = lbn;
1295 adp->ad_newblkno = newblkno;
1296 adp->ad_oldblkno = oldblkno;
1297 adp->ad_newsize = newsize;
1298 adp->ad_oldsize = oldsize;
1299 adp->ad_state = ATTACHED;
1300 if (newblkno == oldblkno)
1301 adp->ad_freefrag = NULL;
1302 else
1303 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1304
1305 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1306 panic("softdep_setup_allocdirect: lost block");
1307
1308 ACQUIRE_LOCK(&lk);
1309 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1310 adp->ad_inodedep = inodedep;
1311
1312 if (newblk->nb_state == DEPCOMPLETE) {
1313 adp->ad_state |= DEPCOMPLETE;
1314 adp->ad_buf = NULL;
1315 } else {
1316 bmsafemap = newblk->nb_bmsafemap;
1317 adp->ad_buf = bmsafemap->sm_buf;
1318 LIST_REMOVE(newblk, nb_deps);
1319 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1320 }
1321 LIST_REMOVE(newblk, nb_hash);
1322 FREE(newblk, M_NEWBLK);
1323
1324 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1325 if (lbn >= NDADDR) {
1326 /* allocating an indirect block */
1327 if (oldblkno != 0) {
1328 FREE_LOCK(&lk);
1329 panic("softdep_setup_allocdirect: non-zero indir");
1330 }
1331 } else {
1332 /*
1333 * Allocating a direct block.
1334 *
1335 * If we are allocating a directory block, then we must
1336 * allocate an associated pagedep to track additions and
1337 * deletions.
1338 */
1339 if ((ip->i_mode & IFMT) == IFDIR &&
1340 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1341 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1342 }
1343 /*
1344 * The list of allocdirects must be kept in sorted and ascending
1345 * order so that the rollback routines can quickly determine the
1346 * first uncommitted block (the size of the file stored on disk
1347 * ends at the end of the lowest committed fragment, or if there
1348 * are no fragments, at the end of the highest committed block).
1349 * Since files generally grow, the typical case is that the new
1350 * block is to be added at the end of the list. We speed this
1351 * special case by checking against the last allocdirect in the
1352 * list before laboriously traversing the list looking for the
1353 * insertion point.
1354 */
1355 adphead = &inodedep->id_newinoupdt;
1356 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1357 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1358 /* insert at end of list */
1359 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1360 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1361 allocdirect_merge(adphead, adp, oldadp);
1362 FREE_LOCK(&lk);
1363 return;
1364 }
1365 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1366 if (oldadp->ad_lbn >= lbn)
1367 break;
1368 }
1369 if (oldadp == NULL) {
1370 FREE_LOCK(&lk);
1371 panic("softdep_setup_allocdirect: lost entry");
1372 }
1373 /* insert in middle of list */
1374 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1375 if (oldadp->ad_lbn == lbn)
1376 allocdirect_merge(adphead, adp, oldadp);
1377 FREE_LOCK(&lk);
1378}
1379
1380/*
1381 * Replace an old allocdirect dependency with a newer one.
1382 * This routine must be called with splbio interrupts blocked.
1383 */
1384static void
1385allocdirect_merge(adphead, newadp, oldadp)
1386 struct allocdirectlst *adphead; /* head of list holding allocdirects */
1387 struct allocdirect *newadp; /* allocdirect being added */
1388 struct allocdirect *oldadp; /* existing allocdirect being checked */
1389{
1390 struct freefrag *freefrag;
1391
1392#ifdef DEBUG
1393 if (lk.lkt_held == -1)
1394 panic("allocdirect_merge: lock not held");
1395#endif
1396 if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1397 newadp->ad_oldsize != oldadp->ad_newsize ||
1398 newadp->ad_lbn >= NDADDR) {
1399 FREE_LOCK(&lk);
 1400		panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
1401 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1402 NDADDR);
1403 }
1404 newadp->ad_oldblkno = oldadp->ad_oldblkno;
1405 newadp->ad_oldsize = oldadp->ad_oldsize;
1406 /*
1407 * If the old dependency had a fragment to free or had never
1408 * previously had a block allocated, then the new dependency
1409 * can immediately post its freefrag and adopt the old freefrag.
1410 * This action is done by swapping the freefrag dependencies.
1411 * The new dependency gains the old one's freefrag, and the
1412 * old one gets the new one and then immediately puts it on
1413 * the worklist when it is freed by free_allocdirect. It is
1414 * not possible to do this swap when the old dependency had a
1415 * non-zero size but no previous fragment to free. This condition
1416 * arises when the new block is an extension of the old block.
1417 * Here, the first part of the fragment allocated to the new
1418 * dependency is part of the block currently claimed on disk by
1419 * the old dependency, so cannot legitimately be freed until the
1420 * conditions for the new dependency are fulfilled.
1421 */
1422 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1423 freefrag = newadp->ad_freefrag;
1424 newadp->ad_freefrag = oldadp->ad_freefrag;
1425 oldadp->ad_freefrag = freefrag;
1426 }
1427 free_allocdirect(adphead, oldadp, 0);
1428}
1429
1430/*
1431 * Allocate a new freefrag structure if needed.
1432 */
1433static struct freefrag *
1434newfreefrag(ip, blkno, size)
1435 struct inode *ip;
1436 ufs_daddr_t blkno;
1437 long size;
1438{
1439 struct freefrag *freefrag;
1440 struct fs *fs;
1441
1442 if (blkno == 0)
1443 return (NULL);
1444 fs = ip->i_fs;
1445 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1446 panic("newfreefrag: frag size");
1447 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1448 M_FREEFRAG, M_SOFTDEP_FLAGS);
1449 freefrag->ff_list.wk_type = D_FREEFRAG;
1450 freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
1451 freefrag->ff_inum = ip->i_number;
1452 freefrag->ff_mnt = ITOV(ip)->v_mount;
1453 freefrag->ff_devvp = ip->i_devvp;
1454 freefrag->ff_blkno = blkno;
1455 freefrag->ff_fragsize = size;
1456 return (freefrag);
1457}
1458
1459/*
1460 * This workitem de-allocates fragments that were replaced during
1461 * file block allocation.
1462 */
1463static void
1464handle_workitem_freefrag(freefrag)
1465 struct freefrag *freefrag;
1466{
1467 struct inode tip;
1468
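	/*
	 * Build a throwaway in-core inode with just enough state for
	 * ffs_blkfree() to locate and free the fragment.
	 */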
1469 tip.i_vnode = NULL;
1470 tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1471 tip.i_devvp = freefrag->ff_devvp;
1472 tip.i_dev = freefrag->ff_devvp->v_rdev;
1473 tip.i_number = freefrag->ff_inum;
1474 tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
1475 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1476 FREE(freefrag, M_FREEFRAG);
1477}
1478
1479/*
1480 * Indirect block allocation dependencies.
1481 *
1482 * The same dependencies that exist for a direct block also exist when
1483 * a new block is allocated and pointed to by an entry in a block of
1484 * indirect pointers. The undo/redo states described above are also
1485 * used here. Because an indirect block contains many pointers that
1486 * may have dependencies, a second copy of the entire in-memory indirect
1487 * block is kept. The buffer cache copy is always completely up-to-date.
1488 * The second copy, which is used only as a source for disk writes,
1489 * contains only the safe pointers (i.e., those that have no remaining
1490 * update dependencies). The second copy is freed when all pointers
1491 * are safe. The cache is not allowed to replace indirect blocks with
1492 * pending update dependencies. If a buffer containing an indirect
1493 * block with dependencies is written, these routines will mark it
1494 * dirty again. It can only be successfully written once all the
 1495 * dependencies are removed. The ffs_fsync routine and
1496 * softdep_sync_metadata work together to get all the dependencies
1497 * removed so that a file can be successfully written to disk. Three
1498 * procedures are used when setting up indirect block pointer
1499 * dependencies. The division is necessary because of the organization
1500 * of the "balloc" routine and because of the distinction between file
1501 * pages and file metadata blocks.
1502 */
1503
1504/*
1505 * Allocate a new allocindir structure.
1506 */
1507static struct allocindir *
1508newallocindir(ip, ptrno, newblkno, oldblkno)
1509 struct inode *ip; /* inode for file being extended */
1510 int ptrno; /* offset of pointer in indirect block */
1511 ufs_daddr_t newblkno; /* disk block number being added */
1512 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1513{
1514 struct allocindir *aip;
1515
1516 MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1517 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1518 aip->ai_list.wk_type = D_ALLOCINDIR;
1519 aip->ai_state = ATTACHED;
1520 aip->ai_offset = ptrno;
1521 aip->ai_newblkno = newblkno;
1522 aip->ai_oldblkno = oldblkno;
1523 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1524 return (aip);
1525}
1526
1527/*
1528 * Called just before setting an indirect block pointer
1529 * to a newly allocated file page.
1530 */
1531void
1532softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1533 struct inode *ip; /* inode for file being extended */
1534 ufs_lbn_t lbn; /* allocated block number within file */
1535 struct buf *bp; /* buffer with indirect blk referencing page */
1536 int ptrno; /* offset of pointer in indirect block */
1537 ufs_daddr_t newblkno; /* disk block number being added */
1538 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1539 struct buf *nbp; /* buffer holding allocated page */
1540{
1541 struct allocindir *aip;
1542 struct pagedep *pagedep;
1543
1544 aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1545 ACQUIRE_LOCK(&lk);
1546 /*
1547 * If we are allocating a directory page, then we must
1548 * allocate an associated pagedep to track additions and
1549 * deletions.
1550 */
1551 if ((ip->i_mode & IFMT) == IFDIR &&
1552 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1553 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1554 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1555 FREE_LOCK(&lk);
1556 setup_allocindir_phase2(bp, ip, aip);
1557}
1558
1559/*
1560 * Called just before setting an indirect block pointer to a
1561 * newly allocated indirect block.
1562 */
1563void
1564softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1565 struct buf *nbp; /* newly allocated indirect block */
1566 struct inode *ip; /* inode for file being extended */
1567 struct buf *bp; /* indirect block referencing allocated block */
1568 int ptrno; /* offset of pointer in indirect block */
1569 ufs_daddr_t newblkno; /* disk block number being added */
1570{
1571 struct allocindir *aip;
1572
1573 aip = newallocindir(ip, ptrno, newblkno, 0);
1574 ACQUIRE_LOCK(&lk);
1575 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1576 FREE_LOCK(&lk);
1577 setup_allocindir_phase2(bp, ip, aip);
1578}
1579
1580/*
1581 * Called to finish the allocation of the "aip" allocated
1582 * by one of the two routines above.
1583 */
1584static void
1585setup_allocindir_phase2(bp, ip, aip)
1586 struct buf *bp; /* in-memory copy of the indirect block */
1587 struct inode *ip; /* inode for file being extended */
1588 struct allocindir *aip; /* allocindir allocated by the above routines */
1589{
1590 struct worklist *wk;
1591 struct indirdep *indirdep, *newindirdep;
1592 struct bmsafemap *bmsafemap;
1593 struct allocindir *oldaip;
1594 struct freefrag *freefrag;
1595 struct newblk *newblk;
1596
1597 if (bp->b_lblkno >= 0)
1598 panic("setup_allocindir_phase2: not indir blk");
1599 for (indirdep = NULL, newindirdep = NULL; ; ) {
1600 ACQUIRE_LOCK(&lk);
1601 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1602 if (wk->wk_type != D_INDIRDEP)
1603 continue;
1604 indirdep = WK_INDIRDEP(wk);
1605 break;
1606 }
1607 if (indirdep == NULL && newindirdep) {
1608 indirdep = newindirdep;
1609 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1610 newindirdep = NULL;
1611 }
1612 FREE_LOCK(&lk);
1613 if (indirdep) {
1614 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1615 &newblk) == 0)
1616 panic("setup_allocindir: lost block");
1617 ACQUIRE_LOCK(&lk);
1618 if (newblk->nb_state == DEPCOMPLETE) {
1619 aip->ai_state |= DEPCOMPLETE;
1620 aip->ai_buf = NULL;
1621 } else {
1622 bmsafemap = newblk->nb_bmsafemap;
1623 aip->ai_buf = bmsafemap->sm_buf;
1624 LIST_REMOVE(newblk, nb_deps);
1625 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1626 aip, ai_deps);
1627 }
1628 LIST_REMOVE(newblk, nb_hash);
1629 FREE(newblk, M_NEWBLK);
1630 aip->ai_indirdep = indirdep;
1631 /*
1632 * Check to see if there is an existing dependency
1633 * for this block. If there is, merge the old
1634 * dependency into the new one.
1635 */
1636 if (aip->ai_oldblkno == 0)
1637 oldaip = NULL;
1638 else
1640 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1641 if (oldaip->ai_offset == aip->ai_offset)
1642 break;
1643 freefrag = NULL;
1644 if (oldaip != NULL) {
1645 if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1646 FREE_LOCK(&lk);
1647 panic("setup_allocindir_phase2: blkno");
1648 }
1649 aip->ai_oldblkno = oldaip->ai_oldblkno;
1650 freefrag = aip->ai_freefrag;
1651 aip->ai_freefrag = oldaip->ai_freefrag;
1652 oldaip->ai_freefrag = NULL;
1653 free_allocindir(oldaip, NULL);
1654 }
1655 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1656 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1657 [aip->ai_offset] = aip->ai_oldblkno;
1658 FREE_LOCK(&lk);
1659 if (freefrag != NULL)
1660 handle_workitem_freefrag(freefrag);
1661 }
1662 if (newindirdep) {
1663 			if (newindirdep->ir_savebp != NULL)
1664 brelse(newindirdep->ir_savebp);
1665 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1666 }
1667 if (indirdep)
1668 break;
1669 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1670 M_INDIRDEP, M_SOFTDEP_FLAGS);
1671 newindirdep->ir_list.wk_type = D_INDIRDEP;
1672 newindirdep->ir_state = ATTACHED;
1673 LIST_INIT(&newindirdep->ir_deplisthd);
1674 LIST_INIT(&newindirdep->ir_donehd);
1675 if (bp->b_blkno == bp->b_lblkno)
1676 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &bp->b_blkno, NULL, NULL);
1677 newindirdep->ir_savebp =
1678 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1679 BUF_KERNPROC(newindirdep->ir_savebp);
1680 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1681 }
1682}
1683
1684/*
1685 * Block de-allocation dependencies.
1686 *
1687 * When blocks are de-allocated, the on-disk pointers must be nullified before
1688 * the blocks are made available for use by other files. (The true
1689 * requirement is that old pointers must be nullified before new on-disk
1690 * pointers are set. We chose this slightly more stringent requirement to
1691 * reduce complexity.) Our implementation handles this dependency by updating
1692 * the inode (or indirect block) appropriately but delaying the actual block
1693 * de-allocation (i.e., freemap and free space count manipulation) until
1694 * after the updated versions reach stable storage. After the disk is
1695 * updated, the blocks can be safely de-allocated whenever it is convenient.
1696 * This implementation handles only the common case of reducing a file's
1697 * length to zero. Other cases are handled by the conventional synchronous
1698 * write approach.
1699 *
1700 * The ffs implementation with which we worked double-checks
1701 * the state of the block pointers and file size as it reduces
1702 * a file's length. Some of this code is replicated here in our
1703 * soft updates implementation. The freeblks->fb_chkcnt field is
1704 * used to transfer a part of this information to the procedure
1705 * that eventually de-allocates the blocks.
1706 *
1707 * This routine should be called from the routine that shortens
1708 * a file's length, before the inode's size or block pointers
1709 * are modified. It will save the block pointer information for
1710 * later release and zero the inode so that the calling routine
1711 * can release it.
1712 */
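/*
 * Illustrative sketch (not part of the original code): the ordering the
 * freeblks machinery enforces when a file is truncated to length zero.
 * The pointers are saved and zeroed before the inode is written; the
 * saved blocks are released only after that write has completed.  All
 * names here (NPTRS_SKETCH, struct freeblks_sketch, the two helpers)
 * are hypothetical.
 */
#if 0
#define NPTRS_SKETCH	12	/* direct block pointers, for illustration */

struct freeblks_sketch {
	int	saved[NPTRS_SKETCH];	/* block numbers to free later */
};

/* Step 1: save and clear the pointers before the inode is written. */
static void
setup_freeblocks_sketch(int *di_db, struct freeblks_sketch *fb)
{
	int i;

	for (i = 0; i < NPTRS_SKETCH; i++) {
		fb->saved[i] = di_db[i];
		di_db[i] = 0;	/* the on-disk inode will reference nothing */
	}
}

/* Step 2: run only after the zeroed inode has reached stable storage. */
static void
handle_freeblocks_sketch(struct freeblks_sketch *fb, void (*blkfree)(int))
{
	int i;

	for (i = 0; i < NPTRS_SKETCH; i++)
		if (fb->saved[i] != 0)
			(*blkfree)(fb->saved[i]);
}
#endif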
1713void
1714softdep_setup_freeblocks(ip, length)
1715 struct inode *ip; /* The inode whose length is to be reduced */
1716 off_t length; /* The new length for the file */
1717{
1718 struct freeblks *freeblks;
1719 struct inodedep *inodedep;
1720 struct allocdirect *adp;
1721 struct vnode *vp;
1722 struct buf *bp;
1723 struct fs *fs;
1724 int i, delay, error;
1725
1726 fs = ip->i_fs;
1727 if (length != 0)
61#include <sys/syslog.h>
62#include <sys/vnode.h>
63#include <sys/conf.h>
64#include <ufs/ufs/dir.h>
65#include <ufs/ufs/extattr.h>
66#include <ufs/ufs/quota.h>
67#include <ufs/ufs/inode.h>
68#include <ufs/ufs/ufsmount.h>
69#include <ufs/ffs/fs.h>
70#include <ufs/ffs/softdep.h>
71#include <ufs/ffs/ffs_extern.h>
72#include <ufs/ufs/ufs_extern.h>
73
74/*
75 * These definitions need to be adapted to the system to which
76 * this file is being ported.
77 */
78/*
79 * malloc types defined for the softdep system.
80 */
81static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
82static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
83static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
84static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
85static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
86static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
87static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
88static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
89static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
90static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
91static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
92static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
93static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
94
95#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
96
97#define D_PAGEDEP 0
98#define D_INODEDEP 1
99#define D_NEWBLK 2
100#define D_BMSAFEMAP 3
101#define D_ALLOCDIRECT 4
102#define D_INDIRDEP 5
103#define D_ALLOCINDIR 6
104#define D_FREEFRAG 7
105#define D_FREEBLKS 8
106#define D_FREEFILE 9
107#define D_DIRADD 10
108#define D_MKDIR 11
109#define D_DIRREM 12
110#define D_LAST D_DIRREM
111
112/*
113 * translate from workitem type to memory type
114 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
115 */
116static struct malloc_type *memtype[] = {
117 M_PAGEDEP,
118 M_INODEDEP,
119 M_NEWBLK,
120 M_BMSAFEMAP,
121 M_ALLOCDIRECT,
122 M_INDIRDEP,
123 M_ALLOCINDIR,
124 M_FREEFRAG,
125 M_FREEBLKS,
126 M_FREEFILE,
127 M_DIRADD,
128 M_MKDIR,
129 M_DIRREM
130};
131
132#define DtoM(type) (memtype[type])
133
134/*
135 * Names of malloc types.
136 */
137#define TYPENAME(type) \
138	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
139/*
140 * End system adaptation definitions.
141 */
142
143/*
144 * Internal function prototypes.
145 */
146static void softdep_error __P((char *, int));
147static void drain_output __P((struct vnode *, int));
148static int getdirtybuf __P((struct buf **, int));
149static void clear_remove __P((struct proc *));
150static void clear_inodedeps __P((struct proc *));
151static int flush_pagedep_deps __P((struct vnode *, struct mount *,
152 struct diraddhd *));
153static int flush_inodedep_deps __P((struct fs *, ino_t));
154static int handle_written_filepage __P((struct pagedep *, struct buf *));
155static void diradd_inode_written __P((struct diradd *, struct inodedep *));
156static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
157static void handle_allocdirect_partdone __P((struct allocdirect *));
158static void handle_allocindir_partdone __P((struct allocindir *));
159static void initiate_write_filepage __P((struct pagedep *, struct buf *));
160static void handle_written_mkdir __P((struct mkdir *, int));
161static void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
162static void handle_workitem_freefile __P((struct freefile *));
163static void handle_workitem_remove __P((struct dirrem *));
164static struct dirrem *newdirrem __P((struct buf *, struct inode *,
165 struct inode *, int, struct dirrem **));
166static void free_diradd __P((struct diradd *));
167static void free_allocindir __P((struct allocindir *, struct inodedep *));
168static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
169 long *));
170static void deallocate_dependencies __P((struct buf *, struct inodedep *));
171static void free_allocdirect __P((struct allocdirectlst *,
172 struct allocdirect *, int));
173static int check_inode_unwritten __P((struct inodedep *));
174static int free_inodedep __P((struct inodedep *));
175static void handle_workitem_freeblocks __P((struct freeblks *, int));
176static void merge_inode_lists __P((struct inodedep *));
177static void setup_allocindir_phase2 __P((struct buf *, struct inode *,
178 struct allocindir *));
179static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
180 ufs_daddr_t));
181static void handle_workitem_freefrag __P((struct freefrag *));
182static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
183static void allocdirect_merge __P((struct allocdirectlst *,
184 struct allocdirect *, struct allocdirect *));
185static struct bmsafemap *bmsafemap_lookup __P((struct buf *));
186static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
187 struct newblk **));
188static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
189static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
190 struct pagedep **));
191static void pause_timer __P((void *));
192static int request_cleanup __P((int, int));
193static int process_worklist_item __P((struct mount *, int));
194static void add_to_worklist __P((struct worklist *));
195
196/*
197 * Exported softdep operations.
198 */
199static void softdep_disk_io_initiation __P((struct buf *));
200static void softdep_disk_write_complete __P((struct buf *));
201static void softdep_deallocate_dependencies __P((struct buf *));
202static void softdep_move_dependencies __P((struct buf *, struct buf *));
203static int softdep_count_dependencies __P((struct buf *bp, int));
204
205struct bio_ops bioops = {
206 softdep_disk_io_initiation, /* io_start */
207 softdep_disk_write_complete, /* io_complete */
208 softdep_deallocate_dependencies, /* io_deallocate */
209 softdep_move_dependencies, /* io_movedeps */
210 softdep_count_dependencies, /* io_countdeps */
211};
212
213/*
214 * Locking primitives.
215 *
216 * For a uniprocessor, all we need to do is protect against disk
217 * interrupts. For a multiprocessor, this lock would have to be
218 * a mutex. A single mutex is used throughout this file, though
219 * finer grain locking could be used if contention warranted it.
220 *
221 * For a multiprocessor, the sleep call would accept a lock and
222 * release it after the sleep processing was complete. In a uniprocessor
223 * implementation there is no such interlock, so we simply mark
224 * the places where it needs to be done with the `interlocked' form
225 * of the lock calls. Since the uniprocessor sleep already interlocks
226 * the spl, there is nothing that really needs to be done.
227 */
228#ifndef /* NOT */ DEBUG
229static struct lockit {
230 int lkt_spl;
231} lk = { 0 };
232#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
233#define FREE_LOCK(lk) splx((lk)->lkt_spl)
234#define ACQUIRE_LOCK_INTERLOCKED(lk)
235#define FREE_LOCK_INTERLOCKED(lk)
236
237#else /* DEBUG */
238static struct lockit {
239 int lkt_spl;
240 pid_t lkt_held;
241} lk = { 0, -1 };
242static int lockcnt;
243
244static void acquire_lock __P((struct lockit *));
245static void free_lock __P((struct lockit *));
246static void acquire_lock_interlocked __P((struct lockit *));
247static void free_lock_interlocked __P((struct lockit *));
248
249#define ACQUIRE_LOCK(lk) acquire_lock(lk)
250#define FREE_LOCK(lk) free_lock(lk)
251#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk)
252#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk)
253
254static void
255acquire_lock(lk)
256 struct lockit *lk;
257{
258 pid_t holder;
259
260 if (lk->lkt_held != -1) {
261 holder = lk->lkt_held;
262 FREE_LOCK(lk);
263 if (holder == CURPROC->p_pid)
264 panic("softdep_lock: locking against myself");
265 else
266 panic("softdep_lock: lock held by %d", holder);
267 }
268 lk->lkt_spl = splbio();
269 lk->lkt_held = CURPROC->p_pid;
270 lockcnt++;
271}
272
273static void
274free_lock(lk)
275 struct lockit *lk;
276{
277
278 if (lk->lkt_held == -1)
279 panic("softdep_unlock: lock not held");
280 lk->lkt_held = -1;
281 splx(lk->lkt_spl);
282}
283
284static void
285acquire_lock_interlocked(lk)
286 struct lockit *lk;
287{
288 pid_t holder;
289
290 if (lk->lkt_held != -1) {
291 holder = lk->lkt_held;
292 FREE_LOCK(lk);
293 if (holder == CURPROC->p_pid)
294 panic("softdep_lock_interlocked: locking against self");
295 else
296 panic("softdep_lock_interlocked: lock held by %d",
297 holder);
298 }
299 lk->lkt_held = CURPROC->p_pid;
300 lockcnt++;
301}
302
303static void
304free_lock_interlocked(lk)
305 struct lockit *lk;
306{
307
308 if (lk->lkt_held == -1)
309 panic("softdep_unlock_interlocked: lock not held");
310 lk->lkt_held = -1;
311}
312#endif /* DEBUG */
313
314/*
315 * Placeholder for real semaphores.
316 */
317struct sema {
318 int value;
319 pid_t holder;
320 char *name;
321 int prio;
322 int timo;
323};
324static void sema_init __P((struct sema *, char *, int, int));
325static int sema_get __P((struct sema *, struct lockit *));
326static void sema_release __P((struct sema *));
327
328static void
329sema_init(semap, name, prio, timo)
330 struct sema *semap;
331 char *name;
332 int prio, timo;
333{
334
335 semap->holder = -1;
336 semap->value = 0;
337 semap->name = name;
338 semap->prio = prio;
339 semap->timo = timo;
340}
341
342static int
343sema_get(semap, interlock)
344 struct sema *semap;
345 struct lockit *interlock;
346{
347
348 if (semap->value++ > 0) {
349 if (interlock != NULL)
350 FREE_LOCK_INTERLOCKED(interlock);
351 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
352 if (interlock != NULL) {
353 ACQUIRE_LOCK_INTERLOCKED(interlock);
354 FREE_LOCK(interlock);
355 }
356 return (0);
357 }
358 semap->holder = CURPROC->p_pid;
359 if (interlock != NULL)
360 FREE_LOCK(interlock);
361 return (1);
362}
363
364static void
365sema_release(semap)
366 struct sema *semap;
367{
368
369 if (semap->value <= 0 || semap->holder != CURPROC->p_pid) {
370 if (lk.lkt_held != -1)
371 FREE_LOCK(&lk);
372 panic("sema_release: not held");
373 }
374 if (--semap->value > 0) {
375 semap->value = 0;
376 wakeup(semap);
377 }
378 semap->holder = -1;
379}
380
381/*
382 * Worklist queue management.
383 * These routines require that the lock be held.
384 */
385#ifndef /* NOT */ DEBUG
386#define WORKLIST_INSERT(head, item) do { \
387 (item)->wk_state |= ONWORKLIST; \
388 LIST_INSERT_HEAD(head, item, wk_list); \
389} while (0)
390#define WORKLIST_REMOVE(item) do { \
391 (item)->wk_state &= ~ONWORKLIST; \
392 LIST_REMOVE(item, wk_list); \
393} while (0)
394#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
395
396#else /* DEBUG */
397static void worklist_insert __P((struct workhead *, struct worklist *));
398static void worklist_remove __P((struct worklist *));
399static void workitem_free __P((struct worklist *, int));
400
401#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
402#define WORKLIST_REMOVE(item) worklist_remove(item)
403#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
404
405static void
406worklist_insert(head, item)
407 struct workhead *head;
408 struct worklist *item;
409{
410
411 if (lk.lkt_held == -1)
412 panic("worklist_insert: lock not held");
413 if (item->wk_state & ONWORKLIST) {
414 FREE_LOCK(&lk);
415 panic("worklist_insert: already on list");
416 }
417 item->wk_state |= ONWORKLIST;
418 LIST_INSERT_HEAD(head, item, wk_list);
419}
420
421static void
422worklist_remove(item)
423 struct worklist *item;
424{
425
426 if (lk.lkt_held == -1)
427 panic("worklist_remove: lock not held");
428 if ((item->wk_state & ONWORKLIST) == 0) {
429 FREE_LOCK(&lk);
430 panic("worklist_remove: not on list");
431 }
432 item->wk_state &= ~ONWORKLIST;
433 LIST_REMOVE(item, wk_list);
434}
435
436static void
437workitem_free(item, type)
438 struct worklist *item;
439 int type;
440{
441
442 if (item->wk_state & ONWORKLIST) {
443 if (lk.lkt_held != -1)
444 FREE_LOCK(&lk);
445 panic("workitem_free: still on list");
446 }
447 if (item->wk_type != type) {
448 if (lk.lkt_held != -1)
449 FREE_LOCK(&lk);
450 panic("workitem_free: type mismatch");
451 }
452 FREE(item, DtoM(type));
453}
454#endif /* DEBUG */
455
456/*
457 * Workitem queue management
458 */
459static struct workhead softdep_workitem_pending;
460static int num_on_worklist; /* number of worklist items to be processed */
461static int softdep_worklist_busy; /* 1 => trying to do unmount */
462static int softdep_worklist_req; /* serialized waiters */
463static int max_softdeps; /* maximum number of structs before slowdown */
464static int tickdelay = 2; /* number of ticks to pause during slowdown */
465static int proc_waiting; /* tracks whether we have a timeout posted */
466static int *stat_countp; /* statistic to count in proc_waiting timeout */
467static struct callout_handle handle; /* handle on posted proc_waiting timeout */
468static struct proc *filesys_syncer; /* proc of filesystem syncer process */
469static int req_clear_inodedeps; /* syncer process flush some inodedeps */
470#define FLUSH_INODES 1
471static int req_clear_remove; /* syncer process flush some freeblks */
472#define FLUSH_REMOVE 2
473/*
474 * runtime statistics
475 */
476static int stat_worklist_push; /* number of worklist cleanups */
477static int stat_blk_limit_push; /* number of times block limit neared */
478static int stat_ino_limit_push; /* number of times inode limit neared */
479static int stat_blk_limit_hit; /* number of times block slowdown imposed */
480static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
481static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
482static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
483static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
484static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
485static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
486#ifdef DEBUG
487#include <vm/vm.h>
488#include <sys/sysctl.h>
489SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
490SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
491SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
492SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
493SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
494SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
495SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
496SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
497SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
498SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
499SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
500SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
501#endif /* DEBUG */
502
503/*
504 * Add an item to the end of the work queue.
505 * This routine requires that the lock be held.
506 * This is the only routine that adds items to the list.
507 * The following routine is the only one that removes items
508 * and does so in order from first to last.
509 */
510static void
511add_to_worklist(wk)
512 struct worklist *wk;
513{
514 static struct worklist *worklist_tail;
515
516 if (wk->wk_state & ONWORKLIST) {
517 if (lk.lkt_held != -1)
518 FREE_LOCK(&lk);
519 panic("add_to_worklist: already on list");
520 }
521 wk->wk_state |= ONWORKLIST;
522 if (LIST_FIRST(&softdep_workitem_pending) == NULL)
523 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
524 else
525 LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
526 worklist_tail = wk;
527 num_on_worklist += 1;
528}
529
530/*
531 * Process that runs once per second to handle items in the background queue.
532 *
533 * Note that we ensure that items are processed in the order in which they
534 * appear in the queue. The code below depends on this property to ensure
535 * that blocks of a file are freed before the inode itself is freed. This
536 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
537 * until all the old ones have been purged from the dependency lists.
538 */
539int
540softdep_process_worklist(matchmnt)
541 struct mount *matchmnt;
542{
543 struct proc *p = CURPROC;
544 int matchcnt, loopcount;
545 long starttime;
546
547 /*
548 * Record the process identifier of our caller so that we can give
549 * this process preferential treatment in request_cleanup below.
550 */
551 filesys_syncer = p;
552 matchcnt = 0;
553
554 /*
555 * There is no danger of having multiple processes run this
556 * code, but we have to single-thread it when softdep_flushfiles()
557 * is in operation to get an accurate count of the number of items
558 * related to its mount point that are in the list.
559 */
560 if (matchmnt == NULL) {
561 if (softdep_worklist_busy < 0)
562 return(-1);
563 softdep_worklist_busy += 1;
564 }
565
566 /*
567 * If requested, try removing inode or removal dependencies.
568 */
569 if (req_clear_inodedeps) {
570 clear_inodedeps(p);
571 req_clear_inodedeps -= 1;
572 wakeup_one(&proc_waiting);
573 }
574 if (req_clear_remove) {
575 clear_remove(p);
576 req_clear_remove -= 1;
577 wakeup_one(&proc_waiting);
578 }
579 loopcount = 1;
580 starttime = time_second;
581 while (num_on_worklist > 0) {
582 matchcnt += process_worklist_item(matchmnt, 0);
583
584 /*
585 * If a umount operation wants to run the worklist
586 * accurately, abort.
587 */
588 if (softdep_worklist_req && matchmnt == NULL) {
589 matchcnt = -1;
590 break;
591 }
592
593 /*
594 * If requested, try removing inode or removal dependencies.
595 */
596 if (req_clear_inodedeps) {
597 clear_inodedeps(p);
598 req_clear_inodedeps -= 1;
599 wakeup_one(&proc_waiting);
600 }
601 if (req_clear_remove) {
602 clear_remove(p);
603 req_clear_remove -= 1;
604 wakeup_one(&proc_waiting);
605 }
606 /*
607 * We do not generally want to stop for buffer space, but if
608 * we are really being a buffer hog, we will stop and wait.
609 */
610 if (loopcount++ % 128 == 0)
611 bwillwrite();
612 /*
613 * Never allow processing to run for more than one
614 * second. Otherwise the other syncer tasks may get
615 * excessively backlogged.
616 */
617 if (starttime != time_second && matchmnt == NULL) {
618 matchcnt = -1;
619 break;
620 }
621 }
622 if (matchmnt == NULL) {
623 softdep_worklist_busy -= 1;
624 if (softdep_worklist_req && softdep_worklist_busy == 0)
625 wakeup(&softdep_worklist_req);
626 }
627 return (matchcnt);
628}
629
630/*
631 * Process one item on the worklist.
632 */
633static int
634process_worklist_item(matchmnt, flags)
635 struct mount *matchmnt;
636 int flags;
637{
638 struct worklist *wk;
639 struct dirrem *dirrem;
640 struct mount *mp;
641 struct vnode *vp;
642 int matchcnt = 0;
643
644 ACQUIRE_LOCK(&lk);
645 /*
646 * Normally we just process each item on the worklist in order.
647 * However, if we are in a situation where we cannot lock any
648 * inodes, we have to skip over any dirrem requests whose
649 * vnodes are resident and locked.
650 */
651 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
652 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
653 break;
654 dirrem = WK_DIRREM(wk);
655 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
656 dirrem->dm_oldinum);
657 if (vp == NULL || !VOP_ISLOCKED(vp, CURPROC))
658 break;
659 }
660 if (wk == 0) {
661 FREE_LOCK(&lk);
662 return (0);
663 }
664 WORKLIST_REMOVE(wk);
665 num_on_worklist -= 1;
666 FREE_LOCK(&lk);
667 switch (wk->wk_type) {
668
669 case D_DIRREM:
670 /* removal of a directory entry */
671 mp = WK_DIRREM(wk)->dm_mnt;
672 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
673 panic("%s: dirrem on suspended filesystem",
674 "process_worklist_item");
675 if (mp == matchmnt)
676 matchcnt += 1;
677 handle_workitem_remove(WK_DIRREM(wk));
678 break;
679
680 case D_FREEBLKS:
681 /* releasing blocks and/or fragments from a file */
682 mp = WK_FREEBLKS(wk)->fb_mnt;
683 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
684 panic("%s: freeblks on suspended filesystem",
685 "process_worklist_item");
686 if (mp == matchmnt)
687 matchcnt += 1;
688 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
689 break;
690
691 case D_FREEFRAG:
692 /* releasing a fragment when replaced as a file grows */
693 mp = WK_FREEFRAG(wk)->ff_mnt;
694 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
695 panic("%s: freefrag on suspended filesystem",
696 "process_worklist_item");
697 if (mp == matchmnt)
698 matchcnt += 1;
699 handle_workitem_freefrag(WK_FREEFRAG(wk));
700 break;
701
702 case D_FREEFILE:
703 /* releasing an inode when its link count drops to 0 */
704 mp = WK_FREEFILE(wk)->fx_mnt;
705 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
706 panic("%s: freefile on suspended filesystem",
707 "process_worklist_item");
708 if (mp == matchmnt)
709 matchcnt += 1;
710 handle_workitem_freefile(WK_FREEFILE(wk));
711 break;
712
713 default:
714 panic("%s_process_worklist: Unknown type %s",
715 "softdep", TYPENAME(wk->wk_type));
716 /* NOTREACHED */
717 }
718 return (matchcnt);
719}
720
721/*
722 * Move dependencies from one buffer to another.
723 */
724static void
725softdep_move_dependencies(oldbp, newbp)
726 struct buf *oldbp;
727 struct buf *newbp;
728{
729 struct worklist *wk, *wktail;
730
731 if (LIST_FIRST(&newbp->b_dep) != NULL)
732 panic("softdep_move_dependencies: need merge code");
733 wktail = 0;
734 ACQUIRE_LOCK(&lk);
735 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
736 LIST_REMOVE(wk, wk_list);
737 if (wktail == 0)
738 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
739 else
740 LIST_INSERT_AFTER(wktail, wk, wk_list);
741 wktail = wk;
742 }
743 FREE_LOCK(&lk);
744}
745
746/*
747 * Purge the work list of all items associated with a particular mount point.
748 */
749int
750softdep_flushworklist(oldmnt, countp, p)
751 struct mount *oldmnt;
752 int *countp;
753 struct proc *p;
754{
755 struct vnode *devvp;
756 int count, error = 0;
757
758 /*
759 * Await our turn to clear out the queue, then serialize access.
760 */
761 while (softdep_worklist_busy) {
762 softdep_worklist_req += 1;
763 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
764 softdep_worklist_req -= 1;
765 }
766 softdep_worklist_busy = -1;
767 /*
768 * Alternately flush the block device associated with the mount
769 * point and process any dependencies that the flushing
770 * creates. We continue until no more worklist dependencies
771 * are found.
772 */
773 *countp = 0;
774 devvp = VFSTOUFS(oldmnt)->um_devvp;
775 while ((count = softdep_process_worklist(oldmnt)) > 0) {
776 *countp += count;
777 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
778 error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
779 VOP_UNLOCK(devvp, 0, p);
780 if (error)
781 break;
782 }
783 softdep_worklist_busy = 0;
784 if (softdep_worklist_req)
785 wakeup(&softdep_worklist_req);
786 return (error);
787}
788
789/*
790 * Flush all vnodes and worklist items associated with a specified mount point.
791 */
792int
793softdep_flushfiles(oldmnt, flags, p)
794 struct mount *oldmnt;
795 int flags;
796 struct proc *p;
797{
798 int error, count, loopcnt;
799
800 /*
801 * Alternately flush the vnodes associated with the mount
802 * point and process any dependencies that the flushing
803 * creates. In theory, this loop should iterate at most twice,
804 * but we allow a few extra passes just to be sure.
805 */
806 for (loopcnt = 10; loopcnt > 0; loopcnt--) {
807 /*
808 * Do another flush in case any vnodes were brought in
809 * as part of the cleanup operations.
810 */
811 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
812 break;
813 if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
814 count == 0)
815 break;
816 }
817 /*
818 * If we are unmounting then it is an error to fail. If we
819 * are simply trying to downgrade to read-only, then filesystem
820 * activity can keep us busy forever, so we just fail with EBUSY.
821 */
822 if (loopcnt == 0) {
823 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
824 panic("softdep_flushfiles: looping");
825 error = EBUSY;
826 }
827 return (error);
828}
829
830/*
831 * Structure hashing.
832 *
833 * There are three types of structures that can be looked up:
834 * 1) pagedep structures identified by mount point, inode number,
835 * and logical block.
836 * 2) inodedep structures identified by mount point and inode number.
837 * 3) newblk structures identified by mount point and
838 * physical block number.
839 *
840 * The "pagedep" and "inodedep" dependency structures are hashed
841 * separately from the file blocks and inodes to which they correspond.
842 * This separation helps when the in-memory copy of an inode or
843 * file block must be replaced. It also obviates the need to access
844 * an inode or file page when simply updating (or de-allocating)
845 * dependency structures. Lookup of newblk structures is needed to
846 * find newly allocated blocks when trying to associate them with
847 * their allocdirect or allocindir structure.
848 *
849 * The lookup routines optionally create and hash a new instance when
850 * an existing entry is not found.
851 */
852#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
853#define NODELAY 0x0002 /* cannot do background work */
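/*
 * Illustrative sketch (not part of the original code): the
 * lookup-or-create pattern shared by pagedep_lookup(), inodedep_lookup()
 * and newblk_lookup() below.  The helpers used here (find_sketch,
 * alloc_sketch, insert_sketch, the in_progress semaphore and struct
 * entry_sketch) are hypothetical; the point is the retry after a
 * contested sema_get(), which covers the window in which the lock is
 * dropped so that the allocation may sleep.
 */
#if 0
static int
lookup_or_create_sketch(int key, int flags, struct entry_sketch **epp)
{
top:
	if ((*epp = find_sketch(key)) != NULL)
		return (1);		/* existing entry found */
	if ((flags & DEPALLOC) == 0) {
		*epp = NULL;
		return (0);		/* caller did not ask for creation */
	}
	/*
	 * If another thread was already creating this entry, it has
	 * finished by the time sema_get() returns, so search again
	 * rather than allocating a duplicate.
	 */
	if (sema_get(&in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	*epp = alloc_sketch(key);	/* may sleep; lock is not held */
	ACQUIRE_LOCK(&lk);
	insert_sketch(key, *epp);
	sema_release(&in_progress);
	return (0);			/* newly created entry */
}
#endif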
854
855/*
856 * Structures and routines associated with pagedep caching.
857 */
858LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
859u_long pagedep_hash; /* size of hash table - 1 */
860#define PAGEDEP_HASH(mp, inum, lbn) \
861 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
862 pagedep_hash])
863static struct sema pagedep_in_progress;
864
865/*
866 * Look up a pagedep. Return 1 if found, 0 if not found.
867 * If not found, allocate if DEPALLOC flag is passed.
868 * Found or allocated entry is returned in pagedeppp.
869 * This routine must be called with splbio interrupts blocked.
870 */
871static int
872pagedep_lookup(ip, lbn, flags, pagedeppp)
873 struct inode *ip;
874 ufs_lbn_t lbn;
875 int flags;
876 struct pagedep **pagedeppp;
877{
878 struct pagedep *pagedep;
879 struct pagedep_hashhead *pagedephd;
880 struct mount *mp;
881 int i;
882
883#ifdef DEBUG
884 if (lk.lkt_held == -1)
885 panic("pagedep_lookup: lock not held");
886#endif
887 mp = ITOV(ip)->v_mount;
888 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
889top:
890 LIST_FOREACH(pagedep, pagedephd, pd_hash)
891 if (ip->i_number == pagedep->pd_ino &&
892 lbn == pagedep->pd_lbn &&
893 mp == pagedep->pd_mnt)
894 break;
895 if (pagedep) {
896 *pagedeppp = pagedep;
897 return (1);
898 }
899 if ((flags & DEPALLOC) == 0) {
900 *pagedeppp = NULL;
901 return (0);
902 }
903 if (sema_get(&pagedep_in_progress, &lk) == 0) {
904 ACQUIRE_LOCK(&lk);
905 goto top;
906 }
907 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
908 M_SOFTDEP_FLAGS|M_ZERO);
909 pagedep->pd_list.wk_type = D_PAGEDEP;
910 pagedep->pd_mnt = mp;
911 pagedep->pd_ino = ip->i_number;
912 pagedep->pd_lbn = lbn;
913 LIST_INIT(&pagedep->pd_dirremhd);
914 LIST_INIT(&pagedep->pd_pendinghd);
915 for (i = 0; i < DAHASHSZ; i++)
916 LIST_INIT(&pagedep->pd_diraddhd[i]);
917 ACQUIRE_LOCK(&lk);
918 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
919 sema_release(&pagedep_in_progress);
920 *pagedeppp = pagedep;
921 return (0);
922}
923
924/*
925 * Structures and routines associated with inodedep caching.
926 */
927LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
928static u_long inodedep_hash; /* size of hash table - 1 */
929static long num_inodedep; /* number of inodedep allocated */
930#define INODEDEP_HASH(fs, inum) \
931 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
932static struct sema inodedep_in_progress;
933
934/*
935 * Look up an inodedep. Return 1 if found, 0 if not found.
936 * If not found, allocate if DEPALLOC flag is passed.
937 * Found or allocated entry is returned in inodedeppp.
938 * This routine must be called with splbio interrupts blocked.
939 */
940static int
941inodedep_lookup(fs, inum, flags, inodedeppp)
942 struct fs *fs;
943 ino_t inum;
944 int flags;
945 struct inodedep **inodedeppp;
946{
947 struct inodedep *inodedep;
948 struct inodedep_hashhead *inodedephd;
949 int firsttry;
950
951#ifdef DEBUG
952 if (lk.lkt_held == -1)
953 panic("inodedep_lookup: lock not held");
954#endif
955 firsttry = 1;
956 inodedephd = INODEDEP_HASH(fs, inum);
957top:
958 LIST_FOREACH(inodedep, inodedephd, id_hash)
959 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
960 break;
961 if (inodedep) {
962 *inodedeppp = inodedep;
963 return (1);
964 }
965 if ((flags & DEPALLOC) == 0) {
966 *inodedeppp = NULL;
967 return (0);
968 }
969 /*
970 * If we are over our limit, try to improve the situation.
971 */
972 if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
973 request_cleanup(FLUSH_INODES, 1)) {
974 firsttry = 0;
975 goto top;
976 }
977 if (sema_get(&inodedep_in_progress, &lk) == 0) {
978 ACQUIRE_LOCK(&lk);
979 goto top;
980 }
981 num_inodedep += 1;
982 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
983 M_INODEDEP, M_SOFTDEP_FLAGS);
984 inodedep->id_list.wk_type = D_INODEDEP;
985 inodedep->id_fs = fs;
986 inodedep->id_ino = inum;
987 inodedep->id_state = ALLCOMPLETE;
988 inodedep->id_nlinkdelta = 0;
989 inodedep->id_savedino = NULL;
990 inodedep->id_savedsize = -1;
991 inodedep->id_buf = NULL;
992 LIST_INIT(&inodedep->id_pendinghd);
993 LIST_INIT(&inodedep->id_inowait);
994 LIST_INIT(&inodedep->id_bufwait);
995 TAILQ_INIT(&inodedep->id_inoupdt);
996 TAILQ_INIT(&inodedep->id_newinoupdt);
997 ACQUIRE_LOCK(&lk);
998 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
999 sema_release(&inodedep_in_progress);
1000 *inodedeppp = inodedep;
1001 return (0);
1002}
1003
1004/*
1005 * Structures and routines associated with newblk caching.
1006 */
1007LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1008u_long newblk_hash; /* size of hash table - 1 */
1009#define NEWBLK_HASH(fs, inum) \
1010 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1011static struct sema newblk_in_progress;
1012
1013/*
1014 * Look up a newblk. Return 1 if found, 0 if not found.
1015 * If not found, allocate if DEPALLOC flag is passed.
1016 * Found or allocated entry is returned in newblkpp.
1017 */
1018static int
1019newblk_lookup(fs, newblkno, flags, newblkpp)
1020 struct fs *fs;
1021 ufs_daddr_t newblkno;
1022 int flags;
1023 struct newblk **newblkpp;
1024{
1025 struct newblk *newblk;
1026 struct newblk_hashhead *newblkhd;
1027
1028 newblkhd = NEWBLK_HASH(fs, newblkno);
1029top:
1030 LIST_FOREACH(newblk, newblkhd, nb_hash)
1031 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1032 break;
1033 if (newblk) {
1034 *newblkpp = newblk;
1035 return (1);
1036 }
1037 if ((flags & DEPALLOC) == 0) {
1038 *newblkpp = NULL;
1039 return (0);
1040 }
1041 if (sema_get(&newblk_in_progress, 0) == 0)
1042 goto top;
1043 MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1044 M_NEWBLK, M_SOFTDEP_FLAGS);
1045 newblk->nb_state = 0;
1046 newblk->nb_fs = fs;
1047 newblk->nb_newblkno = newblkno;
1048 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1049 sema_release(&newblk_in_progress);
1050 *newblkpp = newblk;
1051 return (0);
1052}
1053
1054/*
1055 * Executed during filesystem initialization before
1056 * mounting any file systems.
1057 */
1058void
1059softdep_initialize()
1060{
1061
1062 LIST_INIT(&mkdirlisthd);
1063 LIST_INIT(&softdep_workitem_pending);
1064 max_softdeps = min(desiredvnodes * 8,
1065 M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
1066 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1067 &pagedep_hash);
1068 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1069 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1070 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1071 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1072 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1073}
1074
1075/*
1076 * Called at mount time to notify the dependency code that a
1077 * filesystem wishes to use it.
1078 */
1079int
1080softdep_mount(devvp, mp, fs, cred)
1081 struct vnode *devvp;
1082 struct mount *mp;
1083 struct fs *fs;
1084 struct ucred *cred;
1085{
1086 struct csum cstotal;
1087 struct cg *cgp;
1088 struct buf *bp;
1089 int error, cyl;
1090
1091 mp->mnt_flag &= ~MNT_ASYNC;
1092 mp->mnt_flag |= MNT_SOFTDEP;
1093 /*
1094 * When doing soft updates, the counters in the
1095 * superblock may have gotten out of sync, so we have
1096 * to scan the cylinder groups and recalculate them.
1097 */
1098 if (fs->fs_clean != 0)
1099 return (0);
1100 bzero(&cstotal, sizeof cstotal);
1101 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1102 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1103 fs->fs_cgsize, cred, &bp)) != 0) {
1104 brelse(bp);
1105 return (error);
1106 }
1107 cgp = (struct cg *)bp->b_data;
1108 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1109 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1110 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1111 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1112 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1113 brelse(bp);
1114 }
1115#ifdef DEBUG
1116 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1117 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1118#endif
1119 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1120 return (0);
1121}
1122
1123/*
1124 * Protecting the freemaps (or bitmaps).
1125 *
1126 * To eliminate the need to execute fsck before mounting a file system
1127 * after a power failure, one must (conservatively) guarantee that the
1128 * on-disk copy of the bitmaps never indicates that a live inode or block is
1129 * free. So, when a block or inode is allocated, the bitmap should be
1130 * updated (on disk) before any new pointers. When a block or inode is
1131 * freed, the bitmap should not be updated until all pointers have been
1132 * reset. The latter dependency is handled by the delayed de-allocation
1133 * approach described below for block and inode de-allocation. The former
1134 * dependency is handled by calling the following procedure when a block or
1135 * inode is allocated. When an inode is allocated an "inodedep" is created
1136 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1137 * Each "inodedep" is also inserted into the hash indexing structure so
1138 * that any additional link additions can be made dependent on the inode
1139 * allocation.
1140 *
1141 * The ufs file system maintains a number of free block counts (e.g., per
1142 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1143 * in addition to the bitmaps. These counts are used to improve efficiency
1144 * during allocation and therefore must be consistent with the bitmaps.
1145 * There is no convenient way to guarantee post-crash consistency of these
1146 * counts with simple update ordering, for two main reasons: (1) The counts
1147 * and bitmaps for a single cylinder group block are not in the same disk
1148 * sector. If a disk write is interrupted (e.g., by power failure), one may
1149 * be written and the other not. (2) Some of the counts are located in the
1150 * superblock rather than the cylinder group block. So, we focus our soft
1151 * updates implementation on protecting the bitmaps. When mounting a
1152 * filesystem, we recompute the auxiliary counts from the bitmaps.
1153 */
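/*
 * Illustrative sketch (not part of the original code): the life of the
 * DEPCOMPLETE flag for a bitmap dependency.  The flag starts cleared
 * when the allocation is recorded and is set once the cylinder group
 * buffer has been written.  The struct and helper names here are
 * hypothetical; the real work is done by softdep_setup_inomapdep(),
 * softdep_setup_blkmapdep() and the buffer write-completion handlers.
 */
#if 0
struct bitmapdep_sketch {
	int		state;	/* dependency state flags */
	struct buf	*buf;	/* cylinder group buffer being waited on */
};

/* At allocation time: the bitmap naming the new inode/block is only in memory. */
static void
allocation_side_sketch(struct bitmapdep_sketch *dep, struct buf *cgbp)
{
	dep->state &= ~DEPCOMPLETE;	/* bitmap not yet on disk */
	dep->buf = cgbp;		/* remember the buffer to wait on */
}

/* After the cylinder group buffer reaches stable storage. */
static void
bitmap_written_sketch(struct bitmapdep_sketch *dep)
{
	dep->state |= DEPCOMPLETE;	/* pointers to the allocation may now be written */
	dep->buf = NULL;
}
#endif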
1154
1155/*
1156 * Called just after updating the cylinder group block to allocate an inode.
1157 */
1158void
1159softdep_setup_inomapdep(bp, ip, newinum)
1160 struct buf *bp; /* buffer for cylgroup block with inode map */
1161 struct inode *ip; /* inode related to allocation */
1162 ino_t newinum; /* new inode number being allocated */
1163{
1164 struct inodedep *inodedep;
1165 struct bmsafemap *bmsafemap;
1166
1167 /*
1168 * Create a dependency for the newly allocated inode.
1169 * Panic if it already exists, as something is seriously wrong.
1170 * Otherwise add it to the dependency list for the buffer holding
1171 * the cylinder group map from which it was allocated.
1172 */
1173 ACQUIRE_LOCK(&lk);
1174 if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1175 FREE_LOCK(&lk);
1176 panic("softdep_setup_inomapdep: found inode");
1177 }
1178 inodedep->id_buf = bp;
1179 inodedep->id_state &= ~DEPCOMPLETE;
1180 bmsafemap = bmsafemap_lookup(bp);
1181 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1182 FREE_LOCK(&lk);
1183}
1184
1185/*
1186 * Called just after updating the cylinder group block to
1187 * allocate a block or fragment.
1188 */
1189void
1190softdep_setup_blkmapdep(bp, fs, newblkno)
1191 struct buf *bp; /* buffer for cylgroup block with block map */
1192 struct fs *fs; /* filesystem doing allocation */
1193 ufs_daddr_t newblkno; /* number of newly allocated block */
1194{
1195 struct newblk *newblk;
1196 struct bmsafemap *bmsafemap;
1197
1198 /*
1199 * Create a dependency for the newly allocated block.
1200 * Add it to the dependency list for the buffer holding
1201 * the cylinder group map from which it was allocated.
1202 */
1203 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1204 panic("softdep_setup_blkmapdep: found block");
1205 ACQUIRE_LOCK(&lk);
1206 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1207 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1208 FREE_LOCK(&lk);
1209}
1210
1211/*
1212 * Find the bmsafemap associated with a cylinder group buffer.
1213 * If none exists, create one. The buffer must be locked when
1214 * this routine is called and this routine must be called with
1215 * splbio interrupts blocked.
1216 */
1217static struct bmsafemap *
1218bmsafemap_lookup(bp)
1219 struct buf *bp;
1220{
1221 struct bmsafemap *bmsafemap;
1222 struct worklist *wk;
1223
1224#ifdef DEBUG
1225 if (lk.lkt_held == -1)
1226 panic("bmsafemap_lookup: lock not held");
1227#endif
1228 LIST_FOREACH(wk, &bp->b_dep, wk_list)
1229 if (wk->wk_type == D_BMSAFEMAP)
1230 return (WK_BMSAFEMAP(wk));
1231 FREE_LOCK(&lk);
1232 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1233 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1234 bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1235 bmsafemap->sm_list.wk_state = 0;
1236 bmsafemap->sm_buf = bp;
1237 LIST_INIT(&bmsafemap->sm_allocdirecthd);
1238 LIST_INIT(&bmsafemap->sm_allocindirhd);
1239 LIST_INIT(&bmsafemap->sm_inodedephd);
1240 LIST_INIT(&bmsafemap->sm_newblkhd);
1241 ACQUIRE_LOCK(&lk);
1242 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1243 return (bmsafemap);
1244}
1245
1246/*
1247 * Direct block allocation dependencies.
1248 *
1249 * When a new block is allocated, the corresponding disk locations must be
1250 * initialized (with zeros or new data) before the on-disk inode points to
1251 * them. Also, the freemap from which the block was allocated must be
1252 * updated (on disk) before the inode's pointer. These two dependencies are
1253 * independent of each other and are needed for all file blocks and indirect
1254 * blocks that are pointed to directly by the inode. Just before the
1255 * "in-core" version of the inode is updated with a newly allocated block
1256 * number, a procedure (below) is called to set up allocation dependency
1257 * structures. These structures are removed when the corresponding
1258 * dependencies are satisfied or when the block allocation becomes obsolete
1259 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1260 * fragment that gets upgraded). All of these cases are handled in
1261 * procedures described later.
1262 *
1263 * When a file extension causes a fragment to be upgraded, either to a larger
1264 * fragment or to a full block, the on-disk location may change (if the
1265 * previous fragment could not simply be extended). In this case, the old
1266 * fragment must be de-allocated, but not until after the inode's pointer has
1267 * been updated. In most cases, this is handled by later procedures, which
1268 * will construct a "freefrag" structure to be added to the workitem queue
1269 * when the inode update is complete (or obsolete). The main exception to
1270 * this is when an allocation occurs while a pending allocation dependency
1271 * (for the same block pointer) remains. This case is handled in the main
1272 * allocation dependency setup procedure by immediately freeing the
1273 * unreferenced fragments.
1274 */
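/*
 * Illustrative sketch (not part of the original code): the two-phase
 * release of a displaced fragment described above.  The old location is
 * only recorded while the inode update is pending; it is handed to the
 * worklist for actual release once the updated inode is safely on disk.
 * All names here are hypothetical; the real work is done by
 * newfreefrag(), handle_workitem_freefrag() and the routines below.
 */
#if 0
struct freefrag_sketch {
	int	blkno;		/* old location to release */
	long	size;		/* size of the old fragment */
	int	queued;		/* has been handed to the worklist */
};

/*
 * Phase 1: while the inode update is pending, the displaced fragment is
 * only recorded; nothing is returned to the freemap yet.
 */
static void
record_displaced_fragment_sketch(struct freefrag_sketch *ff,
    int oldblkno, long oldsize)
{
	ff->blkno = oldblkno;
	ff->size = oldsize;
	ff->queued = 0;
}

/*
 * Phase 2: once the inode (with its new pointer) has been written, the
 * fragment can be released in the background by the worklist.
 */
static void
inode_update_complete_sketch(struct freefrag_sketch *ff,
    void (*add_to_worklist_fn)(struct freefrag_sketch *))
{
	if (!ff->queued) {
		ff->queued = 1;
		(*add_to_worklist_fn)(ff);	/* released later, as in handle_workitem_freefrag() */
	}
}
#endif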
1275void
1276softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1277 struct inode *ip; /* inode to which block is being added */
1278 ufs_lbn_t lbn; /* block pointer within inode */
1279 ufs_daddr_t newblkno; /* disk block number being added */
1280 ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */
1281 long newsize; /* size of new block */
1282	long oldsize;		/* size of old block */
1283 struct buf *bp; /* bp for allocated block */
1284{
1285 struct allocdirect *adp, *oldadp;
1286 struct allocdirectlst *adphead;
1287 struct bmsafemap *bmsafemap;
1288 struct inodedep *inodedep;
1289 struct pagedep *pagedep;
1290 struct newblk *newblk;
1291
1292 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1293 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1294 adp->ad_list.wk_type = D_ALLOCDIRECT;
1295 adp->ad_lbn = lbn;
1296 adp->ad_newblkno = newblkno;
1297 adp->ad_oldblkno = oldblkno;
1298 adp->ad_newsize = newsize;
1299 adp->ad_oldsize = oldsize;
1300 adp->ad_state = ATTACHED;
1301 if (newblkno == oldblkno)
1302 adp->ad_freefrag = NULL;
1303 else
1304 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1305
1306 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1307 panic("softdep_setup_allocdirect: lost block");
1308
1309 ACQUIRE_LOCK(&lk);
1310 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1311 adp->ad_inodedep = inodedep;
1312
1313 if (newblk->nb_state == DEPCOMPLETE) {
1314 adp->ad_state |= DEPCOMPLETE;
1315 adp->ad_buf = NULL;
1316 } else {
1317 bmsafemap = newblk->nb_bmsafemap;
1318 adp->ad_buf = bmsafemap->sm_buf;
1319 LIST_REMOVE(newblk, nb_deps);
1320 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1321 }
1322 LIST_REMOVE(newblk, nb_hash);
1323 FREE(newblk, M_NEWBLK);
1324
1325 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1326 if (lbn >= NDADDR) {
1327 /* allocating an indirect block */
1328 if (oldblkno != 0) {
1329 FREE_LOCK(&lk);
1330 panic("softdep_setup_allocdirect: non-zero indir");
1331 }
1332 } else {
1333 /*
1334 * Allocating a direct block.
1335 *
1336 * If we are allocating a directory block, then we must
1337 * allocate an associated pagedep to track additions and
1338 * deletions.
1339 */
1340 if ((ip->i_mode & IFMT) == IFDIR &&
1341 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1342 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1343 }
1344 /*
1345 * The list of allocdirects must be kept sorted in ascending
1346 * order so that the rollback routines can quickly determine the
1347 * first uncommitted block (the size of the file stored on disk
1348 * ends at the end of the lowest committed fragment, or if there
1349 * are no fragments, at the end of the highest committed block).
1350 * Since files generally grow, the typical case is that the new
1351 * block is to be added at the end of the list. We speed up this
1352 * common case by checking against the last allocdirect in the
1353 * list before laboriously traversing the list looking for the
1354 * insertion point.
1355 */
1356 adphead = &inodedep->id_newinoupdt;
1357 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1358 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1359 /* insert at end of list */
1360 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1361 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1362 allocdirect_merge(adphead, adp, oldadp);
1363 FREE_LOCK(&lk);
1364 return;
1365 }
1366 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1367 if (oldadp->ad_lbn >= lbn)
1368 break;
1369 }
1370 if (oldadp == NULL) {
1371 FREE_LOCK(&lk);
1372 panic("softdep_setup_allocdirect: lost entry");
1373 }
1374 /* insert in middle of list */
1375 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1376 if (oldadp->ad_lbn == lbn)
1377 allocdirect_merge(adphead, adp, oldadp);
1378 FREE_LOCK(&lk);
1379}
1380
1381/*
1382 * Replace an old allocdirect dependency with a newer one.
1383 * This routine must be called with splbio interrupts blocked.
1384 */
1385static void
1386allocdirect_merge(adphead, newadp, oldadp)
1387 struct allocdirectlst *adphead; /* head of list holding allocdirects */
1388 struct allocdirect *newadp; /* allocdirect being added */
1389 struct allocdirect *oldadp; /* existing allocdirect being checked */
1390{
1391 struct freefrag *freefrag;
1392
1393#ifdef DEBUG
1394 if (lk.lkt_held == -1)
1395 panic("allocdirect_merge: lock not held");
1396#endif
1397 if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1398 newadp->ad_oldsize != oldadp->ad_newsize ||
1399 newadp->ad_lbn >= NDADDR) {
1400 FREE_LOCK(&lk);
1401		panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
1402 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1403 NDADDR);
1404 }
1405 newadp->ad_oldblkno = oldadp->ad_oldblkno;
1406 newadp->ad_oldsize = oldadp->ad_oldsize;
1407 /*
1408 * If the old dependency had a fragment to free or had never
1409 * previously had a block allocated, then the new dependency
1410 * can immediately post its freefrag and adopt the old freefrag.
1411 * This action is done by swapping the freefrag dependencies.
1412 * The new dependency gains the old one's freefrag, and the
1413 * old one gets the new one and then immediately puts it on
1414 * the worklist when it is freed by free_allocdirect. It is
1415 * not possible to do this swap when the old dependency had a
1416 * non-zero size but no previous fragment to free. This condition
1417 * arises when the new block is an extension of the old block.
1418 * Here, the first part of the fragment allocated to the new
1419 * dependency is part of the block currently claimed on disk by
1420 * the old dependency, so it cannot legitimately be freed until the
1421 * conditions for the new dependency are fulfilled.
1422 */
1423 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1424 freefrag = newadp->ad_freefrag;
1425 newadp->ad_freefrag = oldadp->ad_freefrag;
1426 oldadp->ad_freefrag = freefrag;
1427 }
1428 free_allocdirect(adphead, oldadp, 0);
1429}
1430
1431/*
1432 * Allocate a new freefrag structure if needed.
1433 */
1434static struct freefrag *
1435newfreefrag(ip, blkno, size)
1436 struct inode *ip;
1437 ufs_daddr_t blkno;
1438 long size;
1439{
1440 struct freefrag *freefrag;
1441 struct fs *fs;
1442
1443 if (blkno == 0)
1444 return (NULL);
1445 fs = ip->i_fs;
1446 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1447 panic("newfreefrag: frag size");
1448 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1449 M_FREEFRAG, M_SOFTDEP_FLAGS);
1450 freefrag->ff_list.wk_type = D_FREEFRAG;
1451 freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
1452 freefrag->ff_inum = ip->i_number;
1453 freefrag->ff_mnt = ITOV(ip)->v_mount;
1454 freefrag->ff_devvp = ip->i_devvp;
1455 freefrag->ff_blkno = blkno;
1456 freefrag->ff_fragsize = size;
1457 return (freefrag);
1458}
1459
1460/*
1461 * This workitem de-allocates fragments that were replaced during
1462 * file block allocation.
1463 */
1464static void
1465handle_workitem_freefrag(freefrag)
1466 struct freefrag *freefrag;
1467{
1468 struct inode tip;
1469
1470 tip.i_vnode = NULL;
1471 tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1472 tip.i_devvp = freefrag->ff_devvp;
1473 tip.i_dev = freefrag->ff_devvp->v_rdev;
1474 tip.i_number = freefrag->ff_inum;
1475 tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
1476 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1477 FREE(freefrag, M_FREEFRAG);
1478}
1479
1480/*
1481 * Indirect block allocation dependencies.
1482 *
1483 * The same dependencies that exist for a direct block also exist when
1484 * a new block is allocated and pointed to by an entry in a block of
1485 * indirect pointers. The undo/redo states described above are also
1486 * used here. Because an indirect block contains many pointers that
1487 * may have dependencies, a second copy of the entire in-memory indirect
1488 * block is kept. The buffer cache copy is always completely up-to-date.
1489 * The second copy, which is used only as a source for disk writes,
1490 * contains only the safe pointers (i.e., those that have no remaining
1491 * update dependencies). The second copy is freed when all pointers
1492 * are safe. The cache is not allowed to replace indirect blocks with
1493 * pending update dependencies. If a buffer containing an indirect
1494 * block with dependencies is written, these routines will mark it
1495 * dirty again. It can only be successfully written once all the
1496 * dependencies are removed. The ffs_fsync routine in conjunction with
1497 * softdep_sync_metadata work together to get all the dependencies
1498 * removed so that a file can be successfully written to disk. Three
1499 * procedures are used when setting up indirect block pointer
1500 * dependencies. The division is necessary because of the organization
1501 * of the "balloc" routine and because of the distinction between file
1502 * pages and file metadata blocks.
1503 */
1504
1505/*
1506 * Allocate a new allocindir structure.
1507 */
1508static struct allocindir *
1509newallocindir(ip, ptrno, newblkno, oldblkno)
1510 struct inode *ip; /* inode for file being extended */
1511 int ptrno; /* offset of pointer in indirect block */
1512 ufs_daddr_t newblkno; /* disk block number being added */
1513 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1514{
1515 struct allocindir *aip;
1516
1517 MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1518 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1519 aip->ai_list.wk_type = D_ALLOCINDIR;
1520 aip->ai_state = ATTACHED;
1521 aip->ai_offset = ptrno;
1522 aip->ai_newblkno = newblkno;
1523 aip->ai_oldblkno = oldblkno;
1524 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1525 return (aip);
1526}
1527
1528/*
1529 * Called just before setting an indirect block pointer
1530 * to a newly allocated file page.
1531 */
1532void
1533softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1534 struct inode *ip; /* inode for file being extended */
1535 ufs_lbn_t lbn; /* allocated block number within file */
1536 struct buf *bp; /* buffer with indirect blk referencing page */
1537 int ptrno; /* offset of pointer in indirect block */
1538 ufs_daddr_t newblkno; /* disk block number being added */
1539 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1540 struct buf *nbp; /* buffer holding allocated page */
1541{
1542 struct allocindir *aip;
1543 struct pagedep *pagedep;
1544
1545 aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1546 ACQUIRE_LOCK(&lk);
1547 /*
1548 * If we are allocating a directory page, then we must
1549 * allocate an associated pagedep to track additions and
1550 * deletions.
1551 */
1552 if ((ip->i_mode & IFMT) == IFDIR &&
1553 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1554 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1555 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1556 FREE_LOCK(&lk);
1557 setup_allocindir_phase2(bp, ip, aip);
1558}
1559
1560/*
1561 * Called just before setting an indirect block pointer to a
1562 * newly allocated indirect block.
1563 */
1564void
1565softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1566 struct buf *nbp; /* newly allocated indirect block */
1567 struct inode *ip; /* inode for file being extended */
1568 struct buf *bp; /* indirect block referencing allocated block */
1569 int ptrno; /* offset of pointer in indirect block */
1570 ufs_daddr_t newblkno; /* disk block number being added */
1571{
1572 struct allocindir *aip;
1573
1574 aip = newallocindir(ip, ptrno, newblkno, 0);
1575 ACQUIRE_LOCK(&lk);
1576 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1577 FREE_LOCK(&lk);
1578 setup_allocindir_phase2(bp, ip, aip);
1579}
1580
1581/*
1582 * Called to finish the allocation of the "aip" allocated
1583 * by one of the two routines above.
1584 */
1585static void
1586setup_allocindir_phase2(bp, ip, aip)
1587 struct buf *bp; /* in-memory copy of the indirect block */
1588 struct inode *ip; /* inode for file being extended */
1589 struct allocindir *aip; /* allocindir allocated by the above routines */
1590{
1591 struct worklist *wk;
1592 struct indirdep *indirdep, *newindirdep;
1593 struct bmsafemap *bmsafemap;
1594 struct allocindir *oldaip;
1595 struct freefrag *freefrag;
1596 struct newblk *newblk;
1597
1598 if (bp->b_lblkno >= 0)
1599 panic("setup_allocindir_phase2: not indir blk");
1600 for (indirdep = NULL, newindirdep = NULL; ; ) {
1601 ACQUIRE_LOCK(&lk);
1602 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1603 if (wk->wk_type != D_INDIRDEP)
1604 continue;
1605 indirdep = WK_INDIRDEP(wk);
1606 break;
1607 }
1608 if (indirdep == NULL && newindirdep) {
1609 indirdep = newindirdep;
1610 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1611 newindirdep = NULL;
1612 }
1613 FREE_LOCK(&lk);
1614 if (indirdep) {
1615 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1616 &newblk) == 0)
1617 panic("setup_allocindir: lost block");
1618 ACQUIRE_LOCK(&lk);
1619 if (newblk->nb_state == DEPCOMPLETE) {
1620 aip->ai_state |= DEPCOMPLETE;
1621 aip->ai_buf = NULL;
1622 } else {
1623 bmsafemap = newblk->nb_bmsafemap;
1624 aip->ai_buf = bmsafemap->sm_buf;
1625 LIST_REMOVE(newblk, nb_deps);
1626 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1627 aip, ai_deps);
1628 }
1629 LIST_REMOVE(newblk, nb_hash);
1630 FREE(newblk, M_NEWBLK);
1631 aip->ai_indirdep = indirdep;
1632 /*
1633 * Check to see if there is an existing dependency
1634 * for this block. If there is, merge the old
1635 * dependency into the new one.
1636 */
1637 if (aip->ai_oldblkno == 0)
1638 oldaip = NULL;
1639 else
1640
1641 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1642 if (oldaip->ai_offset == aip->ai_offset)
1643 break;
1644 freefrag = NULL;
1645 if (oldaip != NULL) {
1646 if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1647 FREE_LOCK(&lk);
1648 panic("setup_allocindir_phase2: blkno");
1649 }
1650 aip->ai_oldblkno = oldaip->ai_oldblkno;
1651 freefrag = aip->ai_freefrag;
1652 aip->ai_freefrag = oldaip->ai_freefrag;
1653 oldaip->ai_freefrag = NULL;
1654 free_allocindir(oldaip, NULL);
1655 }
1656 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1657 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1658 [aip->ai_offset] = aip->ai_oldblkno;
1659 FREE_LOCK(&lk);
1660 if (freefrag != NULL)
1661 handle_workitem_freefrag(freefrag);
1662 }
1663 if (newindirdep) {
1664 if (indirdep->ir_savebp != NULL)
1665 brelse(newindirdep->ir_savebp);
1666 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1667 }
1668 if (indirdep)
1669 break;
1670 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1671 M_INDIRDEP, M_SOFTDEP_FLAGS);
1672 newindirdep->ir_list.wk_type = D_INDIRDEP;
1673 newindirdep->ir_state = ATTACHED;
1674 LIST_INIT(&newindirdep->ir_deplisthd);
1675 LIST_INIT(&newindirdep->ir_donehd);
1676 if (bp->b_blkno == bp->b_lblkno)
1677 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &bp->b_blkno, NULL, NULL);
1678 newindirdep->ir_savebp =
1679 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1680 BUF_KERNPROC(newindirdep->ir_savebp);
1681 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1682 }
1683}
1684
1685/*
1686 * Block de-allocation dependencies.
1687 *
1688 * When blocks are de-allocated, the on-disk pointers must be nullified before
1689 * the blocks are made available for use by other files. (The true
1690 * requirement is that old pointers must be nullified before new on-disk
1691 * pointers are set. We chose this slightly more stringent requirement to
1692 * reduce complexity.) Our implementation handles this dependency by updating
1693 * the inode (or indirect block) appropriately but delaying the actual block
1694 * de-allocation (i.e., freemap and free space count manipulation) until
1695 * after the updated versions reach stable storage. After the disk is
1696 * updated, the blocks can be safely de-allocated whenever it is convenient.
1697 * This implementation handles only the common case of reducing a file's
1698 * length to zero. Other cases are handled by the conventional synchronous
1699 * write approach.
1700 *
1701 * The ffs implementation with which we worked double-checks
1702 * the state of the block pointers and file size as it reduces
1703 * a file's length. Some of this code is replicated here in our
1704 * soft updates implementation. The freeblks->fb_chkcnt field is
1705 * used to transfer a part of this information to the procedure
1706 * that eventually de-allocates the blocks.
1707 *
1708 * This routine should be called from the routine that shortens
1709 * a file's length, before the inode's size or block pointers
1710 * are modified. It will save the block pointer information for
1711 * later release and zero the inode so that the calling routine
1712 * can release it.
1713 */
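/*
 * A minimal sketch of the ordering rule stated at the top of this
 * comment (not part of ffs_softdep.c; write_inode() and
 * freemap_release() are hypothetical stand-ins for the real buffer
 * and cylinder-group bitmap code):
 *
 *	saved = ip->ptr;		remember the block being released
 *	ip->ptr = 0;			nullify the pointer in memory
 *	write_inode(ip);		the zeroed pointer must reach stable storage
 *	freemap_release(saved);		only now may the block be reallocated
 *
 * Freeing the block before the zeroed pointer is on disk could leave a
 * crashed filesystem with an inode pointing at a block already handed
 * to another file; the routine below instead defers the free by
 * queueing a freeblks work item until the inode write completes.
 */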
1714void
1715softdep_setup_freeblocks(ip, length)
1716 struct inode *ip; /* The inode whose length is to be reduced */
1717 off_t length; /* The new length for the file */
1718{
1719 struct freeblks *freeblks;
1720 struct inodedep *inodedep;
1721 struct allocdirect *adp;
1722 struct vnode *vp;
1723 struct buf *bp;
1724 struct fs *fs;
1725 int i, delay, error;
1726
1727 fs = ip->i_fs;
1728 if (length != 0)
1728 panic("softde_setup_freeblocks: non-zero length");
1729 panic("softdep_setup_freeblocks: non-zero length");
1729 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1730 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
1731 freeblks->fb_list.wk_type = D_FREEBLKS;
1732 freeblks->fb_uid = ip->i_uid;
1733 freeblks->fb_previousinum = ip->i_number;
1734 freeblks->fb_devvp = ip->i_devvp;
1735 freeblks->fb_mnt = ITOV(ip)->v_mount;
1736 freeblks->fb_oldsize = ip->i_size;
1737 freeblks->fb_newsize = length;
1738 freeblks->fb_chkcnt = ip->i_blocks;
1739 for (i = 0; i < NDADDR; i++) {
1740 freeblks->fb_dblks[i] = ip->i_db[i];
1741 ip->i_db[i] = 0;
1742 }
1743 for (i = 0; i < NIADDR; i++) {
1744 freeblks->fb_iblks[i] = ip->i_ib[i];
1745 ip->i_ib[i] = 0;
1746 }
1747 ip->i_blocks = 0;
1748 ip->i_size = 0;
1749 /*
1730 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1731 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
1732 freeblks->fb_list.wk_type = D_FREEBLKS;
1733 freeblks->fb_uid = ip->i_uid;
1734 freeblks->fb_previousinum = ip->i_number;
1735 freeblks->fb_devvp = ip->i_devvp;
1736 freeblks->fb_mnt = ITOV(ip)->v_mount;
1737 freeblks->fb_oldsize = ip->i_size;
1738 freeblks->fb_newsize = length;
1739 freeblks->fb_chkcnt = ip->i_blocks;
1740 for (i = 0; i < NDADDR; i++) {
1741 freeblks->fb_dblks[i] = ip->i_db[i];
1742 ip->i_db[i] = 0;
1743 }
1744 for (i = 0; i < NIADDR; i++) {
1745 freeblks->fb_iblks[i] = ip->i_ib[i];
1746 ip->i_ib[i] = 0;
1747 }
1748 ip->i_blocks = 0;
1749 ip->i_size = 0;
1750 /*
1751 * If the file was removed, then the space being freed was
1752 * accounted for then (see softdep_filereleased()). If the
1753 * file is merely being truncated, then we account for it now.
1754 */
1755 if ((ip->i_flag & IN_SPACECOUNTED) == 0)
1756 fs->fs_pendingblocks += freeblks->fb_chkcnt;
1757 /*
1750 * Push the zero'ed inode to its disk buffer so that we are free
1751 * to delete its dependencies below. Once the dependencies are gone
1752 * the buffer can be safely released.
1753 */
1754 if ((error = bread(ip->i_devvp,
1755 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1756 (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1757 softdep_error("softdep_setup_freeblocks", error);
1758 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1759 ip->i_din;
1760 /*
1761 * Find and eliminate any inode dependencies.
1762 */
1763 ACQUIRE_LOCK(&lk);
1764 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1765 if ((inodedep->id_state & IOSTARTED) != 0) {
1766 FREE_LOCK(&lk);
1767 panic("softdep_setup_freeblocks: inode busy");
1768 }
1769 /*
1770 * Add the freeblks structure to the list of operations that
1771 * must await the zero'ed inode being written to disk. If we
1772 * still have a bitmap dependency (delay == 0), then the inode
1773 * has never been written to disk, so we can process the
1774 * freeblks below once we have deleted the dependencies.
1775 */
1776 delay = (inodedep->id_state & DEPCOMPLETE);
1777 if (delay)
1778 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1779 /*
1780 * Because the file length has been truncated to zero, any
1781 * pending block allocation dependency structures associated
1782 * with this inode are obsolete and can simply be de-allocated.
1783 * We must first merge the two dependency lists to get rid of
1784 * any duplicate freefrag structures, then purge the merged list.
1785 * If we still have a bitmap dependency, then the inode has never
1786 * been written to disk, so we can free any fragments without delay.
1787 */
1788 merge_inode_lists(inodedep);
1789 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1790 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
1791 FREE_LOCK(&lk);
1792 bdwrite(bp);
1793 /*
1794 * We must wait for any I/O in progress to finish so that
1795 * all potential buffers on the dirty list will be visible.
1796 * Once they are all there, walk the list and get rid of
1797 * any dependencies.
1798 */
1799 vp = ITOV(ip);
1800 ACQUIRE_LOCK(&lk);
1801 drain_output(vp, 1);
1802 while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1803 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1804 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1805 deallocate_dependencies(bp, inodedep);
1806 bp->b_flags |= B_INVAL | B_NOCACHE;
1807 FREE_LOCK(&lk);
1808 brelse(bp);
1809 ACQUIRE_LOCK(&lk);
1810 }
1811 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
1812 (void) free_inodedep(inodedep);
1813 FREE_LOCK(&lk);
1814 /*
1815 * If the inode has never been written to disk (delay == 0),
1816 * then we can process the freeblks now that we have deleted
1817 * the dependencies.
1818 */
1819 if (!delay)
1820 handle_workitem_freeblocks(freeblks, 0);
1821}
1822
1823/*
1824 * Reclaim any dependency structures from a buffer that is about to
1825 * be reallocated to a new vnode. The buffer must be locked, thus,
1826 * no I/O completion operations can occur while we are manipulating
1827 * its associated dependencies. The mutex is held so that other I/O's
1828 * associated with related dependencies do not occur.
1829 */
1830static void
1831deallocate_dependencies(bp, inodedep)
1832 struct buf *bp;
1833 struct inodedep *inodedep;
1834{
1835 struct worklist *wk;
1836 struct indirdep *indirdep;
1837 struct allocindir *aip;
1838 struct pagedep *pagedep;
1839 struct dirrem *dirrem;
1840 struct diradd *dap;
1841 int i;
1842
1843 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1844 switch (wk->wk_type) {
1845
1846 case D_INDIRDEP:
1847 indirdep = WK_INDIRDEP(wk);
1848 /*
1849 * None of the indirect pointers will ever be visible,
1850 * so they can simply be tossed. GOINGAWAY ensures
1851 * that allocated pointers will be saved in the buffer
1852 * cache until they are freed. Note that they will
1853 * only be able to be found by their physical address
1854 * since the inode mapping the logical address will
1855 * be gone. The save buffer used for the safe copy
1856 * was allocated in setup_allocindir_phase2 using
1857 * the physical address so it could be used for this
1858 * purpose. Hence we swap the safe copy with the real
1859 * copy, allowing the safe copy to be freed and holding
1860 * on to the real copy for later use in indir_trunc.
1861 */
1862 if (indirdep->ir_state & GOINGAWAY) {
1863 FREE_LOCK(&lk);
1864 panic("deallocate_dependencies: already gone");
1865 }
1866 indirdep->ir_state |= GOINGAWAY;
1867 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1868 free_allocindir(aip, inodedep);
1869 if (bp->b_lblkno >= 0 ||
1870 bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
1871 FREE_LOCK(&lk);
1872 panic("deallocate_dependencies: not indir");
1873 }
1874 bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1875 bp->b_bcount);
1876 WORKLIST_REMOVE(wk);
1877 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1878 continue;
1879
1880 case D_PAGEDEP:
1881 pagedep = WK_PAGEDEP(wk);
1882 /*
1883 * None of the directory additions will ever be
1884 * visible, so they can simply be tossed.
1885 */
1886 for (i = 0; i < DAHASHSZ; i++)
1887 while ((dap =
1888 LIST_FIRST(&pagedep->pd_diraddhd[i])))
1889 free_diradd(dap);
1890 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1891 free_diradd(dap);
1892 /*
1893 * Copy any directory remove dependencies to the list
1894 * to be processed after the zero'ed inode is written.
1895 * If the inode has already been written, then they
1896 * can be dumped directly onto the work list.
1897 */
1898 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
1899 LIST_REMOVE(dirrem, dm_next);
1900 dirrem->dm_dirinum = pagedep->pd_ino;
1901 if (inodedep == NULL ||
1902 (inodedep->id_state & ALLCOMPLETE) ==
1903 ALLCOMPLETE)
1904 add_to_worklist(&dirrem->dm_list);
1905 else
1906 WORKLIST_INSERT(&inodedep->id_bufwait,
1907 &dirrem->dm_list);
1908 }
1909 WORKLIST_REMOVE(&pagedep->pd_list);
1910 LIST_REMOVE(pagedep, pd_hash);
1911 WORKITEM_FREE(pagedep, D_PAGEDEP);
1912 continue;
1913
1914 case D_ALLOCINDIR:
1915 free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1916 continue;
1917
1918 case D_ALLOCDIRECT:
1919 case D_INODEDEP:
1920 FREE_LOCK(&lk);
1921 panic("deallocate_dependencies: Unexpected type %s",
1922 TYPENAME(wk->wk_type));
1923 /* NOTREACHED */
1924
1925 default:
1926 FREE_LOCK(&lk);
1927 panic("deallocate_dependencies: Unknown type %s",
1928 TYPENAME(wk->wk_type));
1929 /* NOTREACHED */
1930 }
1931 }
1932}
1933
1934/*
1935 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1936 * This routine must be called with splbio interrupts blocked.
1937 */
1938static void
1939free_allocdirect(adphead, adp, delay)
1940 struct allocdirectlst *adphead;
1941 struct allocdirect *adp;
1942 int delay;
1943{
1944
1945#ifdef DEBUG
1946 if (lk.lkt_held == -1)
1947 panic("free_allocdirect: lock not held");
1948#endif
1949 if ((adp->ad_state & DEPCOMPLETE) == 0)
1950 LIST_REMOVE(adp, ad_deps);
1951 TAILQ_REMOVE(adphead, adp, ad_next);
1952 if ((adp->ad_state & COMPLETE) == 0)
1953 WORKLIST_REMOVE(&adp->ad_list);
1954 if (adp->ad_freefrag != NULL) {
1955 if (delay)
1956 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1957 &adp->ad_freefrag->ff_list);
1958 else
1959 add_to_worklist(&adp->ad_freefrag->ff_list);
1960 }
1961 WORKITEM_FREE(adp, D_ALLOCDIRECT);
1962}
1963
1964/*
1965 * Prepare an inode to be freed. The actual free operation is not
1966 * done until the zero'ed inode has been written to disk.
1967 */
1968void
1969softdep_freefile(pvp, ino, mode)
1970 struct vnode *pvp;
1971 ino_t ino;
1972 int mode;
1973{
1974 struct inode *ip = VTOI(pvp);
1975 struct inodedep *inodedep;
1976 struct freefile *freefile;
1977
1978 /*
1979 * This sets up the inode de-allocation dependency.
1980 */
1981 MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1982 M_FREEFILE, M_SOFTDEP_FLAGS);
1983 freefile->fx_list.wk_type = D_FREEFILE;
1984 freefile->fx_list.wk_state = 0;
1985 freefile->fx_mode = mode;
1986 freefile->fx_oldinum = ino;
1987 freefile->fx_devvp = ip->i_devvp;
1988 freefile->fx_mnt = ITOV(ip)->v_mount;
1758 * Push the zero'ed inode to its disk buffer so that we are free
1759 * to delete its dependencies below. Once the dependencies are gone
1760 * the buffer can be safely released.
1761 */
1762 if ((error = bread(ip->i_devvp,
1763 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1764 (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1765 softdep_error("softdep_setup_freeblocks", error);
1766 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1767 ip->i_din;
1768 /*
1769 * Find and eliminate any inode dependencies.
1770 */
1771 ACQUIRE_LOCK(&lk);
1772 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1773 if ((inodedep->id_state & IOSTARTED) != 0) {
1774 FREE_LOCK(&lk);
1775 panic("softdep_setup_freeblocks: inode busy");
1776 }
1777 /*
1778 * Add the freeblks structure to the list of operations that
1779 * must await the zero'ed inode being written to disk. If we
1780 * still have a bitmap dependency (delay == 0), then the inode
1781 * has never been written to disk, so we can process the
1782 * freeblks below once we have deleted the dependencies.
1783 */
1784 delay = (inodedep->id_state & DEPCOMPLETE);
1785 if (delay)
1786 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1787 /*
1788 * Because the file length has been truncated to zero, any
1789 * pending block allocation dependency structures associated
1790 * with this inode are obsolete and can simply be de-allocated.
1791 * We must first merge the two dependency lists to get rid of
1792 * any duplicate freefrag structures, then purge the merged list.
1793 * If we still have a bitmap dependency, then the inode has never
1794 * been written to disk, so we can free any fragments without delay.
1795 */
1796 merge_inode_lists(inodedep);
1797 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1798 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
1799 FREE_LOCK(&lk);
1800 bdwrite(bp);
1801 /*
1802 * We must wait for any I/O in progress to finish so that
1803 * all potential buffers on the dirty list will be visible.
1804 * Once they are all there, walk the list and get rid of
1805 * any dependencies.
1806 */
1807 vp = ITOV(ip);
1808 ACQUIRE_LOCK(&lk);
1809 drain_output(vp, 1);
1810 while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1811 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1812 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1813 deallocate_dependencies(bp, inodedep);
1814 bp->b_flags |= B_INVAL | B_NOCACHE;
1815 FREE_LOCK(&lk);
1816 brelse(bp);
1817 ACQUIRE_LOCK(&lk);
1818 }
1819 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
1820 (void) free_inodedep(inodedep);
1821 FREE_LOCK(&lk);
1822 /*
1823 * If the inode has never been written to disk (delay == 0),
1824 * then we can process the freeblks now that we have deleted
1825 * the dependencies.
1826 */
1827 if (!delay)
1828 handle_workitem_freeblocks(freeblks, 0);
1829}
1830
1831/*
1832 * Reclaim any dependency structures from a buffer that is about to
1833 * be reallocated to a new vnode. The buffer must be locked, thus,
1834 * no I/O completion operations can occur while we are manipulating
1835 * its associated dependencies. The mutex is held so that other I/O's
1836 * associated with related dependencies do not occur.
1837 */
1838static void
1839deallocate_dependencies(bp, inodedep)
1840 struct buf *bp;
1841 struct inodedep *inodedep;
1842{
1843 struct worklist *wk;
1844 struct indirdep *indirdep;
1845 struct allocindir *aip;
1846 struct pagedep *pagedep;
1847 struct dirrem *dirrem;
1848 struct diradd *dap;
1849 int i;
1850
1851 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1852 switch (wk->wk_type) {
1853
1854 case D_INDIRDEP:
1855 indirdep = WK_INDIRDEP(wk);
1856 /*
1857 * None of the indirect pointers will ever be visible,
1858 * so they can simply be tossed. GOINGAWAY ensures
1859 * that allocated pointers will be saved in the buffer
1860 * cache until they are freed. Note that they will
1861 * only be able to be found by their physical address
1862 * since the inode mapping the logical address will
1863 * be gone. The save buffer used for the safe copy
1864 * was allocated in setup_allocindir_phase2 using
1865 * the physical address so it could be used for this
1866 * purpose. Hence we swap the safe copy with the real
1867 * copy, allowing the safe copy to be freed and holding
1868 * on to the real copy for later use in indir_trunc.
1869 */
1870 if (indirdep->ir_state & GOINGAWAY) {
1871 FREE_LOCK(&lk);
1872 panic("deallocate_dependencies: already gone");
1873 }
1874 indirdep->ir_state |= GOINGAWAY;
1875 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1876 free_allocindir(aip, inodedep);
1877 if (bp->b_lblkno >= 0 ||
1878 bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
1879 FREE_LOCK(&lk);
1880 panic("deallocate_dependencies: not indir");
1881 }
1882 bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1883 bp->b_bcount);
1884 WORKLIST_REMOVE(wk);
1885 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1886 continue;
1887
1888 case D_PAGEDEP:
1889 pagedep = WK_PAGEDEP(wk);
1890 /*
1891 * None of the directory additions will ever be
1892 * visible, so they can simply be tossed.
1893 */
1894 for (i = 0; i < DAHASHSZ; i++)
1895 while ((dap =
1896 LIST_FIRST(&pagedep->pd_diraddhd[i])))
1897 free_diradd(dap);
1898 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1899 free_diradd(dap);
1900 /*
1901 * Copy any directory remove dependencies to the list
1902 * to be processed after the zero'ed inode is written.
1903 * If the inode has already been written, then they
1904 * can be dumped directly onto the work list.
1905 */
1906 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
1907 LIST_REMOVE(dirrem, dm_next);
1908 dirrem->dm_dirinum = pagedep->pd_ino;
1909 if (inodedep == NULL ||
1910 (inodedep->id_state & ALLCOMPLETE) ==
1911 ALLCOMPLETE)
1912 add_to_worklist(&dirrem->dm_list);
1913 else
1914 WORKLIST_INSERT(&inodedep->id_bufwait,
1915 &dirrem->dm_list);
1916 }
1917 WORKLIST_REMOVE(&pagedep->pd_list);
1918 LIST_REMOVE(pagedep, pd_hash);
1919 WORKITEM_FREE(pagedep, D_PAGEDEP);
1920 continue;
1921
1922 case D_ALLOCINDIR:
1923 free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1924 continue;
1925
1926 case D_ALLOCDIRECT:
1927 case D_INODEDEP:
1928 FREE_LOCK(&lk);
1929 panic("deallocate_dependencies: Unexpected type %s",
1930 TYPENAME(wk->wk_type));
1931 /* NOTREACHED */
1932
1933 default:
1934 FREE_LOCK(&lk);
1935 panic("deallocate_dependencies: Unknown type %s",
1936 TYPENAME(wk->wk_type));
1937 /* NOTREACHED */
1938 }
1939 }
1940}
1941
1942/*
1943 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1944 * This routine must be called with splbio interrupts blocked.
1945 */
1946static void
1947free_allocdirect(adphead, adp, delay)
1948 struct allocdirectlst *adphead;
1949 struct allocdirect *adp;
1950 int delay;
1951{
1952
1953#ifdef DEBUG
1954 if (lk.lkt_held == -1)
1955 panic("free_allocdirect: lock not held");
1956#endif
1957 if ((adp->ad_state & DEPCOMPLETE) == 0)
1958 LIST_REMOVE(adp, ad_deps);
1959 TAILQ_REMOVE(adphead, adp, ad_next);
1960 if ((adp->ad_state & COMPLETE) == 0)
1961 WORKLIST_REMOVE(&adp->ad_list);
1962 if (adp->ad_freefrag != NULL) {
1963 if (delay)
1964 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1965 &adp->ad_freefrag->ff_list);
1966 else
1967 add_to_worklist(&adp->ad_freefrag->ff_list);
1968 }
1969 WORKITEM_FREE(adp, D_ALLOCDIRECT);
1970}
1971
1972/*
1973 * Prepare an inode to be freed. The actual free operation is not
1974 * done until the zero'ed inode has been written to disk.
1975 */
1976void
1977softdep_freefile(pvp, ino, mode)
1978 struct vnode *pvp;
1979 ino_t ino;
1980 int mode;
1981{
1982 struct inode *ip = VTOI(pvp);
1983 struct inodedep *inodedep;
1984 struct freefile *freefile;
1985
1986 /*
1987 * This sets up the inode de-allocation dependency.
1988 */
1989 MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1990 M_FREEFILE, M_SOFTDEP_FLAGS);
1991 freefile->fx_list.wk_type = D_FREEFILE;
1992 freefile->fx_list.wk_state = 0;
1993 freefile->fx_mode = mode;
1994 freefile->fx_oldinum = ino;
1995 freefile->fx_devvp = ip->i_devvp;
1996 freefile->fx_mnt = ITOV(ip)->v_mount;
1997 if ((ip->i_flag & IN_SPACECOUNTED) == 0)
1998 ip->i_fs->fs_pendinginodes += 1;
1989
1990 /*
1991 * If the inodedep does not exist, then the zero'ed inode has
1992 * been written to disk. If the allocated inode has never been
1993 * written to disk, then the on-disk inode is zero'ed. In either
1994 * case we can free the file immediately.
1995 */
1996 ACQUIRE_LOCK(&lk);
1997 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
1998 check_inode_unwritten(inodedep)) {
1999 FREE_LOCK(&lk);
2000 handle_workitem_freefile(freefile);
2001 return;
2002 }
2003 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2004 FREE_LOCK(&lk);
2005}
2006
2007/*
2008 * Check to see if an inode has never been written to disk. If
2009 * so free the inodedep and return success, otherwise return failure.
2010 * This routine must be called with splbio interrupts blocked.
2011 *
2012 * If we still have a bitmap dependency, then the inode has never
2013 * been written to disk. Drop the dependency as it is no longer
2014 * necessary since the inode is being deallocated. We set the
2015 * ALLCOMPLETE flags since the bitmap now properly shows that the
2016 * inode is not allocated. Even if the inode is actively being
2017 * written, it has been rolled back to its zero'ed state, so we
2018 * are ensured that a zero inode is what is on the disk. For short
2019 * lived files, this change will usually result in removing all the
2020 * dependencies from the inode so that it can be freed immediately.
2021 */
2022static int
2023check_inode_unwritten(inodedep)
2024 struct inodedep *inodedep;
2025{
2026
2027 if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2028 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2029 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2030 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2031 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2032 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2033 inodedep->id_nlinkdelta != 0)
2034 return (0);
2035 inodedep->id_state |= ALLCOMPLETE;
2036 LIST_REMOVE(inodedep, id_deps);
2037 inodedep->id_buf = NULL;
2038 if (inodedep->id_state & ONWORKLIST)
2039 WORKLIST_REMOVE(&inodedep->id_list);
2040 if (inodedep->id_savedino != NULL) {
2041 FREE(inodedep->id_savedino, M_INODEDEP);
2042 inodedep->id_savedino = NULL;
2043 }
2044 if (free_inodedep(inodedep) == 0) {
2045 FREE_LOCK(&lk);
2046 panic("check_inode_unwritten: busy inode");
2047 }
2048 return (1);
2049}
2050
2051/*
2052 * Try to free an inodedep structure. Return 1 if it could be freed.
2053 */
2054static int
2055free_inodedep(inodedep)
2056 struct inodedep *inodedep;
2057{
2058
2059 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2060 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2061 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2062 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2063 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2064 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2065 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2066 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
2067 return (0);
2068 LIST_REMOVE(inodedep, id_hash);
2069 WORKITEM_FREE(inodedep, D_INODEDEP);
2070 num_inodedep -= 1;
2071 return (1);
2072}
2073
2074/*
2075 * This workitem routine performs the block de-allocation.
2076 * The workitem is added to the pending list after the updated
2077 * inode block has been written to disk. As mentioned above,
2078 * checks regarding the number of blocks de-allocated (compared
2079 * to the number of blocks allocated for the file) are also
2080 * performed in this function.
2081 */
2082static void
2083handle_workitem_freeblocks(freeblks, flags)
2084 struct freeblks *freeblks;
2085 int flags;
2086{
2087 struct inode tip, *ip;
2088 struct vnode *vp;
2089 ufs_daddr_t bn;
2090 struct fs *fs;
2091 int i, level, bsize;
2092 long nblocks, blocksreleased = 0;
2093 int error, allerror = 0;
2094 ufs_lbn_t baselbns[NIADDR], tmpval;
2095
2096 tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2097 tip.i_number = freeblks->fb_previousinum;
2098 tip.i_devvp = freeblks->fb_devvp;
2099 tip.i_dev = freeblks->fb_devvp->v_rdev;
2100 tip.i_size = freeblks->fb_oldsize;
2101 tip.i_uid = freeblks->fb_uid;
2102 tip.i_vnode = NULL;
2103 tmpval = 1;
2104 baselbns[0] = NDADDR;
2105 for (i = 1; i < NIADDR; i++) {
2106 tmpval *= NINDIR(fs);
2107 baselbns[i] = baselbns[i - 1] + tmpval;
2108 }
2109 nblocks = btodb(fs->fs_bsize);
2110 blocksreleased = 0;
2111 /*
2112 * Indirect blocks first.
2113 */
2114 for (level = (NIADDR - 1); level >= 0; level--) {
2115 if ((bn = freeblks->fb_iblks[level]) == 0)
2116 continue;
2117 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2118 baselbns[level], &blocksreleased)) == 0)
2119 allerror = error;
2120 ffs_blkfree(&tip, bn, fs->fs_bsize);
1999
2000 /*
2001 * If the inodedep does not exist, then the zero'ed inode has
2002 * been written to disk. If the allocated inode has never been
2003 * written to disk, then the on-disk inode is zero'ed. In either
2004 * case we can free the file immediately.
2005 */
2006 ACQUIRE_LOCK(&lk);
2007 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2008 check_inode_unwritten(inodedep)) {
2009 FREE_LOCK(&lk);
2010 handle_workitem_freefile(freefile);
2011 return;
2012 }
2013 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2014 FREE_LOCK(&lk);
2015}
2016
2017/*
2018 * Check to see if an inode has never been written to disk. If
2019 * so free the inodedep and return success, otherwise return failure.
2020 * This routine must be called with splbio interrupts blocked.
2021 *
2022 * If we still have a bitmap dependency, then the inode has never
2023 * been written to disk. Drop the dependency as it is no longer
2024 * necessary since the inode is being deallocated. We set the
2025 * ALLCOMPLETE flags since the bitmap now properly shows that the
2026 * inode is not allocated. Even if the inode is actively being
2027 * written, it has been rolled back to its zero'ed state, so we
2028 * are ensured that a zero inode is what is on the disk. For short
2029 * lived files, this change will usually result in removing all the
2030 * dependencies from the inode so that it can be freed immediately.
2031 */
2032static int
2033check_inode_unwritten(inodedep)
2034 struct inodedep *inodedep;
2035{
2036
2037 if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2038 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2039 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2040 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2041 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2042 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2043 inodedep->id_nlinkdelta != 0)
2044 return (0);
2045 inodedep->id_state |= ALLCOMPLETE;
2046 LIST_REMOVE(inodedep, id_deps);
2047 inodedep->id_buf = NULL;
2048 if (inodedep->id_state & ONWORKLIST)
2049 WORKLIST_REMOVE(&inodedep->id_list);
2050 if (inodedep->id_savedino != NULL) {
2051 FREE(inodedep->id_savedino, M_INODEDEP);
2052 inodedep->id_savedino = NULL;
2053 }
2054 if (free_inodedep(inodedep) == 0) {
2055 FREE_LOCK(&lk);
2056 panic("check_inode_unwritten: busy inode");
2057 }
2058 return (1);
2059}
2060
2061/*
2062 * Try to free an inodedep structure. Return 1 if it could be freed.
2063 */
2064static int
2065free_inodedep(inodedep)
2066 struct inodedep *inodedep;
2067{
2068
2069 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2070 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2071 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2072 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2073 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2074 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2075 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2076 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
2077 return (0);
2078 LIST_REMOVE(inodedep, id_hash);
2079 WORKITEM_FREE(inodedep, D_INODEDEP);
2080 num_inodedep -= 1;
2081 return (1);
2082}
2083
2084/*
2085 * This workitem routine performs the block de-allocation.
2086 * The workitem is added to the pending list after the updated
2087 * inode block has been written to disk. As mentioned above,
2088 * checks regarding the number of blocks de-allocated (compared
2089 * to the number of blocks allocated for the file) are also
2090 * performed in this function.
2091 */
2092static void
2093handle_workitem_freeblocks(freeblks, flags)
2094 struct freeblks *freeblks;
2095 int flags;
2096{
2097 struct inode tip, *ip;
2098 struct vnode *vp;
2099 ufs_daddr_t bn;
2100 struct fs *fs;
2101 int i, level, bsize;
2102 long nblocks, blocksreleased = 0;
2103 int error, allerror = 0;
2104 ufs_lbn_t baselbns[NIADDR], tmpval;
2105
2106 tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2107 tip.i_number = freeblks->fb_previousinum;
2108 tip.i_devvp = freeblks->fb_devvp;
2109 tip.i_dev = freeblks->fb_devvp->v_rdev;
2110 tip.i_size = freeblks->fb_oldsize;
2111 tip.i_uid = freeblks->fb_uid;
2112 tip.i_vnode = NULL;
2113 tmpval = 1;
2114 baselbns[0] = NDADDR;
2115 for (i = 1; i < NIADDR; i++) {
2116 tmpval *= NINDIR(fs);
2117 baselbns[i] = baselbns[i - 1] + tmpval;
2118 }
2119 nblocks = btodb(fs->fs_bsize);
2120 blocksreleased = 0;
2121 /*
2122 * Indirect blocks first.
2123 */
2124 for (level = (NIADDR - 1); level >= 0; level--) {
2125 if ((bn = freeblks->fb_iblks[level]) == 0)
2126 continue;
2127 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2128 baselbns[level], &blocksreleased)) == 0)
2129 allerror = error;
2130 ffs_blkfree(&tip, bn, fs->fs_bsize);
2131 fs->fs_pendingblocks -= nblocks;
2121 blocksreleased += nblocks;
2122 }
2123 /*
2124 * All direct blocks or frags.
2125 */
2126 for (i = (NDADDR - 1); i >= 0; i--) {
2127 if ((bn = freeblks->fb_dblks[i]) == 0)
2128 continue;
2129 bsize = blksize(fs, &tip, i);
2130 ffs_blkfree(&tip, bn, bsize);
2132 blocksreleased += nblocks;
2133 }
2134 /*
2135 * All direct blocks or frags.
2136 */
2137 for (i = (NDADDR - 1); i >= 0; i--) {
2138 if ((bn = freeblks->fb_dblks[i]) == 0)
2139 continue;
2140 bsize = blksize(fs, &tip, i);
2141 ffs_blkfree(&tip, bn, bsize);
2142 fs->fs_pendingblocks -= btodb(bsize);
2131 blocksreleased += btodb(bsize);
2132 }
2133 /*
2134 * If we still have not finished background cleanup, then check
2135 * to see if the block count needs to be adjusted.
2136 */
2137 if (freeblks->fb_chkcnt != blocksreleased &&
2138 (fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0 &&
2139 VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum, &vp) == 0) {
2140 ip = VTOI(vp);
2141 ip->i_blocks += freeblks->fb_chkcnt - blocksreleased;
2142 ip->i_flag |= IN_CHANGE;
2143 vput(vp);
2144 }
2145
2146#ifdef DIAGNOSTIC
2147 if (freeblks->fb_chkcnt != blocksreleased &&
2148 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2149 printf("handle_workitem_freeblocks: block count");
2150 if (allerror)
2151 softdep_error("handle_workitem_freeblks", allerror);
2152#endif /* DIAGNOSTIC */
2153
2154 WORKITEM_FREE(freeblks, D_FREEBLKS);
2155}
2156
2157/*
2158 * Release blocks associated with the inode ip and stored in the indirect
2159 * block dbn. If level is greater than SINGLE, the block is an indirect block
2160 * and recursive calls to indir_trunc must be used to cleanse other indirect
2161 * blocks.
2162 */
2163static int
2164indir_trunc(ip, dbn, level, lbn, countp)
2165 struct inode *ip;
2166 ufs_daddr_t dbn;
2167 int level;
2168 ufs_lbn_t lbn;
2169 long *countp;
2170{
2171 struct buf *bp;
2172 ufs_daddr_t *bap;
2173 ufs_daddr_t nb;
2174 struct fs *fs;
2175 struct worklist *wk;
2176 struct indirdep *indirdep;
2177 int i, lbnadd, nblocks;
2178 int error, allerror = 0;
2179
2180 fs = ip->i_fs;
2181 lbnadd = 1;
2182 for (i = level; i > 0; i--)
2183 lbnadd *= NINDIR(fs);
2184 /*
2185 * Get buffer of block pointers to be freed. This routine is not
2186 * called until the zero'ed inode has been written, so it is safe
2187 * to free blocks as they are encountered. Because the inode has
2188 * been zero'ed, calls to bmap on these blocks will fail. So, we
2189 * have to use the on-disk address and the block device for the
2190 * filesystem to look them up. If the file was deleted before its
2191 * indirect blocks were all written to disk, the routine that set
2192 * us up (deallocate_dependencies) will have arranged to leave
2193 * a complete copy of the indirect block in memory for our use.
2194 * Otherwise we have to read the blocks in from the disk.
2195 */
2196 ACQUIRE_LOCK(&lk);
2197 if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2198 (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2199 if (wk->wk_type != D_INDIRDEP ||
2200 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2201 (indirdep->ir_state & GOINGAWAY) == 0) {
2202 FREE_LOCK(&lk);
2203 panic("indir_trunc: lost indirdep");
2204 }
2205 WORKLIST_REMOVE(wk);
2206 WORKITEM_FREE(indirdep, D_INDIRDEP);
2207 if (LIST_FIRST(&bp->b_dep) != NULL) {
2208 FREE_LOCK(&lk);
2209 panic("indir_trunc: dangling dep");
2210 }
2211 FREE_LOCK(&lk);
2212 } else {
2213 FREE_LOCK(&lk);
2214 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2215 if (error)
2216 return (error);
2217 }
2218 /*
2219 * Recursively free indirect blocks.
2220 */
2221 bap = (ufs_daddr_t *)bp->b_data;
2222 nblocks = btodb(fs->fs_bsize);
2223 for (i = NINDIR(fs) - 1; i >= 0; i--) {
2224 if ((nb = bap[i]) == 0)
2225 continue;
2226 if (level != 0) {
2227 if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2228 level - 1, lbn + (i * lbnadd), countp)) != 0)
2229 allerror = error;
2230 }
2231 ffs_blkfree(ip, nb, fs->fs_bsize);
2143 blocksreleased += btodb(bsize);
2144 }
2145 /*
2146 * If we still have not finished background cleanup, then check
2147 * to see if the block count needs to be adjusted.
2148 */
2149 if (freeblks->fb_chkcnt != blocksreleased &&
2150 (fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0 &&
2151 VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum, &vp) == 0) {
2152 ip = VTOI(vp);
2153 ip->i_blocks += freeblks->fb_chkcnt - blocksreleased;
2154 ip->i_flag |= IN_CHANGE;
2155 vput(vp);
2156 }
2157
2158#ifdef DIAGNOSTIC
2159 if (freeblks->fb_chkcnt != blocksreleased &&
2160 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2161 printf("handle_workitem_freeblocks: block count");
2162 if (allerror)
2163 softdep_error("handle_workitem_freeblks", allerror);
2164#endif /* DIAGNOSTIC */
2165
2166 WORKITEM_FREE(freeblks, D_FREEBLKS);
2167}
2168
2169/*
2170 * Release blocks associated with the inode ip and stored in the indirect
2171 * block dbn. If level is greater than SINGLE, the block is an indirect block
2172 * and recursive calls to indir_trunc must be used to cleanse other indirect
2173 * blocks.
2174 */
2175static int
2176indir_trunc(ip, dbn, level, lbn, countp)
2177 struct inode *ip;
2178 ufs_daddr_t dbn;
2179 int level;
2180 ufs_lbn_t lbn;
2181 long *countp;
2182{
2183 struct buf *bp;
2184 ufs_daddr_t *bap;
2185 ufs_daddr_t nb;
2186 struct fs *fs;
2187 struct worklist *wk;
2188 struct indirdep *indirdep;
2189 int i, lbnadd, nblocks;
2190 int error, allerror = 0;
2191
2192 fs = ip->i_fs;
2193 lbnadd = 1;
2194 for (i = level; i > 0; i--)
2195 lbnadd *= NINDIR(fs);
2196 /*
2197 * Get buffer of block pointers to be freed. This routine is not
2198 * called until the zero'ed inode has been written, so it is safe
2199 * to free blocks as they are encountered. Because the inode has
2200 * been zero'ed, calls to bmap on these blocks will fail. So, we
2201 * have to use the on-disk address and the block device for the
2202 * filesystem to look them up. If the file was deleted before its
2203 * indirect blocks were all written to disk, the routine that set
2204 * us up (deallocate_dependencies) will have arranged to leave
2205 * a complete copy of the indirect block in memory for our use.
2206 * Otherwise we have to read the blocks in from the disk.
2207 */
2208 ACQUIRE_LOCK(&lk);
2209 if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2210 (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2211 if (wk->wk_type != D_INDIRDEP ||
2212 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2213 (indirdep->ir_state & GOINGAWAY) == 0) {
2214 FREE_LOCK(&lk);
2215 panic("indir_trunc: lost indirdep");
2216 }
2217 WORKLIST_REMOVE(wk);
2218 WORKITEM_FREE(indirdep, D_INDIRDEP);
2219 if (LIST_FIRST(&bp->b_dep) != NULL) {
2220 FREE_LOCK(&lk);
2221 panic("indir_trunc: dangling dep");
2222 }
2223 FREE_LOCK(&lk);
2224 } else {
2225 FREE_LOCK(&lk);
2226 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2227 if (error)
2228 return (error);
2229 }
2230 /*
2231 * Recursively free indirect blocks.
2232 */
2233 bap = (ufs_daddr_t *)bp->b_data;
2234 nblocks = btodb(fs->fs_bsize);
2235 for (i = NINDIR(fs) - 1; i >= 0; i--) {
2236 if ((nb = bap[i]) == 0)
2237 continue;
2238 if (level != 0) {
2239 if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2240 level - 1, lbn + (i * lbnadd), countp)) != 0)
2241 allerror = error;
2242 }
2243 ffs_blkfree(ip, nb, fs->fs_bsize);
2244 fs->fs_pendingblocks -= nblocks;
2232 *countp += nblocks;
2233 }
2234 bp->b_flags |= B_INVAL | B_NOCACHE;
2235 brelse(bp);
2236 return (allerror);
2237}
2238
2239/*
2240 * Free an allocindir.
2241 * This routine must be called with splbio interrupts blocked.
2242 */
2243static void
2244free_allocindir(aip, inodedep)
2245 struct allocindir *aip;
2246 struct inodedep *inodedep;
2247{
2248 struct freefrag *freefrag;
2249
2250#ifdef DEBUG
2251 if (lk.lkt_held == -1)
2252 panic("free_allocindir: lock not held");
2253#endif
2254 if ((aip->ai_state & DEPCOMPLETE) == 0)
2255 LIST_REMOVE(aip, ai_deps);
2256 if (aip->ai_state & ONWORKLIST)
2257 WORKLIST_REMOVE(&aip->ai_list);
2258 LIST_REMOVE(aip, ai_next);
2259 if ((freefrag = aip->ai_freefrag) != NULL) {
2260 if (inodedep == NULL)
2261 add_to_worklist(&freefrag->ff_list);
2262 else
2263 WORKLIST_INSERT(&inodedep->id_bufwait,
2264 &freefrag->ff_list);
2265 }
2266 WORKITEM_FREE(aip, D_ALLOCINDIR);
2267}
2268
2269/*
2270 * Directory entry addition dependencies.
2271 *
2272 * When adding a new directory entry, the inode (with its incremented link
2273 * count) must be written to disk before the directory entry's pointer to it.
2274 * Also, if the inode is newly allocated, the corresponding freemap must be
2275 * updated (on disk) before the directory entry's pointer. These requirements
2276 * are met via undo/redo on the directory entry's pointer, which consists
2277 * simply of the inode number.
2278 *
2279 * As directory entries are added and deleted, the free space within a
2280 * directory block can become fragmented. The ufs file system will compact
2281 * a fragmented directory block to make space for a new entry. When this
2282 * occurs, the offsets of previously added entries change. Any "diradd"
2283 * dependency structures corresponding to these entries must be updated with
2284 * the new offsets.
2285 */
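/*
 * A minimal sketch of the undo/redo described above (not part of
 * ffs_softdep.c; inode_is_on_disk() and write_dirblock() are
 * hypothetical helpers):
 *
 *	ep->d_ino = newinum;		in-memory entry names the new inode
 *	...
 *	if (!inode_is_on_disk(newinum)) {
 *		ep->d_ino = 0;		undo before the block goes to disk
 *		write_dirblock(bp);
 *		ep->d_ino = newinum;	redo once the write has been issued
 *	} else
 *		write_dirblock(bp);
 *
 * The directory block therefore never reaches the disk naming an inode
 * whose incremented link count (and, for a new inode, freemap update)
 * has not been written first.
 */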
2286
2287/*
2288 * This routine is called after the in-memory inode's link
2289 * count has been incremented, but before the directory entry's
2290 * pointer to the inode has been set.
2291 */
2292void
2293softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2294 struct buf *bp; /* buffer containing directory block */
2295 struct inode *dp; /* inode for directory */
2296 off_t diroffset; /* offset of new entry in directory */
2297 long newinum; /* inode referenced by new directory entry */
2298 struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2299{
2300 int offset; /* offset of new entry within directory block */
2301 ufs_lbn_t lbn; /* block in directory containing new entry */
2302 struct fs *fs;
2303 struct diradd *dap;
2304 struct pagedep *pagedep;
2305 struct inodedep *inodedep;
2306 struct mkdir *mkdir1, *mkdir2;
2307
2308 /*
2309 * Whiteouts have no dependencies.
2310 */
2311 if (newinum == WINO) {
2312 if (newdirbp != NULL)
2313 bdwrite(newdirbp);
2314 return;
2315 }
2316
2317 fs = dp->i_fs;
2318 lbn = lblkno(fs, diroffset);
2319 offset = blkoff(fs, diroffset);
2320 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2321 M_SOFTDEP_FLAGS|M_ZERO);
2322 dap->da_list.wk_type = D_DIRADD;
2323 dap->da_offset = offset;
2324 dap->da_newinum = newinum;
2325 dap->da_state = ATTACHED;
2326 if (newdirbp == NULL) {
2327 dap->da_state |= DEPCOMPLETE;
2328 ACQUIRE_LOCK(&lk);
2329 } else {
2330 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2331 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2332 M_SOFTDEP_FLAGS);
2333 mkdir1->md_list.wk_type = D_MKDIR;
2334 mkdir1->md_state = MKDIR_BODY;
2335 mkdir1->md_diradd = dap;
2336 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2337 M_SOFTDEP_FLAGS);
2338 mkdir2->md_list.wk_type = D_MKDIR;
2339 mkdir2->md_state = MKDIR_PARENT;
2340 mkdir2->md_diradd = dap;
2341 /*
2342 * Dependency on "." and ".." being written to disk.
2343 */
2344 mkdir1->md_buf = newdirbp;
2345 ACQUIRE_LOCK(&lk);
2346 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2347 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2348 FREE_LOCK(&lk);
2349 bdwrite(newdirbp);
2350 /*
2351 * Dependency on link count increase for parent directory
2352 */
2353 ACQUIRE_LOCK(&lk);
2354 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2355 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2356 dap->da_state &= ~MKDIR_PARENT;
2357 WORKITEM_FREE(mkdir2, D_MKDIR);
2358 } else {
2359 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2360 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2361 }
2362 }
2363 /*
2364 * Link into parent directory pagedep to await its being written.
2365 */
2366 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2367 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2368 dap->da_pagedep = pagedep;
2369 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2370 da_pdlist);
2371 /*
2372 * Link into its inodedep. Put it on the id_bufwait list if the inode
2373 * is not yet written. If it is written, do the post-inode write
2374 * processing to put it on the id_pendinghd list.
2375 */
2376 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2377 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2378 diradd_inode_written(dap, inodedep);
2379 else
2380 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2381 FREE_LOCK(&lk);
2382}
2383
2384/*
2385 * This procedure is called to change the offset of a directory
2386 * entry when compacting a directory block which must be owned
2387 * exclusively by the caller. Note that the actual entry movement
2388 * must be done in this procedure to ensure that no I/O completions
2389 * occur while the move is in progress.
2390 */
2391void
2392softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2393 struct inode *dp; /* inode for directory */
2394 caddr_t base; /* address of dp->i_offset */
2395 caddr_t oldloc; /* address of old directory location */
2396 caddr_t newloc; /* address of new directory location */
2397 int entrysize; /* size of directory entry */
2398{
2399 int offset, oldoffset, newoffset;
2400 struct pagedep *pagedep;
2401 struct diradd *dap;
2402 ufs_lbn_t lbn;
2403
2404 ACQUIRE_LOCK(&lk);
2405 lbn = lblkno(dp->i_fs, dp->i_offset);
2406 offset = blkoff(dp->i_fs, dp->i_offset);
2407 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2408 goto done;
2409 oldoffset = offset + (oldloc - base);
2410 newoffset = offset + (newloc - base);
2411
2412 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2413 if (dap->da_offset != oldoffset)
2414 continue;
2415 dap->da_offset = newoffset;
2416 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2417 break;
2418 LIST_REMOVE(dap, da_pdlist);
2419 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2420 dap, da_pdlist);
2421 break;
2422 }
2423 if (dap == NULL) {
2424
2425 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2426 if (dap->da_offset == oldoffset) {
2427 dap->da_offset = newoffset;
2428 break;
2429 }
2430 }
2431 }
2432done:
2433 bcopy(oldloc, newloc, entrysize);
2434 FREE_LOCK(&lk);
2435}
2436
2437/*
2438 * Free a diradd dependency structure. This routine must be called
2439 * with splbio interrupts blocked.
2440 */
2441static void
2442free_diradd(dap)
2443 struct diradd *dap;
2444{
2445 struct dirrem *dirrem;
2446 struct pagedep *pagedep;
2447 struct inodedep *inodedep;
2448 struct mkdir *mkdir, *nextmd;
2449
2450#ifdef DEBUG
2451 if (lk.lkt_held == -1)
2452 panic("free_diradd: lock not held");
2453#endif
2454 WORKLIST_REMOVE(&dap->da_list);
2455 LIST_REMOVE(dap, da_pdlist);
2456 if ((dap->da_state & DIRCHG) == 0) {
2457 pagedep = dap->da_pagedep;
2458 } else {
2459 dirrem = dap->da_previous;
2460 pagedep = dirrem->dm_pagedep;
2461 dirrem->dm_dirinum = pagedep->pd_ino;
2462 add_to_worklist(&dirrem->dm_list);
2463 }
2464 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2465 0, &inodedep) != 0)
2466 (void) free_inodedep(inodedep);
2467 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2468 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2469 nextmd = LIST_NEXT(mkdir, md_mkdirs);
2470 if (mkdir->md_diradd != dap)
2471 continue;
2472 dap->da_state &= ~mkdir->md_state;
2473 WORKLIST_REMOVE(&mkdir->md_list);
2474 LIST_REMOVE(mkdir, md_mkdirs);
2475 WORKITEM_FREE(mkdir, D_MKDIR);
2476 }
2477 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2478 FREE_LOCK(&lk);
2479 panic("free_diradd: unfound ref");
2480 }
2481 }
2482 WORKITEM_FREE(dap, D_DIRADD);
2483}
2484
2485/*
2486 * Directory entry removal dependencies.
2487 *
2488 * When removing a directory entry, the entry's inode pointer must be
2489 * zero'ed on disk before the corresponding inode's link count is decremented
2490 * (possibly freeing the inode for re-use). This dependency is handled by
2491 * updating the directory entry but delaying the inode count reduction until
2492 * after the directory block has been written to disk. After this point, the
2493 * inode count can be decremented whenever it is convenient.
2494 */
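/*
 * A minimal sketch of the ordering rule above (not part of
 * ffs_softdep.c; write_dirblock() and drop_link() are hypothetical):
 *
 *	ep->d_ino = 0;		clear the entry in memory
 *	write_dirblock(bp);	the zeroed entry must reach the disk first
 *	drop_link(ip);		only then decrement (and possibly free) the inode
 *
 * Decrementing first could free and reuse the inode while an on-disk
 * directory entry still points at it; the routines below instead queue
 * a dirrem work item that runs after the directory block is written.
 */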
2495
2496/*
2497 * This routine should be called immediately after removing
2498 * a directory entry. The inode's link count should not be
2499 * decremented by the calling procedure -- the soft updates
2500 * code will do this task when it is safe.
2501 */
2502void
2503softdep_setup_remove(bp, dp, ip, isrmdir)
2504 struct buf *bp; /* buffer containing directory block */
2505 struct inode *dp; /* inode for the directory being modified */
2506 struct inode *ip; /* inode for directory entry being removed */
2507 int isrmdir; /* indicates if doing RMDIR */
2508{
2509 struct dirrem *dirrem, *prevdirrem;
2510
2511 /*
2512 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2513 */
2514 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2515
2516 /*
2517 * If the COMPLETE flag is clear, then there were no active
2518 * entries and we want to roll back to a zeroed entry until
2519 * the new inode is committed to disk. If the COMPLETE flag is
2520 * set then we have deleted an entry that never made it to
2521 * disk. If the entry we deleted resulted from a name change,
2522 * then the old name still resides on disk. We cannot delete
2523 * its inode (returned to us in prevdirrem) until the zeroed
2524 * directory entry gets to disk. The new inode has never been
2525 * referenced on the disk, so can be deleted immediately.
2526 */
2527 if ((dirrem->dm_state & COMPLETE) == 0) {
2528 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2529 dm_next);
2530 FREE_LOCK(&lk);
2531 } else {
2532 if (prevdirrem != NULL)
2533 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2534 prevdirrem, dm_next);
2535 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2536 FREE_LOCK(&lk);
2537 handle_workitem_remove(dirrem);
2538 }
2539}
2540
2541/*
2542 * Allocate a new dirrem if appropriate and return it along with
2543 * its associated pagedep. Called without a lock, returns with lock.
2544 */
2545static long num_dirrem; /* number of dirrem allocated */
2546static struct dirrem *
2547newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2548 struct buf *bp; /* buffer containing directory block */
2549 struct inode *dp; /* inode for the directory being modified */
2550 struct inode *ip; /* inode for directory entry being removed */
2551 int isrmdir; /* indicates if doing RMDIR */
2552 struct dirrem **prevdirremp; /* previously referenced inode, if any */
2553{
2554 int offset;
2555 ufs_lbn_t lbn;
2556 struct diradd *dap;
2557 struct dirrem *dirrem;
2558 struct pagedep *pagedep;
2559
2560 /*
2561 * Whiteouts have no deletion dependencies.
2562 */
2563 if (ip == NULL)
2564 panic("newdirrem: whiteout");
2565 /*
2566 * If we are over our limit, try to improve the situation.
2567 * Limiting the number of dirrem structures will also limit
2568 * the number of freefile and freeblks structures.
2569 */
2570 if (num_dirrem > max_softdeps / 2)
2571 (void) request_cleanup(FLUSH_REMOVE, 0);
2572 num_dirrem += 1;
2573 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2574 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
2575 dirrem->dm_list.wk_type = D_DIRREM;
2576 dirrem->dm_state = isrmdir ? RMDIR : 0;
2577 dirrem->dm_mnt = ITOV(ip)->v_mount;
2578 dirrem->dm_oldinum = ip->i_number;
2579 *prevdirremp = NULL;
2580
2581 ACQUIRE_LOCK(&lk);
2582 lbn = lblkno(dp->i_fs, dp->i_offset);
2583 offset = blkoff(dp->i_fs, dp->i_offset);
2584 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2585 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2586 dirrem->dm_pagedep = pagedep;
2587 /*
2588 * Check for a diradd dependency for the same directory entry.
2589 * If present, then both dependencies become obsolete and can
2590 * be de-allocated. Check for an entry on both the pd_diraddhd
2591 * list and the pd_pendinghd list.
2592 */
2593
2594 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2595 if (dap->da_offset == offset)
2596 break;
2597 if (dap == NULL) {
2598
2599 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2600 if (dap->da_offset == offset)
2601 break;
2602 if (dap == NULL)
2603 return (dirrem);
2604 }
2605 /*
2606 * Must be ATTACHED at this point.
2607 */
2608 if ((dap->da_state & ATTACHED) == 0) {
2609 FREE_LOCK(&lk);
2610 panic("newdirrem: not ATTACHED");
2611 }
2612 if (dap->da_newinum != ip->i_number) {
2613 FREE_LOCK(&lk);
2614 panic("newdirrem: inum %d should be %d",
2615 ip->i_number, dap->da_newinum);
2616 }
2617 /*
2618 * If we are deleting a changed name that never made it to disk,
2619 * then return the dirrem describing the previous inode (which
2620 * represents the inode currently referenced from this entry on disk).
2621 */
2622 if ((dap->da_state & DIRCHG) != 0) {
2623 *prevdirremp = dap->da_previous;
2624 dap->da_state &= ~DIRCHG;
2625 dap->da_pagedep = pagedep;
2626 }
2627 /*
2628 * We are deleting an entry that never made it to disk.
2629 * Mark it COMPLETE so we can delete its inode immediately.
2630 */
2631 dirrem->dm_state |= COMPLETE;
2632 free_diradd(dap);
2633 return (dirrem);
2634}
2635
2636/*
2637 * Directory entry change dependencies.
2638 *
2639 * Changing an existing directory entry requires that an add operation
2640 * be completed first followed by a deletion. The semantics for the addition
2641 * are identical to the description of adding a new entry above except
2642 * that the rollback is to the old inode number rather than zero. Once
2643 * the addition dependency is completed, the removal is done as described
2644 * in the removal routine above.
2645 */
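/*
 * A minimal sketch of how this differs from the addition case (not
 * part of ffs_softdep.c; names are hypothetical): the rollback value
 * for an unwritten new inode is the old inode number rather than zero,
 *
 *	oldinum = ep->d_ino;		entry currently names the old inode
 *	ep->d_ino = newinum;		in-memory change
 *	if (!inode_is_on_disk(newinum))
 *		rollback_value = oldinum;	not 0, as it would be for a create
 *
 * so a crash can only leave the entry naming the old, still-valid
 * inode. Once the new inode is safe on disk, removal of the old one
 * proceeds as described in the removal routine above.
 */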
2646
2647/*
2648 * This routine should be called immediately after changing
2649 * a directory entry. The inode's link count should not be
2650 * decremented by the calling procedure -- the soft updates
2651 * code will perform this task when it is safe.
2652 */
2653void
2654softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2655 struct buf *bp; /* buffer containing directory block */
2656 struct inode *dp; /* inode for the directory being modified */
2657 struct inode *ip; /* inode for directory entry being removed */
2658 long newinum; /* new inode number for changed entry */
2659 int isrmdir; /* indicates if doing RMDIR */
2660{
2661 int offset;
2662 struct diradd *dap = NULL;
2663 struct dirrem *dirrem, *prevdirrem;
2664 struct pagedep *pagedep;
2665 struct inodedep *inodedep;
2666
2667 offset = blkoff(dp->i_fs, dp->i_offset);
2668
2669 /*
2670 * Whiteouts do not need diradd dependencies.
2671 */
2672 if (newinum != WINO) {
2673 MALLOC(dap, struct diradd *, sizeof(struct diradd),
2674 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
2675 dap->da_list.wk_type = D_DIRADD;
2676 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2677 dap->da_offset = offset;
2678 dap->da_newinum = newinum;
2679 }
2680
2681 /*
2682 * Allocate a new dirrem and ACQUIRE_LOCK.
2683 */
2684 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2685 pagedep = dirrem->dm_pagedep;
2686 /*
2687 * The possible values for isrmdir:
2688 * 0 - non-directory file rename
2689 * 1 - directory rename within same directory
2690 * inum - directory rename to new directory of given inode number
2691 * When renaming to a new directory, we are both deleting and
2692 * creating a new directory entry, so the link count on the new
2693 * directory should not change. Thus we do not need the followup
2694 * dirrem which is usually done in handle_workitem_remove. We set
2695 * the DIRCHG flag to tell handle_workitem_remove to skip the
2696 * followup dirrem.
2697 */
2698 if (isrmdir > 1)
2699 dirrem->dm_state |= DIRCHG;
2700
2701 /*
2702 * Whiteouts have no additional dependencies,
2703 * so just put the dirrem on the correct list.
2704 */
2705 if (newinum == WINO) {
2706 if ((dirrem->dm_state & COMPLETE) == 0) {
2707 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2708 dm_next);
2709 } else {
2710 dirrem->dm_dirinum = pagedep->pd_ino;
2711 add_to_worklist(&dirrem->dm_list);
2712 }
2713 FREE_LOCK(&lk);
2714 return;
2715 }
2716
2717 /*
2718 * If the COMPLETE flag is clear, then there were no active
2719 * entries and we want to roll back to the previous inode until
2720 * the new inode is committed to disk. If the COMPLETE flag is
2721 * set, then we have deleted an entry that never made it to disk.
2722 * If the entry we deleted resulted from a name change, then the old
2723 * inode reference still resides on disk. Any rollback that we do
2724 * needs to be to that old inode (returned to us in prevdirrem). If
2725 * the entry we deleted resulted from a create, then there is
2726 * no entry on the disk, so we want to roll back to zero rather
2727 * than the uncommitted inode. In either of the COMPLETE cases we
2728 * want to immediately free the unwritten and unreferenced inode.
2729 */
2730 if ((dirrem->dm_state & COMPLETE) == 0) {
2731 dap->da_previous = dirrem;
2732 } else {
2733 if (prevdirrem != NULL) {
2734 dap->da_previous = prevdirrem;
2735 } else {
2736 dap->da_state &= ~DIRCHG;
2737 dap->da_pagedep = pagedep;
2738 }
2739 dirrem->dm_dirinum = pagedep->pd_ino;
2740 add_to_worklist(&dirrem->dm_list);
2741 }
2742 /*
2743 * Link into its inodedep. Put it on the id_bufwait list if the inode
2744 * is not yet written. If it is written, do the post-inode write
2745 * processing to put it on the id_pendinghd list.
2746 */
2747 if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2748 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2749 dap->da_state |= COMPLETE;
2750 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2751 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2752 } else {
2753 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2754 dap, da_pdlist);
2755 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2756 }
2757 FREE_LOCK(&lk);
2758}
2759
2760/*
2761 * Called whenever the link count on an inode is changed.
2762 * It creates an inode dependency so that the new reference(s)
2763 * to the inode cannot be committed to disk until the updated
2764 * inode has been written.
2765 */
2766void
2767softdep_change_linkcnt(ip)
2768 struct inode *ip; /* the inode with the increased link count */
2769{
2770 struct inodedep *inodedep;
2771
2772 ACQUIRE_LOCK(&lk);
2773 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2774 if (ip->i_nlink < ip->i_effnlink) {
2775 FREE_LOCK(&lk);
2776 panic("softdep_change_linkcnt: bad delta");
2777 }
2778 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2779 FREE_LOCK(&lk);
2780}
2781
2782/*
2245 *countp += nblocks;
2246 }
2247 bp->b_flags |= B_INVAL | B_NOCACHE;
2248 brelse(bp);
2249 return (allerror);
2250}
2251
2252/*
2253 * Free an allocindir.
2254 * This routine must be called with splbio interrupts blocked.
2255 */
2256static void
2257free_allocindir(aip, inodedep)
2258 struct allocindir *aip;
2259 struct inodedep *inodedep;
2260{
2261 struct freefrag *freefrag;
2262
2263#ifdef DEBUG
2264 if (lk.lkt_held == -1)
2265 panic("free_allocindir: lock not held");
2266#endif
2267 if ((aip->ai_state & DEPCOMPLETE) == 0)
2268 LIST_REMOVE(aip, ai_deps);
2269 if (aip->ai_state & ONWORKLIST)
2270 WORKLIST_REMOVE(&aip->ai_list);
2271 LIST_REMOVE(aip, ai_next);
2272 if ((freefrag = aip->ai_freefrag) != NULL) {
2273 if (inodedep == NULL)
2274 add_to_worklist(&freefrag->ff_list);
2275 else
2276 WORKLIST_INSERT(&inodedep->id_bufwait,
2277 &freefrag->ff_list);
2278 }
2279 WORKITEM_FREE(aip, D_ALLOCINDIR);
2280}
2281
2282/*
2283 * Directory entry addition dependencies.
2284 *
2285 * When adding a new directory entry, the inode (with its incremented link
2286 * count) must be written to disk before the directory entry's pointer to it.
2287 * Also, if the inode is newly allocated, the corresponding freemap must be
2288 * updated (on disk) before the directory entry's pointer. These requirements
2289 * are met via undo/redo on the directory entry's pointer, which consists
2290 * simply of the inode number.
2291 *
2292 * As directory entries are added and deleted, the free space within a
2293 * directory block can become fragmented. The ufs file system will compact
2294 * a fragmented directory block to make space for a new entry. When this
2295 * occurs, the offsets of previously added entries change. Any "diradd"
2296 * dependency structures corresponding to these entries must be updated with
2297 * the new offsets.
2298 */
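
The undo/redo on the entry's inode number can be modeled outside the kernel. The sketch below is a simplified user-space illustration; the structure layouts and helper names are invented for the example and are not the kernel's. Before a directory block is handed to the driver, any entry whose inode is not yet safely on disk is rolled back; once the write completes, it is rolled forward.

#include <stdio.h>

struct dir_entry { unsigned d_ino; char d_name[16]; };
struct dep	 { int slot; unsigned new_ino; unsigned old_ino; int safe; };

/* Undo: before the block goes to the driver, roll unsafe entries back. */
static void
initiate_write(struct dir_entry *blk, struct dep *deps, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (!deps[i].safe)
			blk[deps[i].slot].d_ino = deps[i].old_ino; /* 0 for a create */
}

/* Redo: once the write has completed, roll the entries forward again. */
static void
write_complete(struct dir_entry *blk, struct dep *deps, int n)
{
	int i;

	for (i = 0; i < n; i++)
		blk[deps[i].slot].d_ino = deps[i].new_ino;
}

int
main(void)
{
	struct dir_entry blk[2] = { { 0, "newfile" }, { 17, "oldfile" } };
	struct dep deps[1] = { { 0, 42, 0, 0 } }; /* slot 0 names uncommitted inode 42 */

	initiate_write(blk, deps, 1);
	printf("on disk:  %s -> %u\n", blk[0].d_name, blk[0].d_ino); /* 0, safe */
	write_complete(blk, deps, 1);
	printf("in core:  %s -> %u\n", blk[0].d_name, blk[0].d_ino); /* 42, current */
	return (0);
}
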
2299
2300/*
2301 * This routine is called after the in-memory inode's link
2302 * count has been incremented, but before the directory entry's
2303 * pointer to the inode has been set.
2304 */
2305void
2306softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2307 struct buf *bp; /* buffer containing directory block */
2308 struct inode *dp; /* inode for directory */
2309 off_t diroffset; /* offset of new entry in directory */
2310 long newinum; /* inode referenced by new directory entry */
2311 struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2312{
2313 int offset; /* offset of new entry within directory block */
2314 ufs_lbn_t lbn; /* block in directory containing new entry */
2315 struct fs *fs;
2316 struct diradd *dap;
2317 struct pagedep *pagedep;
2318 struct inodedep *inodedep;
2319 struct mkdir *mkdir1, *mkdir2;
2320
2321 /*
2322 * Whiteouts have no dependencies.
2323 */
2324 if (newinum == WINO) {
2325 if (newdirbp != NULL)
2326 bdwrite(newdirbp);
2327 return;
2328 }
2329
2330 fs = dp->i_fs;
2331 lbn = lblkno(fs, diroffset);
2332 offset = blkoff(fs, diroffset);
2333 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2334 M_SOFTDEP_FLAGS|M_ZERO);
2335 dap->da_list.wk_type = D_DIRADD;
2336 dap->da_offset = offset;
2337 dap->da_newinum = newinum;
2338 dap->da_state = ATTACHED;
2339 if (newdirbp == NULL) {
2340 dap->da_state |= DEPCOMPLETE;
2341 ACQUIRE_LOCK(&lk);
2342 } else {
2343 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2344 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2345 M_SOFTDEP_FLAGS);
2346 mkdir1->md_list.wk_type = D_MKDIR;
2347 mkdir1->md_state = MKDIR_BODY;
2348 mkdir1->md_diradd = dap;
2349 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2350 M_SOFTDEP_FLAGS);
2351 mkdir2->md_list.wk_type = D_MKDIR;
2352 mkdir2->md_state = MKDIR_PARENT;
2353 mkdir2->md_diradd = dap;
2354 /*
2355 * Dependency on "." and ".." being written to disk.
2356 */
2357 mkdir1->md_buf = newdirbp;
2358 ACQUIRE_LOCK(&lk);
2359 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2360 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2361 FREE_LOCK(&lk);
2362 bdwrite(newdirbp);
2363 /*
2364 * Dependency on link count increase for parent directory
2365 */
2366 ACQUIRE_LOCK(&lk);
2367 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2368 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2369 dap->da_state &= ~MKDIR_PARENT;
2370 WORKITEM_FREE(mkdir2, D_MKDIR);
2371 } else {
2372 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2373 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2374 }
2375 }
2376 /*
2377 * Link into parent directory pagedep to await its being written.
2378 */
2379 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2380 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2381 dap->da_pagedep = pagedep;
2382 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2383 da_pdlist);
2384 /*
2385 * Link into its inodedep. Put it on the id_bufwait list if the inode
2386 * is not yet written. If it is written, do the post-inode write
2387 * processing to put it on the id_pendinghd list.
2388 */
2389 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2390 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2391 diradd_inode_written(dap, inodedep);
2392 else
2393 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2394 FREE_LOCK(&lk);
2395}
2396
2397/*
2398 * This procedure is called to change the offset of a directory
2399 * entry when compacting a directory block which must be owned
2400 * exclusively by the caller. Note that the actual entry movement
2401 * must be done in this procedure to ensure that no I/O completions
2402 * occur while the move is in progress.
2403 */
2404void
2405softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2406 struct inode *dp; /* inode for directory */
2407 caddr_t base; /* address of dp->i_offset */
2408 caddr_t oldloc; /* address of old directory location */
2409 caddr_t newloc; /* address of new directory location */
2410 int entrysize; /* size of directory entry */
2411{
2412 int offset, oldoffset, newoffset;
2413 struct pagedep *pagedep;
2414 struct diradd *dap;
2415 ufs_lbn_t lbn;
2416
2417 ACQUIRE_LOCK(&lk);
2418 lbn = lblkno(dp->i_fs, dp->i_offset);
2419 offset = blkoff(dp->i_fs, dp->i_offset);
2420 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2421 goto done;
2422 oldoffset = offset + (oldloc - base);
2423 newoffset = offset + (newloc - base);
2424
2425 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2426 if (dap->da_offset != oldoffset)
2427 continue;
2428 dap->da_offset = newoffset;
2429 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2430 break;
2431 LIST_REMOVE(dap, da_pdlist);
2432 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2433 dap, da_pdlist);
2434 break;
2435 }
2436 if (dap == NULL) {
2437
2438 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2439 if (dap->da_offset == oldoffset) {
2440 dap->da_offset = newoffset;
2441 break;
2442 }
2443 }
2444 }
2445done:
2446 bcopy(oldloc, newloc, entrysize);
2447 FREE_LOCK(&lk);
2448}
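
A small illustration of why the routine above compares the two hash values: a diradd lives in a bucket chosen from its offset, so a move within the same bucket only needs da_offset updated, while a move to a different bucket also requires relinking. The bucket function and DAHASHSZ value below are assumptions made for the example; the real definitions live in softdep.h.

#include <stdio.h>

#define DAHASHSZ	6				/* illustrative */
#define DIRADDHASH(off)	(((off) >> 2) % DAHASHSZ)	/* assumed form */

int
main(void)
{
	int oldoffset = 12, newoffset = 500;	/* entry moved by compaction */

	if (DIRADDHASH(oldoffset) != DIRADDHASH(newoffset))
		printf("relink from bucket %d to bucket %d\n",
		    DIRADDHASH(oldoffset), DIRADDHASH(newoffset));
	else
		printf("same bucket %d, only da_offset changes\n",
		    DIRADDHASH(oldoffset));
	return (0);
}
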
2449
2450/*
2451 * Free a diradd dependency structure. This routine must be called
2452 * with splbio interrupts blocked.
2453 */
2454static void
2455free_diradd(dap)
2456 struct diradd *dap;
2457{
2458 struct dirrem *dirrem;
2459 struct pagedep *pagedep;
2460 struct inodedep *inodedep;
2461 struct mkdir *mkdir, *nextmd;
2462
2463#ifdef DEBUG
2464 if (lk.lkt_held == -1)
2465 panic("free_diradd: lock not held");
2466#endif
2467 WORKLIST_REMOVE(&dap->da_list);
2468 LIST_REMOVE(dap, da_pdlist);
2469 if ((dap->da_state & DIRCHG) == 0) {
2470 pagedep = dap->da_pagedep;
2471 } else {
2472 dirrem = dap->da_previous;
2473 pagedep = dirrem->dm_pagedep;
2474 dirrem->dm_dirinum = pagedep->pd_ino;
2475 add_to_worklist(&dirrem->dm_list);
2476 }
2477 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2478 0, &inodedep) != 0)
2479 (void) free_inodedep(inodedep);
2480 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2481 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2482 nextmd = LIST_NEXT(mkdir, md_mkdirs);
2483 if (mkdir->md_diradd != dap)
2484 continue;
2485 dap->da_state &= ~mkdir->md_state;
2486 WORKLIST_REMOVE(&mkdir->md_list);
2487 LIST_REMOVE(mkdir, md_mkdirs);
2488 WORKITEM_FREE(mkdir, D_MKDIR);
2489 }
2490 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2491 FREE_LOCK(&lk);
2492 panic("free_diradd: unfound ref");
2493 }
2494 }
2495 WORKITEM_FREE(dap, D_DIRADD);
2496}
2497
2498/*
2499 * Directory entry removal dependencies.
2500 *
2501 * When removing a directory entry, the entry's inode pointer must be
2502 * zero'ed on disk before the corresponding inode's link count is decremented
2503 * (possibly freeing the inode for re-use). This dependency is handled by
2504 * updating the directory entry but delaying the inode count reduction until
2505 * after the directory block has been written to disk. After this point, the
2506 * inode count can be decremented whenever it is convenient.
2507 */
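
The ordering constraint can be modeled in user space as a two-step sequence, with the link-count decrement deferred until the zeroed directory block is known to be on disk, mirroring what handle_workitem_remove does later. The structures and helper below are invented for illustration and are not the kernel's.

#include <stdio.h>

struct inode_model	{ int number; int nlink; };
struct dirent_model	{ int d_ino; };

/* Runs only after the zeroed directory block is known to be on disk. */
static void
dir_block_written(struct inode_model *ip)
{

	ip->nlink--;		/* safe: no on-disk name references the inode */
	printf("inode %d link count now %d\n", ip->number, ip->nlink);
}

int
main(void)
{
	struct inode_model ino = { 42, 1 };
	struct dirent_model entry = { 42 };

	entry.d_ino = 0;	/* step 1: zero the entry; this goes to disk first */
	printf("entry now points at inode %d\n", entry.d_ino);
	/* ... directory block write completes here ... */
	dir_block_written(&ino);	/* step 2: deferred link-count decrement */
	return (0);
}
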
2508
2509/*
2510 * This routine should be called immediately after removing
2511 * a directory entry. The inode's link count should not be
2512 * decremented by the calling procedure -- the soft updates
2513 * code will do this task when it is safe.
2514 */
2515void
2516softdep_setup_remove(bp, dp, ip, isrmdir)
2517 struct buf *bp; /* buffer containing directory block */
2518 struct inode *dp; /* inode for the directory being modified */
2519 struct inode *ip; /* inode for directory entry being removed */
2520 int isrmdir; /* indicates if doing RMDIR */
2521{
2522 struct dirrem *dirrem, *prevdirrem;
2523
2524 /*
2525 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2526 */
2527 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2528
2529 /*
2530 * If the COMPLETE flag is clear, then there were no active
2531 * entries and we want to roll back to a zeroed entry until
2532 * the new inode is committed to disk. If the COMPLETE flag is
2533 * set then we have deleted an entry that never made it to
2534 * disk. If the entry we deleted resulted from a name change,
2535 * then the old name still resides on disk. We cannot delete
2536 * its inode (returned to us in prevdirrem) until the zeroed
2537 * directory entry gets to disk. The new inode has never been
2538 * referenced on the disk, so can be deleted immediately.
2539 */
2540 if ((dirrem->dm_state & COMPLETE) == 0) {
2541 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2542 dm_next);
2543 FREE_LOCK(&lk);
2544 } else {
2545 if (prevdirrem != NULL)
2546 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2547 prevdirrem, dm_next);
2548 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2549 FREE_LOCK(&lk);
2550 handle_workitem_remove(dirrem);
2551 }
2552}
2553
2554/*
2555 * Allocate a new dirrem if appropriate and return it along with
2556 * its associated pagedep. Called without a lock, returns with lock.
2557 */
2558static long num_dirrem; /* number of dirrem allocated */
2559static struct dirrem *
2560newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2561 struct buf *bp; /* buffer containing directory block */
2562 struct inode *dp; /* inode for the directory being modified */
2563 struct inode *ip; /* inode for directory entry being removed */
2564 int isrmdir; /* indicates if doing RMDIR */
2565 struct dirrem **prevdirremp; /* previously referenced inode, if any */
2566{
2567 int offset;
2568 ufs_lbn_t lbn;
2569 struct diradd *dap;
2570 struct dirrem *dirrem;
2571 struct pagedep *pagedep;
2572
2573 /*
2574 * Whiteouts have no deletion dependencies.
2575 */
2576 if (ip == NULL)
2577 panic("newdirrem: whiteout");
2578 /*
2579 * If we are over our limit, try to improve the situation.
2580 * Limiting the number of dirrem structures will also limit
2581 * the number of freefile and freeblks structures.
2582 */
2583 if (num_dirrem > max_softdeps / 2)
2584 (void) request_cleanup(FLUSH_REMOVE, 0);
2585 num_dirrem += 1;
2586 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2587 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
2588 dirrem->dm_list.wk_type = D_DIRREM;
2589 dirrem->dm_state = isrmdir ? RMDIR : 0;
2590 dirrem->dm_mnt = ITOV(ip)->v_mount;
2591 dirrem->dm_oldinum = ip->i_number;
2592 *prevdirremp = NULL;
2593
2594 ACQUIRE_LOCK(&lk);
2595 lbn = lblkno(dp->i_fs, dp->i_offset);
2596 offset = blkoff(dp->i_fs, dp->i_offset);
2597 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2598 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2599 dirrem->dm_pagedep = pagedep;
2600 /*
2601 * Check for a diradd dependency for the same directory entry.
2602 * If present, then both dependencies become obsolete and can
2603	 * be de-allocated. Check for an entry on both the pd_diraddhd
2604 * list and the pd_pendinghd list.
2605 */
2606
2607 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2608 if (dap->da_offset == offset)
2609 break;
2610 if (dap == NULL) {
2611
2612 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2613 if (dap->da_offset == offset)
2614 break;
2615 if (dap == NULL)
2616 return (dirrem);
2617 }
2618 /*
2619 * Must be ATTACHED at this point.
2620 */
2621 if ((dap->da_state & ATTACHED) == 0) {
2622 FREE_LOCK(&lk);
2623 panic("newdirrem: not ATTACHED");
2624 }
2625 if (dap->da_newinum != ip->i_number) {
2626 FREE_LOCK(&lk);
2627 panic("newdirrem: inum %d should be %d",
2628 ip->i_number, dap->da_newinum);
2629 }
2630 /*
2631 * If we are deleting a changed name that never made it to disk,
2632 * then return the dirrem describing the previous inode (which
2633 * represents the inode currently referenced from this entry on disk).
2634 */
2635 if ((dap->da_state & DIRCHG) != 0) {
2636 *prevdirremp = dap->da_previous;
2637 dap->da_state &= ~DIRCHG;
2638 dap->da_pagedep = pagedep;
2639 }
2640 /*
2641 * We are deleting an entry that never made it to disk.
2642 * Mark it COMPLETE so we can delete its inode immediately.
2643 */
2644 dirrem->dm_state |= COMPLETE;
2645 free_diradd(dap);
2646 return (dirrem);
2647}
2648
2649/*
2650 * Directory entry change dependencies.
2651 *
2652 * Changing an existing directory entry requires that an add operation
2653	 * be completed first, followed by a deletion. The semantics for the addition
2654 * are identical to the description of adding a new entry above except
2655 * that the rollback is to the old inode number rather than zero. Once
2656 * the addition dependency is completed, the removal is done as described
2657 * in the removal routine above.
2658 */
2659
2660/*
2661 * This routine should be called immediately after changing
2662 * a directory entry. The inode's link count should not be
2663 * decremented by the calling procedure -- the soft updates
2664 * code will perform this task when it is safe.
2665 */
2666void
2667softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2668 struct buf *bp; /* buffer containing directory block */
2669 struct inode *dp; /* inode for the directory being modified */
2670 struct inode *ip; /* inode for directory entry being removed */
2671 long newinum; /* new inode number for changed entry */
2672 int isrmdir; /* indicates if doing RMDIR */
2673{
2674 int offset;
2675 struct diradd *dap = NULL;
2676 struct dirrem *dirrem, *prevdirrem;
2677 struct pagedep *pagedep;
2678 struct inodedep *inodedep;
2679
2680 offset = blkoff(dp->i_fs, dp->i_offset);
2681
2682 /*
2683 * Whiteouts do not need diradd dependencies.
2684 */
2685 if (newinum != WINO) {
2686 MALLOC(dap, struct diradd *, sizeof(struct diradd),
2687 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
2688 dap->da_list.wk_type = D_DIRADD;
2689 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2690 dap->da_offset = offset;
2691 dap->da_newinum = newinum;
2692 }
2693
2694 /*
2695 * Allocate a new dirrem and ACQUIRE_LOCK.
2696 */
2697 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2698 pagedep = dirrem->dm_pagedep;
2699 /*
2700 * The possible values for isrmdir:
2701 * 0 - non-directory file rename
2702 * 1 - directory rename within same directory
2703 * inum - directory rename to new directory of given inode number
2704 * When renaming to a new directory, we are both deleting and
2705 * creating a new directory entry, so the link count on the new
2706 * directory should not change. Thus we do not need the followup
2707 * dirrem which is usually done in handle_workitem_remove. We set
2708 * the DIRCHG flag to tell handle_workitem_remove to skip the
2709 * followup dirrem.
2710 */
2711 if (isrmdir > 1)
2712 dirrem->dm_state |= DIRCHG;
2713
2714 /*
2715 * Whiteouts have no additional dependencies,
2716 * so just put the dirrem on the correct list.
2717 */
2718 if (newinum == WINO) {
2719 if ((dirrem->dm_state & COMPLETE) == 0) {
2720 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2721 dm_next);
2722 } else {
2723 dirrem->dm_dirinum = pagedep->pd_ino;
2724 add_to_worklist(&dirrem->dm_list);
2725 }
2726 FREE_LOCK(&lk);
2727 return;
2728 }
2729
2730 /*
2731 * If the COMPLETE flag is clear, then there were no active
2732 * entries and we want to roll back to the previous inode until
2733 * the new inode is committed to disk. If the COMPLETE flag is
2734 * set, then we have deleted an entry that never made it to disk.
2735 * If the entry we deleted resulted from a name change, then the old
2736 * inode reference still resides on disk. Any rollback that we do
2737 * needs to be to that old inode (returned to us in prevdirrem). If
2738 * the entry we deleted resulted from a create, then there is
2739 * no entry on the disk, so we want to roll back to zero rather
2740 * than the uncommitted inode. In either of the COMPLETE cases we
2741 * want to immediately free the unwritten and unreferenced inode.
2742 */
2743 if ((dirrem->dm_state & COMPLETE) == 0) {
2744 dap->da_previous = dirrem;
2745 } else {
2746 if (prevdirrem != NULL) {
2747 dap->da_previous = prevdirrem;
2748 } else {
2749 dap->da_state &= ~DIRCHG;
2750 dap->da_pagedep = pagedep;
2751 }
2752 dirrem->dm_dirinum = pagedep->pd_ino;
2753 add_to_worklist(&dirrem->dm_list);
2754 }
2755 /*
2756 * Link into its inodedep. Put it on the id_bufwait list if the inode
2757 * is not yet written. If it is written, do the post-inode write
2758 * processing to put it on the id_pendinghd list.
2759 */
2760 if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2761 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2762 dap->da_state |= COMPLETE;
2763 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2764 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2765 } else {
2766 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2767 dap, da_pdlist);
2768 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2769 }
2770 FREE_LOCK(&lk);
2771}
2772
2773/*
2774 * Called whenever the link count on an inode is changed.
2775 * It creates an inode dependency so that the new reference(s)
2776 * to the inode cannot be committed to disk until the updated
2777 * inode has been written.
2778 */
2779void
2780softdep_change_linkcnt(ip)
2781 struct inode *ip; /* the inode with the increased link count */
2782{
2783 struct inodedep *inodedep;
2784
2785 ACQUIRE_LOCK(&lk);
2786 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2787 if (ip->i_nlink < ip->i_effnlink) {
2788 FREE_LOCK(&lk);
2789 panic("softdep_change_linkcnt: bad delta");
2790 }
2791 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2792 FREE_LOCK(&lk);
2793}
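
A worked example of the delta recorded above, under the usual soft updates convention that i_effnlink reflects uncommitted changes while i_nlink tracks what the next inode write will put on disk; the values are hypothetical.

#include <stdio.h>

int
main(void)
{
	/* Hypothetical file with two names, one of which was just unlinked. */
	int i_nlink = 2;	/* what the next inode write will put on disk */
	int i_effnlink = 1;	/* effective count after the uncommitted unlink */

	if (i_nlink < i_effnlink)
		return (1);	/* the "bad delta" case panicked on above */
	printf("id_nlinkdelta = %d\n", i_nlink - i_effnlink);	/* 1 */
	return (0);
}
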
2794
2795/*
2796 * Called when the effective link count and the reference count
2797 * on an inode drops to zero. At this point there are no names
2798 * referencing the file in the filesystem and no active file
2799 * references. The space associated with the file will be freed
2800 * as soon as the necessary soft dependencies are cleared.
2801 */
2802void
2803softdep_releasefile(ip)
2804 struct inode *ip; /* inode with the zero effective link count */
2805{
2806 struct inodedep *inodedep;
2807
2808 if (ip->i_effnlink > 0)
2809	 panic("softdep_releasefile: file still referenced");
2810 /*
2811 * We may be called several times as the real reference count
2812 * drops to zero. We only want to account for the space once.
2813 */
2814 if (ip->i_flag & IN_SPACECOUNTED)
2815 return;
2816 /*
2817 * We have to deactivate a snapshot otherwise copyonwrites may
2818 * add blocks and the cleanup may remove blocks after we have
2819 * tried to account for them.
2820 */
2821 if ((ip->i_flags & SF_SNAPSHOT) != 0)
2822 ffs_snapremove(ITOV(ip));
2823 /*
2824 * If we are tracking an nlinkdelta, we have to also remember
2825 * whether we accounted for the freed space yet.
2826 */
2827 ACQUIRE_LOCK(&lk);
2828 if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
2829 inodedep->id_state |= SPACECOUNTED;
2830 FREE_LOCK(&lk);
2831 ip->i_fs->fs_pendingblocks += ip->i_blocks;
2832 ip->i_fs->fs_pendinginodes += 1;
2833 ip->i_flag |= IN_SPACECOUNTED;
2834}
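
The count-once pattern used above can be sketched in user space; the flag value and structure layouts below are illustrative stand-ins rather than the kernel's definitions.

#include <stdio.h>

#define IN_SPACECOUNTED	0x0080	/* illustrative value */

struct fs_model		{ long pendingblocks; long pendinginodes; };
struct inode_model	{ int flag; long blocks; struct fs_model *fs; };

static void
release_file(struct inode_model *ip)
{

	if (ip->flag & IN_SPACECOUNTED)		/* already accounted for */
		return;
	ip->fs->pendingblocks += ip->blocks;
	ip->fs->pendinginodes += 1;
	ip->flag |= IN_SPACECOUNTED;
}

int
main(void)
{
	struct fs_model fs = { 0, 0 };
	struct inode_model ip = { 0, 16, &fs };

	release_file(&ip);
	release_file(&ip);			/* second call is a no-op */
	printf("pending: %ld blocks, %ld inodes\n",
	    fs.pendingblocks, fs.pendinginodes);
	return (0);
}
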
2835
2836/*
2783 * This workitem decrements the inode's link count.
2784 * If the link count reaches zero, the file is removed.
2785 */
2786static void
2787handle_workitem_remove(dirrem)
2788 struct dirrem *dirrem;
2789{
2790 struct proc *p = CURPROC; /* XXX */
2791 struct inodedep *inodedep;
2792 struct vnode *vp;
2793 struct inode *ip;
2794 ino_t oldinum;
2795 int error;
2796
2797 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2798 softdep_error("handle_workitem_remove: vget", error);
2799 return;
2800 }
2801 ip = VTOI(vp);
2802 ACQUIRE_LOCK(&lk);
2803 if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
2804 FREE_LOCK(&lk);
2805 panic("handle_workitem_remove: lost inodedep");
2806 }
2807 /*
2808 * Normal file deletion.
2809 */
2810 if ((dirrem->dm_state & RMDIR) == 0) {
2811 ip->i_nlink--;
2812 ip->i_flag |= IN_CHANGE;
2813 if (ip->i_nlink < ip->i_effnlink) {
2814 FREE_LOCK(&lk);
2815 panic("handle_workitem_remove: bad file delta");
2816 }
2817 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2818 FREE_LOCK(&lk);
2819 vput(vp);
2820 num_dirrem -= 1;
2821 WORKITEM_FREE(dirrem, D_DIRREM);
2822 return;
2823 }
2824 /*
2825 * Directory deletion. Decrement reference count for both the
2826 * just deleted parent directory entry and the reference for ".".
2827 * Next truncate the directory to length zero. When the
2828 * truncation completes, arrange to have the reference count on
2829 * the parent decremented to account for the loss of "..".
2830 */
2831 ip->i_nlink -= 2;
2832 ip->i_flag |= IN_CHANGE;
2833 if (ip->i_nlink < ip->i_effnlink) {
2834 FREE_LOCK(&lk);
2835 panic("handle_workitem_remove: bad dir delta");
2836 }
2837 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2838 FREE_LOCK(&lk);
2839 if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2840 softdep_error("handle_workitem_remove: truncate", error);
2841 /*
2842	 * Rename a directory to a new parent. Since we are both deleting
2843 * and creating a new directory entry, the link count on the new
2844 * directory should not change. Thus we skip the followup dirrem.
2845 */
2846 if (dirrem->dm_state & DIRCHG) {
2847 vput(vp);
2848 num_dirrem -= 1;
2849 WORKITEM_FREE(dirrem, D_DIRREM);
2850 return;
2851 }
2852 /*
2853 * If the inodedep does not exist, then the zero'ed inode has
2854 * been written to disk. If the allocated inode has never been
2855 * written to disk, then the on-disk inode is zero'ed. In either
2856 * case we can remove the file immediately.
2857 */
2858 ACQUIRE_LOCK(&lk);
2859 dirrem->dm_state = 0;
2860 oldinum = dirrem->dm_oldinum;
2861 dirrem->dm_oldinum = dirrem->dm_dirinum;
2862 if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
2863 check_inode_unwritten(inodedep)) {
2864 FREE_LOCK(&lk);
2865 vput(vp);
2866 handle_workitem_remove(dirrem);
2867 return;
2868 }
2869 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2870 FREE_LOCK(&lk);
2871 vput(vp);
2872}
2873
2874/*
2875 * Inode de-allocation dependencies.
2876 *
2877 * When an inode's link count is reduced to zero, it can be de-allocated. We
2878 * found it convenient to postpone de-allocation until after the inode is
2879 * written to disk with its new link count (zero). At this point, all of the
2880 * on-disk inode's block pointers are nullified and, with careful dependency
2881 * list ordering, all dependencies related to the inode will be satisfied and
2882 * the corresponding dependency structures de-allocated. So, if/when the
2883 * inode is reused, there will be no mixing of old dependencies with new
2884 * ones. This artificial dependency is set up by the block de-allocation
2885 * procedure above (softdep_setup_freeblocks) and completed by the
2886 * following procedure.
2887 */
2888static void
2889handle_workitem_freefile(freefile)
2890 struct freefile *freefile;
2891{
2892 struct fs *fs;
2893 struct inode tip;
2894 struct inodedep *idp;
2895 int error;
2896
2897 fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
2898#ifdef DEBUG
2899 ACQUIRE_LOCK(&lk);
2900 error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
2901 FREE_LOCK(&lk);
2902 if (error)
2903 panic("handle_workitem_freefile: inodedep survived");
2904#endif
2905 tip.i_devvp = freefile->fx_devvp;
2906 tip.i_dev = freefile->fx_devvp->v_rdev;
2907 tip.i_fs = fs;
2837 * This workitem decrements the inode's link count.
2838 * If the link count reaches zero, the file is removed.
2839 */
2840static void
2841handle_workitem_remove(dirrem)
2842 struct dirrem *dirrem;
2843{
2844 struct proc *p = CURPROC; /* XXX */
2845 struct inodedep *inodedep;
2846 struct vnode *vp;
2847 struct inode *ip;
2848 ino_t oldinum;
2849 int error;
2850
2851 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2852 softdep_error("handle_workitem_remove: vget", error);
2853 return;
2854 }
2855 ip = VTOI(vp);
2856 ACQUIRE_LOCK(&lk);
2857 if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
2858 FREE_LOCK(&lk);
2859 panic("handle_workitem_remove: lost inodedep");
2860 }
2861 /*
2862 * Normal file deletion.
2863 */
2864 if ((dirrem->dm_state & RMDIR) == 0) {
2865 ip->i_nlink--;
2866 ip->i_flag |= IN_CHANGE;
2867 if (ip->i_nlink < ip->i_effnlink) {
2868 FREE_LOCK(&lk);
2869 panic("handle_workitem_remove: bad file delta");
2870 }
2871 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2872 FREE_LOCK(&lk);
2873 vput(vp);
2874 num_dirrem -= 1;
2875 WORKITEM_FREE(dirrem, D_DIRREM);
2876 return;
2877 }
2878 /*
2879 * Directory deletion. Decrement reference count for both the
2880 * just deleted parent directory entry and the reference for ".".
2881 * Next truncate the directory to length zero. When the
2882 * truncation completes, arrange to have the reference count on
2883 * the parent decremented to account for the loss of "..".
2884 */
2885 ip->i_nlink -= 2;
2886 ip->i_flag |= IN_CHANGE;
2887 if (ip->i_nlink < ip->i_effnlink) {
2888 FREE_LOCK(&lk);
2889 panic("handle_workitem_remove: bad dir delta");
2890 }
2891 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2892 FREE_LOCK(&lk);
2893 if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2894 softdep_error("handle_workitem_remove: truncate", error);
2895 /*
2896	 * Rename a directory to a new parent. Since we are both deleting
2897 * and creating a new directory entry, the link count on the new
2898 * directory should not change. Thus we skip the followup dirrem.
2899 */
2900 if (dirrem->dm_state & DIRCHG) {
2901 vput(vp);
2902 num_dirrem -= 1;
2903 WORKITEM_FREE(dirrem, D_DIRREM);
2904 return;
2905 }
2906 /*
2907 * If the inodedep does not exist, then the zero'ed inode has
2908 * been written to disk. If the allocated inode has never been
2909 * written to disk, then the on-disk inode is zero'ed. In either
2910 * case we can remove the file immediately.
2911 */
2912 ACQUIRE_LOCK(&lk);
2913 dirrem->dm_state = 0;
2914 oldinum = dirrem->dm_oldinum;
2915 dirrem->dm_oldinum = dirrem->dm_dirinum;
2916 if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
2917 check_inode_unwritten(inodedep)) {
2918 FREE_LOCK(&lk);
2919 vput(vp);
2920 handle_workitem_remove(dirrem);
2921 return;
2922 }
2923 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2924 FREE_LOCK(&lk);
2925 vput(vp);
2926}
2927
2928/*
2929 * Inode de-allocation dependencies.
2930 *
2931 * When an inode's link count is reduced to zero, it can be de-allocated. We
2932 * found it convenient to postpone de-allocation until after the inode is
2933 * written to disk with its new link count (zero). At this point, all of the
2934 * on-disk inode's block pointers are nullified and, with careful dependency
2935 * list ordering, all dependencies related to the inode will be satisfied and
2936 * the corresponding dependency structures de-allocated. So, if/when the
2937 * inode is reused, there will be no mixing of old dependencies with new
2938 * ones. This artificial dependency is set up by the block de-allocation
2939 * procedure above (softdep_setup_freeblocks) and completed by the
2940 * following procedure.
2941 */
2942static void
2943handle_workitem_freefile(freefile)
2944 struct freefile *freefile;
2945{
2946 struct fs *fs;
2947 struct inode tip;
2948 struct inodedep *idp;
2949 int error;
2950
2951 fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
2952#ifdef DEBUG
2953 ACQUIRE_LOCK(&lk);
2954 error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
2955 FREE_LOCK(&lk);
2956 if (error)
2957 panic("handle_workitem_freefile: inodedep survived");
2958#endif
2959 tip.i_devvp = freefile->fx_devvp;
2960 tip.i_dev = freefile->fx_devvp->v_rdev;
2961 tip.i_fs = fs;
2962 fs->fs_pendinginodes -= 1;
2908 if ((error = ffs_freefile(&tip, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2909 softdep_error("handle_workitem_freefile", error);
2910 WORKITEM_FREE(freefile, D_FREEFILE);
2911}
2912
2913/*
2914 * Disk writes.
2915 *
2916 * The dependency structures constructed above are most actively used when file
2917 * system blocks are written to disk. No constraints are placed on when a
2918 * block can be written, but unsatisfied update dependencies are made safe by
2919 * modifying (or replacing) the source memory for the duration of the disk
2920 * write. When the disk write completes, the memory block is again brought
2921 * up-to-date.
2922 *
2923 * In-core inode structure reclamation.
2924 *
2925 * Because there are a finite number of "in-core" inode structures, they are
2926 * reused regularly. By transferring all inode-related dependencies to the
2927 * in-memory inode block and indexing them separately (via "inodedep"s), we
2928 * can allow "in-core" inode structures to be reused at any time and avoid
2929 * any increase in contention.
2930 *
2931 * Called just before entering the device driver to initiate a new disk I/O.
2932 * The buffer must be locked, thus, no I/O completion operations can occur
2933 * while we are manipulating its associated dependencies.
2934 */
2935static void
2936softdep_disk_io_initiation(bp)
2937 struct buf *bp; /* structure describing disk write to occur */
2938{
2939 struct worklist *wk, *nextwk;
2940 struct indirdep *indirdep;
2941
2942 /*
2943 * We only care about write operations. There should never
2944 * be dependencies for reads.
2945 */
2946 if (bp->b_iocmd == BIO_READ)
2947 panic("softdep_disk_io_initiation: read");
2948 /*
2949 * Do any necessary pre-I/O processing.
2950 */
2951 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2952 nextwk = LIST_NEXT(wk, wk_list);
2953 switch (wk->wk_type) {
2954
2955 case D_PAGEDEP:
2956 initiate_write_filepage(WK_PAGEDEP(wk), bp);
2957 continue;
2958
2959 case D_INODEDEP:
2960 initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2961 continue;
2962
2963 case D_INDIRDEP:
2964 indirdep = WK_INDIRDEP(wk);
2965 if (indirdep->ir_state & GOINGAWAY)
2966 panic("disk_io_initiation: indirdep gone");
2967 /*
2968 * If there are no remaining dependencies, this
2969 * will be writing the real pointers, so the
2970 * dependency can be freed.
2971 */
2972 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2973 indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2974 brelse(indirdep->ir_savebp);
2975 /* inline expand WORKLIST_REMOVE(wk); */
2976 wk->wk_state &= ~ONWORKLIST;
2977 LIST_REMOVE(wk, wk_list);
2978 WORKITEM_FREE(indirdep, D_INDIRDEP);
2979 continue;
2980 }
2981 /*
2982 * Replace up-to-date version with safe version.
2983 */
2984 MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2985 M_INDIRDEP, M_SOFTDEP_FLAGS);
2986 ACQUIRE_LOCK(&lk);
2987 indirdep->ir_state &= ~ATTACHED;
2988 indirdep->ir_state |= UNDONE;
2989 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2990 bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2991 bp->b_bcount);
2992 FREE_LOCK(&lk);
2993 continue;
2994
2995 case D_MKDIR:
2996 case D_BMSAFEMAP:
2997 case D_ALLOCDIRECT:
2998 case D_ALLOCINDIR:
2999 continue;
3000
3001 default:
3002 panic("handle_disk_io_initiation: Unexpected type %s",
3003 TYPENAME(wk->wk_type));
3004 /* NOTREACHED */
3005 }
3006 }
3007}
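
The substitution performed for the D_INDIRDEP case above amounts to a save/swap/restore around the write, undone again in softdep_disk_write_complete below. A minimal user-space sketch follows, with plain arrays standing in for bp->b_data, ir_savebp->b_data and ir_saveddata.

#include <stdio.h>
#include <string.h>

#define BLKSZ	8

int
main(void)
{
	int data[BLKSZ]     = { 1, 2, 3 };	/* up-to-date block pointers */
	int safecopy[BLKSZ] = { 1 };		/* only the committed pointer */
	int saved[BLKSZ];

	/* io initiation: stash the real contents, substitute the safe ones. */
	memcpy(saved, data, sizeof(data));
	memcpy(data, safecopy, sizeof(data));
	printf("written to disk: %d %d %d\n", data[0], data[1], data[2]);

	/* write completion: bring the in-memory block up to date again. */
	memcpy(data, saved, sizeof(data));
	printf("back in memory:  %d %d %d\n", data[0], data[1], data[2]);
	return (0);
}
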
3008
3009/*
3010 * Called from within the procedure above to deal with unsatisfied
3011 * allocation dependencies in a directory. The buffer must be locked,
3012 * thus, no I/O completion operations can occur while we are
3013 * manipulating its associated dependencies.
3014 */
3015static void
3016initiate_write_filepage(pagedep, bp)
3017 struct pagedep *pagedep;
3018 struct buf *bp;
3019{
3020 struct diradd *dap;
3021 struct direct *ep;
3022 int i;
3023
3024 if (pagedep->pd_state & IOSTARTED) {
3025 /*
3026 * This can only happen if there is a driver that does not
3027 * understand chaining. Here biodone will reissue the call
3028 * to strategy for the incomplete buffers.
3029 */
3030 printf("initiate_write_filepage: already started\n");
3031 return;
3032 }
3033 pagedep->pd_state |= IOSTARTED;
3034 ACQUIRE_LOCK(&lk);
3035 for (i = 0; i < DAHASHSZ; i++) {
3036 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3037 ep = (struct direct *)
3038 ((char *)bp->b_data + dap->da_offset);
3039 if (ep->d_ino != dap->da_newinum) {
3040 FREE_LOCK(&lk);
3041 panic("%s: dir inum %d != new %d",
3042 "initiate_write_filepage",
3043 ep->d_ino, dap->da_newinum);
3044 }
3045 if (dap->da_state & DIRCHG)
3046 ep->d_ino = dap->da_previous->dm_oldinum;
3047 else
3048 ep->d_ino = 0;
3049 dap->da_state &= ~ATTACHED;
3050 dap->da_state |= UNDONE;
3051 }
3052 }
3053 FREE_LOCK(&lk);
3054}
3055
3056/*
3057 * Called from within the procedure above to deal with unsatisfied
3058 * allocation dependencies in an inodeblock. The buffer must be
3059 * locked, thus, no I/O completion operations can occur while we
3060 * are manipulating its associated dependencies.
3061 */
3062static void
3063initiate_write_inodeblock(inodedep, bp)
3064 struct inodedep *inodedep;
3065 struct buf *bp; /* The inode block */
3066{
3067 struct allocdirect *adp, *lastadp;
3068 struct dinode *dp;
3069 struct fs *fs;
3070 ufs_lbn_t prevlbn = 0;
3071 int i, deplist;
3072
3073 if (inodedep->id_state & IOSTARTED)
3074 panic("initiate_write_inodeblock: already started");
3075 inodedep->id_state |= IOSTARTED;
3076 fs = inodedep->id_fs;
3077 dp = (struct dinode *)bp->b_data +
3078 ino_to_fsbo(fs, inodedep->id_ino);
3079 /*
3080 * If the bitmap is not yet written, then the allocated
3081 * inode cannot be written to disk.
3082 */
3083 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3084 if (inodedep->id_savedino != NULL)
3085 panic("initiate_write_inodeblock: already doing I/O");
3086 MALLOC(inodedep->id_savedino, struct dinode *,
3087 sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3088 *inodedep->id_savedino = *dp;
3089 bzero((caddr_t)dp, sizeof(struct dinode));
3090 return;
3091 }
3092 /*
3093 * If no dependencies, then there is nothing to roll back.
3094 */
3095 inodedep->id_savedsize = dp->di_size;
3096 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3097 return;
3098 /*
3099 * Set the dependencies to busy.
3100 */
3101 ACQUIRE_LOCK(&lk);
3102 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3103 adp = TAILQ_NEXT(adp, ad_next)) {
3104#ifdef DIAGNOSTIC
3105 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3106 FREE_LOCK(&lk);
3107 panic("softdep_write_inodeblock: lbn order");
3108 }
3109 prevlbn = adp->ad_lbn;
3110 if (adp->ad_lbn < NDADDR &&
3111 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3112 FREE_LOCK(&lk);
3113 panic("%s: direct pointer #%ld mismatch %d != %d",
3114 "softdep_write_inodeblock", adp->ad_lbn,
3115 dp->di_db[adp->ad_lbn], adp->ad_newblkno);
3116 }
3117 if (adp->ad_lbn >= NDADDR &&
3118 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3119 FREE_LOCK(&lk);
3120 panic("%s: indirect pointer #%ld mismatch %d != %d",
3121 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3122 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
3123 }
3124 deplist |= 1 << adp->ad_lbn;
3125 if ((adp->ad_state & ATTACHED) == 0) {
3126 FREE_LOCK(&lk);
3127 panic("softdep_write_inodeblock: Unknown state 0x%x",
3128 adp->ad_state);
3129 }
3130#endif /* DIAGNOSTIC */
3131 adp->ad_state &= ~ATTACHED;
3132 adp->ad_state |= UNDONE;
3133 }
3134 /*
3135 * The on-disk inode cannot claim to be any larger than the last
3136 * fragment that has been written. Otherwise, the on-disk inode
3137 * might have fragments that were not the last block in the file
3138 * which would corrupt the filesystem.
3139 */
3140 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3141 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3142 if (adp->ad_lbn >= NDADDR)
3143 break;
3144 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3145 /* keep going until hitting a rollback to a frag */
3146 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3147 continue;
3148 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3149 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3150#ifdef DIAGNOSTIC
3151 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3152 FREE_LOCK(&lk);
3153 panic("softdep_write_inodeblock: lost dep1");
3154 }
3155#endif /* DIAGNOSTIC */
3156 dp->di_db[i] = 0;
3157 }
3158 for (i = 0; i < NIADDR; i++) {
3159#ifdef DIAGNOSTIC
3160 if (dp->di_ib[i] != 0 &&
3161 (deplist & ((1 << NDADDR) << i)) == 0) {
3162 FREE_LOCK(&lk);
3163 panic("softdep_write_inodeblock: lost dep2");
3164 }
3165#endif /* DIAGNOSTIC */
3166 dp->di_ib[i] = 0;
3167 }
3168 FREE_LOCK(&lk);
3169 return;
3170 }
3171 /*
3172 * If we have zero'ed out the last allocated block of the file,
3173 * roll back the size to the last currently allocated block.
3174	 * We know that this last allocated block is full-sized, as
3175 * we already checked for fragments in the loop above.
3176 */
3177 if (lastadp != NULL &&
3178 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3179 for (i = lastadp->ad_lbn; i >= 0; i--)
3180 if (dp->di_db[i] != 0)
3181 break;
3182 dp->di_size = (i + 1) * fs->fs_bsize;
3183 }
3184 /*
3185 * The only dependencies are for indirect blocks.
3186 *
3187 * The file size for indirect block additions is not guaranteed.
3188 * Such a guarantee would be non-trivial to achieve. The conventional
3189 * synchronous write implementation also does not make this guarantee.
3190 * Fsck should catch and fix discrepancies. Arguably, the file size
3191 * can be over-estimated without destroying integrity when the file
3192 * moves into the indirect blocks (i.e., is large). If we want to
3193 * postpone fsck, we are stuck with this argument.
3194 */
3195 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3196 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3197 FREE_LOCK(&lk);
3198}
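
A worked example of the size rollback performed above (dp->di_size = fs_bsize * ad_lbn + ad_oldsize), using arbitrary example values.

#include <stdio.h>

int
main(void)
{
	long fs_bsize = 8192;	/* example block size */
	long ad_lbn = 3;	/* first rolled-back direct block */
	long ad_oldsize = 2048;	/* the old allocation there was a fragment */

	/* The inode may not claim anything beyond the last safe fragment. */
	printf("rolled-back di_size = %ld\n",
	    fs_bsize * ad_lbn + ad_oldsize);	/* 26624 */
	return (0);
}
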
3199
3200/*
3201 * This routine is called during the completion interrupt
3202 * service routine for a disk write (from the procedure called
3203 * by the device driver to inform the file system caches of
3204 * a request completion). It should be called early in this
3205 * procedure, before the block is made available to other
3206 * processes or other routines are called.
3207 */
3208static void
3209softdep_disk_write_complete(bp)
3210 struct buf *bp; /* describes the completed disk write */
3211{
3212 struct worklist *wk;
3213 struct workhead reattach;
3214 struct newblk *newblk;
3215 struct allocindir *aip;
3216 struct allocdirect *adp;
3217 struct indirdep *indirdep;
3218 struct inodedep *inodedep;
3219 struct bmsafemap *bmsafemap;
3220
3221#ifdef DEBUG
3222 if (lk.lkt_held != -1)
3223 panic("softdep_disk_write_complete: lock is held");
3224 lk.lkt_held = -2;
3225#endif
3226 LIST_INIT(&reattach);
3227 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3228 WORKLIST_REMOVE(wk);
3229 switch (wk->wk_type) {
3230
3231 case D_PAGEDEP:
3232 if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3233 WORKLIST_INSERT(&reattach, wk);
3234 continue;
3235
3236 case D_INODEDEP:
3237 if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3238 WORKLIST_INSERT(&reattach, wk);
3239 continue;
3240
3241 case D_BMSAFEMAP:
3242 bmsafemap = WK_BMSAFEMAP(wk);
3243 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3244 newblk->nb_state |= DEPCOMPLETE;
3245 newblk->nb_bmsafemap = NULL;
3246 LIST_REMOVE(newblk, nb_deps);
3247 }
3248 while ((adp =
3249 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3250 adp->ad_state |= DEPCOMPLETE;
3251 adp->ad_buf = NULL;
3252 LIST_REMOVE(adp, ad_deps);
3253 handle_allocdirect_partdone(adp);
3254 }
3255 while ((aip =
3256 LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3257 aip->ai_state |= DEPCOMPLETE;
3258 aip->ai_buf = NULL;
3259 LIST_REMOVE(aip, ai_deps);
3260 handle_allocindir_partdone(aip);
3261 }
3262 while ((inodedep =
3263 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3264 inodedep->id_state |= DEPCOMPLETE;
3265 LIST_REMOVE(inodedep, id_deps);
3266 inodedep->id_buf = NULL;
3267 }
3268 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3269 continue;
3270
3271 case D_MKDIR:
3272 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3273 continue;
3274
3275 case D_ALLOCDIRECT:
3276 adp = WK_ALLOCDIRECT(wk);
3277 adp->ad_state |= COMPLETE;
3278 handle_allocdirect_partdone(adp);
3279 continue;
3280
3281 case D_ALLOCINDIR:
3282 aip = WK_ALLOCINDIR(wk);
3283 aip->ai_state |= COMPLETE;
3284 handle_allocindir_partdone(aip);
3285 continue;
3286
3287 case D_INDIRDEP:
3288 indirdep = WK_INDIRDEP(wk);
3289 if (indirdep->ir_state & GOINGAWAY) {
3290 lk.lkt_held = -1;
3291 panic("disk_write_complete: indirdep gone");
3292 }
3293 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3294 FREE(indirdep->ir_saveddata, M_INDIRDEP);
3295 indirdep->ir_saveddata = 0;
3296 indirdep->ir_state &= ~UNDONE;
3297 indirdep->ir_state |= ATTACHED;
3298 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3299 handle_allocindir_partdone(aip);
3300 if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
3301 lk.lkt_held = -1;
3302 panic("disk_write_complete: not gone");
3303 }
3304 }
3305 WORKLIST_INSERT(&reattach, wk);
3306 if ((bp->b_flags & B_DELWRI) == 0)
3307 stat_indir_blk_ptrs++;
3308 bdirty(bp);
3309 continue;
3310
3311 default:
3312 lk.lkt_held = -1;
3313 panic("handle_disk_write_complete: Unknown type %s",
3314 TYPENAME(wk->wk_type));
3315 /* NOTREACHED */
3316 }
3317 }
3318 /*
3319 * Reattach any requests that must be redone.
3320 */
3321 while ((wk = LIST_FIRST(&reattach)) != NULL) {
3322 WORKLIST_REMOVE(wk);
3323 WORKLIST_INSERT(&bp->b_dep, wk);
3324 }
3325#ifdef DEBUG
3326 if (lk.lkt_held != -2)
3327 panic("softdep_disk_write_complete: lock lost");
3328 lk.lkt_held = -1;
3329#endif
3330}
3331
3332/*
3333 * Called from within softdep_disk_write_complete above. Note that
3334 * this routine is always called from interrupt level with further
3335 * splbio interrupts blocked.
3336 */
3337static void
3338handle_allocdirect_partdone(adp)
3339 struct allocdirect *adp; /* the completed allocdirect */
3340{
3341 struct allocdirect *listadp;
3342 struct inodedep *inodedep;
3343 long bsize, delay;
3344
3345 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3346 return;
3347 if (adp->ad_buf != NULL) {
3348 lk.lkt_held = -1;
3349 panic("handle_allocdirect_partdone: dangling dep");
3350 }
3351 /*
3352 * The on-disk inode cannot claim to be any larger than the last
3353 * fragment that has been written. Otherwise, the on-disk inode
3354 * might have fragments that were not the last block in the file
3355 * which would corrupt the filesystem. Thus, we cannot free any
3356 * allocdirects after one whose ad_oldblkno claims a fragment as
3357 * these blocks must be rolled back to zero before writing the inode.
3358 * We check the currently active set of allocdirects in id_inoupdt.
3359 */
3360 inodedep = adp->ad_inodedep;
3361 bsize = inodedep->id_fs->fs_bsize;
3362 TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3363 /* found our block */
3364 if (listadp == adp)
3365 break;
3366 /* continue if ad_oldlbn is not a fragment */
3367 if (listadp->ad_oldsize == 0 ||
3368 listadp->ad_oldsize == bsize)
3369 continue;
3370 /* hit a fragment */
3371 return;
3372 }
3373 /*
3374 * If we have reached the end of the current list without
3375 * finding the just finished dependency, then it must be
3376 * on the future dependency list. Future dependencies cannot
3377 * be freed until they are moved to the current list.
3378 */
3379 if (listadp == NULL) {
3380#ifdef DEBUG
3381 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3382 /* found our block */
3383 if (listadp == adp)
3384 break;
3385 if (listadp == NULL) {
3386 lk.lkt_held = -1;
3387 panic("handle_allocdirect_partdone: lost dep");
3388 }
3389#endif /* DEBUG */
3390 return;
3391 }
3392 /*
3393 * If we have found the just finished dependency, then free
3394 * it along with anything that follows it that is complete.
3395 * If the inode still has a bitmap dependency, then it has
3396 * never been written to disk, hence the on-disk inode cannot
3397 * reference the old fragment so we can free it without delay.
3398 */
3399 delay = (inodedep->id_state & DEPCOMPLETE);
3400 for (; adp; adp = listadp) {
3401 listadp = TAILQ_NEXT(adp, ad_next);
3402 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3403 return;
3404 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
3405 }
3406}
3407
3408/*
3409 * Called from within softdep_disk_write_complete above. Note that
3410 * this routine is always called from interrupt level with further
3411 * splbio interrupts blocked.
3412 */
3413static void
3414handle_allocindir_partdone(aip)
3415 struct allocindir *aip; /* the completed allocindir */
3416{
3417 struct indirdep *indirdep;
3418
3419 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3420 return;
3421 if (aip->ai_buf != NULL) {
3422 lk.lkt_held = -1;
3423 panic("handle_allocindir_partdone: dangling dependency");
3424 }
3425 indirdep = aip->ai_indirdep;
3426 if (indirdep->ir_state & UNDONE) {
3427 LIST_REMOVE(aip, ai_next);
3428 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3429 return;
3430 }
3431 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3432 aip->ai_newblkno;
3433 LIST_REMOVE(aip, ai_next);
3434 if (aip->ai_freefrag != NULL)
3435 add_to_worklist(&aip->ai_freefrag->ff_list);
3436 WORKITEM_FREE(aip, D_ALLOCINDIR);
3437}
3438
3439/*
3440 * Called from within softdep_disk_write_complete above to restore
3441 * in-memory inode block contents to their most up-to-date state. Note
3442 * that this routine is always called from interrupt level with further
3443 * splbio interrupts blocked.
3444 */
3445static int
3446handle_written_inodeblock(inodedep, bp)
3447 struct inodedep *inodedep;
3448 struct buf *bp; /* buffer containing the inode block */
3449{
3450 struct worklist *wk, *filefree;
3451 struct allocdirect *adp, *nextadp;
3452 struct dinode *dp;
3453 int hadchanges;
3454
3455 if ((inodedep->id_state & IOSTARTED) == 0) {
3456 lk.lkt_held = -1;
3457 panic("handle_written_inodeblock: not started");
3458 }
3459 inodedep->id_state &= ~IOSTARTED;
3460 inodedep->id_state |= COMPLETE;
3461 dp = (struct dinode *)bp->b_data +
3462 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3463 /*
3464 * If we had to rollback the inode allocation because of
3465 * bitmaps being incomplete, then simply restore it.
3466 * Keep the block dirty so that it will not be reclaimed until
3467 * all associated dependencies have been cleared and the
3468 * corresponding updates written to disk.
3469 */
3470 if (inodedep->id_savedino != NULL) {
3471 *dp = *inodedep->id_savedino;
3472 FREE(inodedep->id_savedino, M_INODEDEP);
3473 inodedep->id_savedino = NULL;
3474 if ((bp->b_flags & B_DELWRI) == 0)
3475 stat_inode_bitmap++;
3476 bdirty(bp);
3477 return (1);
3478 }
3479 /*
3480 * Roll forward anything that had to be rolled back before
3481 * the inode could be updated.
3482 */
3483 hadchanges = 0;
3484 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3485 nextadp = TAILQ_NEXT(adp, ad_next);
3486 if (adp->ad_state & ATTACHED) {
3487 lk.lkt_held = -1;
3488 panic("handle_written_inodeblock: new entry");
3489 }
3490 if (adp->ad_lbn < NDADDR) {
3491 if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
3492 lk.lkt_held = -1;
3493 panic("%s: %s #%ld mismatch %d != %d",
3494 "handle_written_inodeblock",
3495 "direct pointer", adp->ad_lbn,
3496 dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3497 }
3498 dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3499 } else {
3500 if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
3501 lk.lkt_held = -1;
3502 panic("%s: %s #%ld allocated as %d",
3503 "handle_written_inodeblock",
3504 "indirect pointer", adp->ad_lbn - NDADDR,
3505 dp->di_ib[adp->ad_lbn - NDADDR]);
3506 }
3507 dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3508 }
3509 adp->ad_state &= ~UNDONE;
3510 adp->ad_state |= ATTACHED;
3511 hadchanges = 1;
3512 }
3513 if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3514 stat_direct_blk_ptrs++;
3515 /*
3516 * Reset the file size to its most up-to-date value.
3517 */
3518 if (inodedep->id_savedsize == -1) {
3519 lk.lkt_held = -1;
3520 panic("handle_written_inodeblock: bad size");
3521 }
3522 if (dp->di_size != inodedep->id_savedsize) {
3523 dp->di_size = inodedep->id_savedsize;
3524 hadchanges = 1;
3525 }
3526 inodedep->id_savedsize = -1;
3527 /*
3528 * If there were any rollbacks in the inode block, then it must be
3529	 * marked dirty so that it will eventually get written back in
3530 * its correct form.
3531 */
3532 if (hadchanges)
3533 bdirty(bp);
3534 /*
3535 * Process any allocdirects that completed during the update.
3536 */
3537 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3538 handle_allocdirect_partdone(adp);
3539 /*
3540 * Process deallocations that were held pending until the
3541 * inode had been written to disk. Freeing of the inode
3542 * is delayed until after all blocks have been freed to
3543 * avoid creation of new <vfsid, inum, lbn> triples
3544 * before the old ones have been deleted.
3545 */
3546 filefree = NULL;
3547 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3548 WORKLIST_REMOVE(wk);
3549 switch (wk->wk_type) {
3550
3551 case D_FREEFILE:
3552 /*
3553 * We defer adding filefree to the worklist until
3554 * all other additions have been made to ensure
3555 * that it will be done after all the old blocks
3556 * have been freed.
3557 */
3558 if (filefree != NULL) {
3559 lk.lkt_held = -1;
3560 panic("handle_written_inodeblock: filefree");
3561 }
3562 filefree = wk;
3563 continue;
3564
3565 case D_MKDIR:
3566 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3567 continue;
3568
3569 case D_DIRADD:
3570 diradd_inode_written(WK_DIRADD(wk), inodedep);
3571 continue;
3572
3573 case D_FREEBLKS:
3574 case D_FREEFRAG:
3575 case D_DIRREM:
3576 add_to_worklist(wk);
3577 continue;
3578
3579 default:
3580 lk.lkt_held = -1;
3581 panic("handle_written_inodeblock: Unknown type %s",
3582 TYPENAME(wk->wk_type));
3583 /* NOTREACHED */
3584 }
3585 }
3586 if (filefree != NULL) {
3587 if (free_inodedep(inodedep) == 0) {
3588 lk.lkt_held = -1;
3589 panic("handle_written_inodeblock: live inodedep");
3590 }
3591 add_to_worklist(filefree);
3592 return (0);
3593 }
3594
3595 /*
3596 * If no outstanding dependencies, free it.
3597 */
3598 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3599 return (0);
3600 return (hadchanges);
3601}
3602
3603/*
3604 * Process a diradd entry after its dependent inode has been written.
3605 * This routine must be called with splbio interrupts blocked.
3606 */
3607static void
3608diradd_inode_written(dap, inodedep)
3609 struct diradd *dap;
3610 struct inodedep *inodedep;
3611{
3612 struct pagedep *pagedep;
3613
3614 dap->da_state |= COMPLETE;
3615 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3616 if (dap->da_state & DIRCHG)
3617 pagedep = dap->da_previous->dm_pagedep;
3618 else
3619 pagedep = dap->da_pagedep;
3620 LIST_REMOVE(dap, da_pdlist);
3621 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3622 }
3623 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3624}
3625
3626/*
3627 * Handle the completion of a mkdir dependency.
3628 */
3629static void
3630handle_written_mkdir(mkdir, type)
3631 struct mkdir *mkdir;
3632 int type;
3633{
3634 struct diradd *dap;
3635 struct pagedep *pagedep;
3636
3637 if (mkdir->md_state != type) {
3638 lk.lkt_held = -1;
3639 panic("handle_written_mkdir: bad type");
3640 }
3641 dap = mkdir->md_diradd;
3642 dap->da_state &= ~type;
3643 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3644 dap->da_state |= DEPCOMPLETE;
3645 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3646 if (dap->da_state & DIRCHG)
3647 pagedep = dap->da_previous->dm_pagedep;
3648 else
3649 pagedep = dap->da_pagedep;
3650 LIST_REMOVE(dap, da_pdlist);
3651 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3652 }
3653 LIST_REMOVE(mkdir, md_mkdirs);
3654 WORKITEM_FREE(mkdir, D_MKDIR);
3655}
3656
3657/*
3658 * Called from within softdep_disk_write_complete above.
3659 * A write operation was just completed. Removed inodes can
3660 * now be freed and associated block pointers may be committed.
3661 * Note that this routine is always called from interrupt level
3662 * with further splbio interrupts blocked.
3663 */
3664static int
3665handle_written_filepage(pagedep, bp)
3666 struct pagedep *pagedep;
3667 struct buf *bp; /* buffer containing the written page */
3668{
3669 struct dirrem *dirrem;
3670 struct diradd *dap, *nextdap;
3671 struct direct *ep;
3672 int i, chgs;
3673
3674 if ((pagedep->pd_state & IOSTARTED) == 0) {
3675 lk.lkt_held = -1;
3676 panic("handle_written_filepage: not started");
3677 }
3678 pagedep->pd_state &= ~IOSTARTED;
3679 /*
3680 * Process any directory removals that have been committed.
3681 */
3682 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3683 LIST_REMOVE(dirrem, dm_next);
3684 dirrem->dm_dirinum = pagedep->pd_ino;
3685 add_to_worklist(&dirrem->dm_list);
3686 }
3687 /*
3688 * Free any directory additions that have been committed.
3689 */
3690 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3691 free_diradd(dap);
3692 /*
3693 * Uncommitted directory entries must be restored.
3694 */
3695 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3696 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3697 dap = nextdap) {
3698 nextdap = LIST_NEXT(dap, da_pdlist);
3699 if (dap->da_state & ATTACHED) {
3700 lk.lkt_held = -1;
3701 panic("handle_written_filepage: attached");
3702 }
3703 ep = (struct direct *)
3704 ((char *)bp->b_data + dap->da_offset);
3705 ep->d_ino = dap->da_newinum;
3706 dap->da_state &= ~UNDONE;
3707 dap->da_state |= ATTACHED;
3708 chgs = 1;
3709 /*
3710 * If the inode referenced by the directory has
3711 * been written out, then the dependency can be
3712 * moved to the pending list.
3713 */
3714 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3715 LIST_REMOVE(dap, da_pdlist);
3716 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3717 da_pdlist);
3718 }
3719 }
3720 }
3721 /*
3722 * If there were any rollbacks in the directory, then it must be
3723 * marked dirty so that it will eventually get written back in
3724 * its correct form.
3725 */
3726 if (chgs) {
3727 if ((bp->b_flags & B_DELWRI) == 0)
3728 stat_dir_entry++;
3729 bdirty(bp);
3730 }
3731 /*
3732 * If no dependencies remain, the pagedep will be freed.
3733 * Otherwise it will remain to update the page before it
3734 * is written back to disk.
3735 */
3736 if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3737 for (i = 0; i < DAHASHSZ; i++)
3738 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3739 break;
3740 if (i == DAHASHSZ) {
3741 LIST_REMOVE(pagedep, pd_hash);
3742 WORKITEM_FREE(pagedep, D_PAGEDEP);
3743 return (0);
3744 }
3745 }
3746 return (1);
3747}
3748
3749/*
3750 * Writing back in-core inode structures.
3751 *
3752 * The file system only accesses an inode's contents when it occupies an
3753 * "in-core" inode structure. These "in-core" structures are separate from
3754 * the page frames used to cache inode blocks. Only the latter are
3755 * transferred to/from the disk. So, when the updated contents of the
3756 * "in-core" inode structure are copied to the corresponding in-memory inode
3757 * block, the dependencies are also transferred. The following procedure is
3758 * called when copying a dirty "in-core" inode to a cached inode block.
3759 */
3760
3761/*
3762 * Called when an inode is loaded from disk. If the effective link count
3763 * differed from the actual link count when it was last flushed, then we
3764 * need to ensure that the correct effective link count is put back.
3765 */
3766void
3767softdep_load_inodeblock(ip)
3768 struct inode *ip; /* the "in_core" copy of the inode */
3769{
3770 struct inodedep *inodedep;
3771
3772 /*
3773 * Check for alternate nlink count.
3774 */
3775 ip->i_effnlink = ip->i_nlink;
3776 ACQUIRE_LOCK(&lk);
3777 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3778 FREE_LOCK(&lk);
3779 return;
3780 }
3781 ip->i_effnlink -= inodedep->id_nlinkdelta;
2963 if ((error = ffs_freefile(&tip, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2964 softdep_error("handle_workitem_freefile", error);
2965 WORKITEM_FREE(freefile, D_FREEFILE);
2966}
2967
2968/*
2969 * Disk writes.
2970 *
2971 * The dependency structures constructed above are most actively used when file
2972 * system blocks are written to disk. No constraints are placed on when a
2973 * block can be written, but unsatisfied update dependencies are made safe by
2974 * modifying (or replacing) the source memory for the duration of the disk
2975 * write. When the disk write completes, the memory block is again brought
2976 * up-to-date.
2977 *
2978 * In-core inode structure reclamation.
2979 *
2980 * Because there are a finite number of "in-core" inode structures, they are
2981 * reused regularly. By transferring all inode-related dependencies to the
2982 * in-memory inode block and indexing them separately (via "inodedep"s), we
2983 * can allow "in-core" inode structures to be reused at any time and avoid
2984 * any increase in contention.
2985 *
2986 * Called just before entering the device driver to initiate a new disk I/O.
2987 * The buffer must be locked; thus, no I/O completion operations can occur
2988 * while we are manipulating its associated dependencies.
2989 */
2990static void
2991softdep_disk_io_initiation(bp)
2992 struct buf *bp; /* structure describing disk write to occur */
2993{
2994 struct worklist *wk, *nextwk;
2995 struct indirdep *indirdep;
2996
2997 /*
2998 * We only care about write operations. There should never
2999 * be dependencies for reads.
3000 */
3001 if (bp->b_iocmd == BIO_READ)
3002 panic("softdep_disk_io_initiation: read");
3003 /*
3004 * Do any necessary pre-I/O processing.
3005 */
3006 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3007 nextwk = LIST_NEXT(wk, wk_list);
3008 switch (wk->wk_type) {
3009
3010 case D_PAGEDEP:
3011 initiate_write_filepage(WK_PAGEDEP(wk), bp);
3012 continue;
3013
3014 case D_INODEDEP:
3015 initiate_write_inodeblock(WK_INODEDEP(wk), bp);
3016 continue;
3017
3018 case D_INDIRDEP:
3019 indirdep = WK_INDIRDEP(wk);
3020 if (indirdep->ir_state & GOINGAWAY)
3021 panic("disk_io_initiation: indirdep gone");
3022 /*
3023 * If there are no remaining dependencies, this
3024 * will be writing the real pointers, so the
3025 * dependency can be freed.
3026 */
3027 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3028 indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
3029 brelse(indirdep->ir_savebp);
3030 /* inline expand WORKLIST_REMOVE(wk); */
3031 wk->wk_state &= ~ONWORKLIST;
3032 LIST_REMOVE(wk, wk_list);
3033 WORKITEM_FREE(indirdep, D_INDIRDEP);
3034 continue;
3035 }
3036 /*
3037 * Replace up-to-date version with safe version.
3038 */
3039 MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3040 M_INDIRDEP, M_SOFTDEP_FLAGS);
3041 ACQUIRE_LOCK(&lk);
3042 indirdep->ir_state &= ~ATTACHED;
3043 indirdep->ir_state |= UNDONE;
3044 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3045 bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3046 bp->b_bcount);
3047 FREE_LOCK(&lk);
3048 continue;
3049
3050 case D_MKDIR:
3051 case D_BMSAFEMAP:
3052 case D_ALLOCDIRECT:
3053 case D_ALLOCINDIR:
3054 continue;
3055
3056 default:
3057 panic("handle_disk_io_initiation: Unexpected type %s",
3058 TYPENAME(wk->wk_type));
3059 /* NOTREACHED */
3060 }
3061 }
3062}
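/*
 * Illustrative sketch (hypothetical names, not kernel code): the D_INDIRDEP
 * case above makes an indirect block safe to write by saving its up-to-date
 * contents and substituting the last-written copy for the duration of the
 * I/O; softdep_disk_write_complete() restores the saved data afterwards.
 * Reduced to plain libc calls, the swap looks roughly like this:
 */
#include <stdlib.h>
#include <string.h>

struct blockdep {		/* stand-in for struct indirdep */
	char	*saveddata;	/* up-to-date contents, parked during the write */
	char	*safecopy;	/* contents as of the last completed write */
	size_t	 size;
};

/* Before the write: park the current data and expose the safe copy. */
static void
example_write_initiate(struct blockdep *dep, char *bufdata)
{
	dep->saveddata = malloc(dep->size);
	memcpy(dep->saveddata, bufdata, dep->size);
	memcpy(bufdata, dep->safecopy, dep->size);
}

/* After the write: bring the buffer back up to date; caller redirties it. */
static void
example_write_complete(struct blockdep *dep, char *bufdata)
{
	memcpy(bufdata, dep->saveddata, dep->size);
	free(dep->saveddata);
	dep->saveddata = NULL;
}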
3063
3064/*
3065 * Called from within the procedure above to deal with unsatisfied
3066 * allocation dependencies in a directory. The buffer must be locked,
3067 * thus, no I/O completion operations can occur while we are
3068 * manipulating its associated dependencies.
3069 */
3070static void
3071initiate_write_filepage(pagedep, bp)
3072 struct pagedep *pagedep;
3073 struct buf *bp;
3074{
3075 struct diradd *dap;
3076 struct direct *ep;
3077 int i;
3078
3079 if (pagedep->pd_state & IOSTARTED) {
3080 /*
3081 * This can only happen if there is a driver that does not
3082 * understand chaining. Here biodone will reissue the call
3083 * to strategy for the incomplete buffers.
3084 */
3085 printf("initiate_write_filepage: already started\n");
3086 return;
3087 }
3088 pagedep->pd_state |= IOSTARTED;
3089 ACQUIRE_LOCK(&lk);
3090 for (i = 0; i < DAHASHSZ; i++) {
3091 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3092 ep = (struct direct *)
3093 ((char *)bp->b_data + dap->da_offset);
3094 if (ep->d_ino != dap->da_newinum) {
3095 FREE_LOCK(&lk);
3096 panic("%s: dir inum %d != new %d",
3097 "initiate_write_filepage",
3098 ep->d_ino, dap->da_newinum);
3099 }
3100 if (dap->da_state & DIRCHG)
3101 ep->d_ino = dap->da_previous->dm_oldinum;
3102 else
3103 ep->d_ino = 0;
3104 dap->da_state &= ~ATTACHED;
3105 dap->da_state |= UNDONE;
3106 }
3107 }
3108 FREE_LOCK(&lk);
3109}
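/*
 * The effect of the loop above is that, for the duration of the write, each
 * uncommitted entry is reverted to a safe value: a DIRCHG entry goes back to
 * the inode number it is replacing, and any other new entry is zeroed, so the
 * on-disk directory never names an inode that has not itself reached the
 * disk.  handle_written_filepage() rolls the entries forward again when the
 * write completes.
 */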
3110
3111/*
3112 * Called from within the procedure above to deal with unsatisfied
3113 * allocation dependencies in an inodeblock. The buffer must be
3114 * locked, thus, no I/O completion operations can occur while we
3115 * are manipulating its associated dependencies.
3116 */
3117static void
3118initiate_write_inodeblock(inodedep, bp)
3119 struct inodedep *inodedep;
3120 struct buf *bp; /* The inode block */
3121{
3122 struct allocdirect *adp, *lastadp;
3123 struct dinode *dp;
3124 struct fs *fs;
3125 ufs_lbn_t prevlbn = 0;
3126 int i, deplist;
3127
3128 if (inodedep->id_state & IOSTARTED)
3129 panic("initiate_write_inodeblock: already started");
3130 inodedep->id_state |= IOSTARTED;
3131 fs = inodedep->id_fs;
3132 dp = (struct dinode *)bp->b_data +
3133 ino_to_fsbo(fs, inodedep->id_ino);
3134 /*
3135 * If the bitmap is not yet written, then the allocated
3136 * inode cannot be written to disk.
3137 */
3138 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3139 if (inodedep->id_savedino != NULL)
3140 panic("initiate_write_inodeblock: already doing I/O");
3141 MALLOC(inodedep->id_savedino, struct dinode *,
3142 sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3143 *inodedep->id_savedino = *dp;
3144 bzero((caddr_t)dp, sizeof(struct dinode));
3145 return;
3146 }
3147 /*
3148 * If no dependencies, then there is nothing to roll back.
3149 */
3150 inodedep->id_savedsize = dp->di_size;
3151 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3152 return;
3153 /*
3154 * Set the dependencies to busy.
3155 */
3156 ACQUIRE_LOCK(&lk);
3157 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3158 adp = TAILQ_NEXT(adp, ad_next)) {
3159#ifdef DIAGNOSTIC
3160 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3161 FREE_LOCK(&lk);
3162 panic("softdep_write_inodeblock: lbn order");
3163 }
3164 prevlbn = adp->ad_lbn;
3165 if (adp->ad_lbn < NDADDR &&
3166 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3167 FREE_LOCK(&lk);
3168 panic("%s: direct pointer #%ld mismatch %d != %d",
3169 "softdep_write_inodeblock", adp->ad_lbn,
3170 dp->di_db[adp->ad_lbn], adp->ad_newblkno);
3171 }
3172 if (adp->ad_lbn >= NDADDR &&
3173 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3174 FREE_LOCK(&lk);
3175 panic("%s: indirect pointer #%ld mismatch %d != %d",
3176 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3177 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
3178 }
3179 deplist |= 1 << adp->ad_lbn;
3180 if ((adp->ad_state & ATTACHED) == 0) {
3181 FREE_LOCK(&lk);
3182 panic("softdep_write_inodeblock: Unknown state 0x%x",
3183 adp->ad_state);
3184 }
3185#endif /* DIAGNOSTIC */
3186 adp->ad_state &= ~ATTACHED;
3187 adp->ad_state |= UNDONE;
3188 }
3189 /*
3190 * The on-disk inode cannot claim to be any larger than the last
3191 * fragment that has been written. Otherwise, the on-disk inode
3192 * might have fragments that were not the last block in the file,
3193 * which would corrupt the filesystem.
3194 */
3195 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3196 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3197 if (adp->ad_lbn >= NDADDR)
3198 break;
3199 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3200 /* keep going until hitting a rollback to a frag */
3201 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3202 continue;
3203 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3204 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3205#ifdef DIAGNOSTIC
3206 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3207 FREE_LOCK(&lk);
3208 panic("softdep_write_inodeblock: lost dep1");
3209 }
3210#endif /* DIAGNOSTIC */
3211 dp->di_db[i] = 0;
3212 }
3213 for (i = 0; i < NIADDR; i++) {
3214#ifdef DIAGNOSTIC
3215 if (dp->di_ib[i] != 0 &&
3216 (deplist & ((1 << NDADDR) << i)) == 0) {
3217 FREE_LOCK(&lk);
3218 panic("softdep_write_inodeblock: lost dep2");
3219 }
3220#endif /* DIAGNOSTIC */
3221 dp->di_ib[i] = 0;
3222 }
3223 FREE_LOCK(&lk);
3224 return;
3225 }
3226 /*
3227 * If we have zero'ed out the last allocated block of the file,
3228 * roll back the size to the last currently allocated block.
3229 * We know that this last allocated block is full-sized, as
3230 * we already checked for fragments in the loop above.
3231 */
3232 if (lastadp != NULL &&
3233 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3234 for (i = lastadp->ad_lbn; i >= 0; i--)
3235 if (dp->di_db[i] != 0)
3236 break;
3237 dp->di_size = (i + 1) * fs->fs_bsize;
3238 }
3239 /*
3240 * The only dependencies are for indirect blocks.
3241 *
3242 * The file size for indirect block additions is not guaranteed.
3243 * Such a guarantee would be non-trivial to achieve. The conventional
3244 * synchronous write implementation also does not make this guarantee.
3245 * Fsck should catch and fix discrepancies. Arguably, the file size
3246 * can be over-estimated without destroying integrity when the file
3247 * moves into the indirect blocks (i.e., is large). If we want to
3248 * postpone fsck, we are stuck with this argument.
3249 */
3250 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3251 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3252 FREE_LOCK(&lk);
3253}
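/*
 * Two details of the rollback above are easy to miss.  The DIAGNOSTIC
 * "deplist" bitmask records which pointers have pending dependencies:
 * direct block lbn n sets bit n, while indirect pointer i (ad_lbn =
 * NDADDR + i) sets bit NDADDR + i, hence the ((1 << NDADDR) << i) test.
 * The size rollback at a fragment sets di_size to
 * fs_bsize * ad_lbn + ad_oldsize, i.e. all full blocks preceding the
 * fragment plus the old fragment itself, so the on-disk inode never claims
 * space whose blocks have not yet been written.
 */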
3254
3255/*
3256 * This routine is called during the completion interrupt
3257 * service routine for a disk write (from the procedure called
3258 * by the device driver to inform the file system caches of
3259 * a request completion). It should be called early in this
3260 * procedure, before the block is made available to other
3261 * processes or other routines are called.
3262 */
3263static void
3264softdep_disk_write_complete(bp)
3265 struct buf *bp; /* describes the completed disk write */
3266{
3267 struct worklist *wk;
3268 struct workhead reattach;
3269 struct newblk *newblk;
3270 struct allocindir *aip;
3271 struct allocdirect *adp;
3272 struct indirdep *indirdep;
3273 struct inodedep *inodedep;
3274 struct bmsafemap *bmsafemap;
3275
3276#ifdef DEBUG
3277 if (lk.lkt_held != -1)
3278 panic("softdep_disk_write_complete: lock is held");
3279 lk.lkt_held = -2;
3280#endif
3281 LIST_INIT(&reattach);
3282 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3283 WORKLIST_REMOVE(wk);
3284 switch (wk->wk_type) {
3285
3286 case D_PAGEDEP:
3287 if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3288 WORKLIST_INSERT(&reattach, wk);
3289 continue;
3290
3291 case D_INODEDEP:
3292 if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3293 WORKLIST_INSERT(&reattach, wk);
3294 continue;
3295
3296 case D_BMSAFEMAP:
3297 bmsafemap = WK_BMSAFEMAP(wk);
3298 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3299 newblk->nb_state |= DEPCOMPLETE;
3300 newblk->nb_bmsafemap = NULL;
3301 LIST_REMOVE(newblk, nb_deps);
3302 }
3303 while ((adp =
3304 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3305 adp->ad_state |= DEPCOMPLETE;
3306 adp->ad_buf = NULL;
3307 LIST_REMOVE(adp, ad_deps);
3308 handle_allocdirect_partdone(adp);
3309 }
3310 while ((aip =
3311 LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3312 aip->ai_state |= DEPCOMPLETE;
3313 aip->ai_buf = NULL;
3314 LIST_REMOVE(aip, ai_deps);
3315 handle_allocindir_partdone(aip);
3316 }
3317 while ((inodedep =
3318 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3319 inodedep->id_state |= DEPCOMPLETE;
3320 LIST_REMOVE(inodedep, id_deps);
3321 inodedep->id_buf = NULL;
3322 }
3323 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3324 continue;
3325
3326 case D_MKDIR:
3327 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3328 continue;
3329
3330 case D_ALLOCDIRECT:
3331 adp = WK_ALLOCDIRECT(wk);
3332 adp->ad_state |= COMPLETE;
3333 handle_allocdirect_partdone(adp);
3334 continue;
3335
3336 case D_ALLOCINDIR:
3337 aip = WK_ALLOCINDIR(wk);
3338 aip->ai_state |= COMPLETE;
3339 handle_allocindir_partdone(aip);
3340 continue;
3341
3342 case D_INDIRDEP:
3343 indirdep = WK_INDIRDEP(wk);
3344 if (indirdep->ir_state & GOINGAWAY) {
3345 lk.lkt_held = -1;
3346 panic("disk_write_complete: indirdep gone");
3347 }
3348 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3349 FREE(indirdep->ir_saveddata, M_INDIRDEP);
3350 indirdep->ir_saveddata = 0;
3351 indirdep->ir_state &= ~UNDONE;
3352 indirdep->ir_state |= ATTACHED;
3353 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3354 handle_allocindir_partdone(aip);
3355 if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
3356 lk.lkt_held = -1;
3357 panic("disk_write_complete: not gone");
3358 }
3359 }
3360 WORKLIST_INSERT(&reattach, wk);
3361 if ((bp->b_flags & B_DELWRI) == 0)
3362 stat_indir_blk_ptrs++;
3363 bdirty(bp);
3364 continue;
3365
3366 default:
3367 lk.lkt_held = -1;
3368 panic("handle_disk_write_complete: Unknown type %s",
3369 TYPENAME(wk->wk_type));
3370 /* NOTREACHED */
3371 }
3372 }
3373 /*
3374 * Reattach any requests that must be redone.
3375 */
3376 while ((wk = LIST_FIRST(&reattach)) != NULL) {
3377 WORKLIST_REMOVE(wk);
3378 WORKLIST_INSERT(&bp->b_dep, wk);
3379 }
3380#ifdef DEBUG
3381 if (lk.lkt_held != -2)
3382 panic("softdep_disk_write_complete: lock lost");
3383 lk.lkt_held = -1;
3384#endif
3385}
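/*
 * The "reattach" list above collects every worklist entry whose handler
 * reported outstanding work (a nonzero return from the filepage or
 * inodeblock handlers, or an indirdep that was rolled forward); hanging
 * them back on bp->b_dep ensures they are processed again the next time
 * the buffer is written.  Entries whose work is complete have already been
 * freed by their handlers and simply drop off the list.
 */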
3386
3387/*
3388 * Called from within softdep_disk_write_complete above. Note that
3389 * this routine is always called from interrupt level with further
3390 * splbio interrupts blocked.
3391 */
3392static void
3393handle_allocdirect_partdone(adp)
3394 struct allocdirect *adp; /* the completed allocdirect */
3395{
3396 struct allocdirect *listadp;
3397 struct inodedep *inodedep;
3398 long bsize, delay;
3399
3400 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3401 return;
3402 if (adp->ad_buf != NULL) {
3403 lk.lkt_held = -1;
3404 panic("handle_allocdirect_partdone: dangling dep");
3405 }
3406 /*
3407 * The on-disk inode cannot claim to be any larger than the last
3408 * fragment that has been written. Otherwise, the on-disk inode
3409 * might have fragments that were not the last block in the file,
3410 * which would corrupt the filesystem. Thus, we cannot free any
3411 * allocdirects after one whose ad_oldblkno claims a fragment as
3412 * these blocks must be rolled back to zero before writing the inode.
3413 * We check the currently active set of allocdirects in id_inoupdt.
3414 */
3415 inodedep = adp->ad_inodedep;
3416 bsize = inodedep->id_fs->fs_bsize;
3417 TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3418 /* found our block */
3419 if (listadp == adp)
3420 break;
3421 /* continue if ad_oldlbn is not a fragment */
3422 if (listadp->ad_oldsize == 0 ||
3423 listadp->ad_oldsize == bsize)
3424 continue;
3425 /* hit a fragment */
3426 return;
3427 }
3428 /*
3429 * If we have reached the end of the current list without
3430 * finding the just finished dependency, then it must be
3431 * on the future dependency list. Future dependencies cannot
3432 * be freed until they are moved to the current list.
3433 */
3434 if (listadp == NULL) {
3435#ifdef DEBUG
3436 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3437 /* found our block */
3438 if (listadp == adp)
3439 break;
3440 if (listadp == NULL) {
3441 lk.lkt_held = -1;
3442 panic("handle_allocdirect_partdone: lost dep");
3443 }
3444#endif /* DEBUG */
3445 return;
3446 }
3447 /*
3448 * If we have found the just finished dependency, then free
3449 * it along with anything that follows it that is complete.
3450 * If the inode still has a bitmap dependency, then it has
3451 * never been written to disk, hence the on-disk inode cannot
3452 * reference the old fragment so we can free it without delay.
3453 */
3454 delay = (inodedep->id_state & DEPCOMPLETE);
3455 for (; adp; adp = listadp) {
3456 listadp = TAILQ_NEXT(adp, ad_next);
3457 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3458 return;
3459 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
3460 }
3461}
3462
3463/*
3464 * Called from within softdep_disk_write_complete above. Note that
3465 * this routine is always called from interrupt level with further
3466 * splbio interrupts blocked.
3467 */
3468static void
3469handle_allocindir_partdone(aip)
3470 struct allocindir *aip; /* the completed allocindir */
3471{
3472 struct indirdep *indirdep;
3473
3474 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3475 return;
3476 if (aip->ai_buf != NULL) {
3477 lk.lkt_held = -1;
3478 panic("handle_allocindir_partdone: dangling dependency");
3479 }
3480 indirdep = aip->ai_indirdep;
3481 if (indirdep->ir_state & UNDONE) {
3482 LIST_REMOVE(aip, ai_next);
3483 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3484 return;
3485 }
3486 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3487 aip->ai_newblkno;
3488 LIST_REMOVE(aip, ai_next);
3489 if (aip->ai_freefrag != NULL)
3490 add_to_worklist(&aip->ai_freefrag->ff_list);
3491 WORKITEM_FREE(aip, D_ALLOCINDIR);
3492}
3493
3494/*
3495 * Called from within softdep_disk_write_complete above to restore
3496 * in-memory inode block contents to their most up-to-date state. Note
3497 * that this routine is always called from interrupt level with further
3498 * splbio interrupts blocked.
3499 */
3500static int
3501handle_written_inodeblock(inodedep, bp)
3502 struct inodedep *inodedep;
3503 struct buf *bp; /* buffer containing the inode block */
3504{
3505 struct worklist *wk, *filefree;
3506 struct allocdirect *adp, *nextadp;
3507 struct dinode *dp;
3508 int hadchanges;
3509
3510 if ((inodedep->id_state & IOSTARTED) == 0) {
3511 lk.lkt_held = -1;
3512 panic("handle_written_inodeblock: not started");
3513 }
3514 inodedep->id_state &= ~IOSTARTED;
3515 inodedep->id_state |= COMPLETE;
3516 dp = (struct dinode *)bp->b_data +
3517 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3518 /*
3519 * If we had to roll back the inode allocation because of
3520 * bitmaps being incomplete, then simply restore it.
3521 * Keep the block dirty so that it will not be reclaimed until
3522 * all associated dependencies have been cleared and the
3523 * corresponding updates written to disk.
3524 */
3525 if (inodedep->id_savedino != NULL) {
3526 *dp = *inodedep->id_savedino;
3527 FREE(inodedep->id_savedino, M_INODEDEP);
3528 inodedep->id_savedino = NULL;
3529 if ((bp->b_flags & B_DELWRI) == 0)
3530 stat_inode_bitmap++;
3531 bdirty(bp);
3532 return (1);
3533 }
3534 /*
3535 * Roll forward anything that had to be rolled back before
3536 * the inode could be updated.
3537 */
3538 hadchanges = 0;
3539 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3540 nextadp = TAILQ_NEXT(adp, ad_next);
3541 if (adp->ad_state & ATTACHED) {
3542 lk.lkt_held = -1;
3543 panic("handle_written_inodeblock: new entry");
3544 }
3545 if (adp->ad_lbn < NDADDR) {
3546 if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
3547 lk.lkt_held = -1;
3548 panic("%s: %s #%ld mismatch %d != %d",
3549 "handle_written_inodeblock",
3550 "direct pointer", adp->ad_lbn,
3551 dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3552 }
3553 dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3554 } else {
3555 if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
3556 lk.lkt_held = -1;
3557 panic("%s: %s #%ld allocated as %d",
3558 "handle_written_inodeblock",
3559 "indirect pointer", adp->ad_lbn - NDADDR,
3560 dp->di_ib[adp->ad_lbn - NDADDR]);
3561 }
3562 dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3563 }
3564 adp->ad_state &= ~UNDONE;
3565 adp->ad_state |= ATTACHED;
3566 hadchanges = 1;
3567 }
3568 if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3569 stat_direct_blk_ptrs++;
3570 /*
3571 * Reset the file size to its most up-to-date value.
3572 */
3573 if (inodedep->id_savedsize == -1) {
3574 lk.lkt_held = -1;
3575 panic("handle_written_inodeblock: bad size");
3576 }
3577 if (dp->di_size != inodedep->id_savedsize) {
3578 dp->di_size = inodedep->id_savedsize;
3579 hadchanges = 1;
3580 }
3581 inodedep->id_savedsize = -1;
3582 /*
3583 * If there were any rollbacks in the inode block, then it must be
3584 * marked dirty so that it will eventually get written back in
3585 * its correct form.
3586 */
3587 if (hadchanges)
3588 bdirty(bp);
3589 /*
3590 * Process any allocdirects that completed during the update.
3591 */
3592 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3593 handle_allocdirect_partdone(adp);
3594 /*
3595 * Process deallocations that were held pending until the
3596 * inode had been written to disk. Freeing of the inode
3597 * is delayed until after all blocks have been freed to
3598 * avoid creation of new <vfsid, inum, lbn> triples
3599 * before the old ones have been deleted.
3600 */
3601 filefree = NULL;
3602 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3603 WORKLIST_REMOVE(wk);
3604 switch (wk->wk_type) {
3605
3606 case D_FREEFILE:
3607 /*
3608 * We defer adding filefree to the worklist until
3609 * all other additions have been made to ensure
3610 * that it will be done after all the old blocks
3611 * have been freed.
3612 */
3613 if (filefree != NULL) {
3614 lk.lkt_held = -1;
3615 panic("handle_written_inodeblock: filefree");
3616 }
3617 filefree = wk;
3618 continue;
3619
3620 case D_MKDIR:
3621 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3622 continue;
3623
3624 case D_DIRADD:
3625 diradd_inode_written(WK_DIRADD(wk), inodedep);
3626 continue;
3627
3628 case D_FREEBLKS:
3629 case D_FREEFRAG:
3630 case D_DIRREM:
3631 add_to_worklist(wk);
3632 continue;
3633
3634 default:
3635 lk.lkt_held = -1;
3636 panic("handle_written_inodeblock: Unknown type %s",
3637 TYPENAME(wk->wk_type));
3638 /* NOTREACHED */
3639 }
3640 }
3641 if (filefree != NULL) {
3642 if (free_inodedep(inodedep) == 0) {
3643 lk.lkt_held = -1;
3644 panic("handle_written_inodeblock: live inodedep");
3645 }
3646 add_to_worklist(filefree);
3647 return (0);
3648 }
3649
3650 /*
3651 * If no outstanding dependencies, free it.
3652 */
3653 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3654 return (0);
3655 return (hadchanges);
3656}
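/*
 * Taken together, initiate_write_inodeblock() and the routine above form a
 * rollback/roll-forward pair: block pointers and the file size are reverted
 * to their safe (old) values just before the inode block is written, then
 * restored to their new values here once the write has completed, with the
 * buffer redirtied so the up-to-date copy reaches the disk on a later write.
 */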
3657
3658/*
3659 * Process a diradd entry after its dependent inode has been written.
3660 * This routine must be called with splbio interrupts blocked.
3661 */
3662static void
3663diradd_inode_written(dap, inodedep)
3664 struct diradd *dap;
3665 struct inodedep *inodedep;
3666{
3667 struct pagedep *pagedep;
3668
3669 dap->da_state |= COMPLETE;
3670 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3671 if (dap->da_state & DIRCHG)
3672 pagedep = dap->da_previous->dm_pagedep;
3673 else
3674 pagedep = dap->da_pagedep;
3675 LIST_REMOVE(dap, da_pdlist);
3676 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3677 }
3678 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3679}
3680
3681/*
3682 * Handle the completion of a mkdir dependency.
3683 */
3684static void
3685handle_written_mkdir(mkdir, type)
3686 struct mkdir *mkdir;
3687 int type;
3688{
3689 struct diradd *dap;
3690 struct pagedep *pagedep;
3691
3692 if (mkdir->md_state != type) {
3693 lk.lkt_held = -1;
3694 panic("handle_written_mkdir: bad type");
3695 }
3696 dap = mkdir->md_diradd;
3697 dap->da_state &= ~type;
3698 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3699 dap->da_state |= DEPCOMPLETE;
3700 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3701 if (dap->da_state & DIRCHG)
3702 pagedep = dap->da_previous->dm_pagedep;
3703 else
3704 pagedep = dap->da_pagedep;
3705 LIST_REMOVE(dap, da_pdlist);
3706 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3707 }
3708 LIST_REMOVE(mkdir, md_mkdirs);
3709 WORKITEM_FREE(mkdir, D_MKDIR);
3710}
3711
3712/*
3713 * Called from within softdep_disk_write_complete above.
3714 * A write operation was just completed. Removed inodes can
3715 * now be freed and associated block pointers may be committed.
3716 * Note that this routine is always called from interrupt level
3717 * with further splbio interrupts blocked.
3718 */
3719static int
3720handle_written_filepage(pagedep, bp)
3721 struct pagedep *pagedep;
3722 struct buf *bp; /* buffer containing the written page */
3723{
3724 struct dirrem *dirrem;
3725 struct diradd *dap, *nextdap;
3726 struct direct *ep;
3727 int i, chgs;
3728
3729 if ((pagedep->pd_state & IOSTARTED) == 0) {
3730 lk.lkt_held = -1;
3731 panic("handle_written_filepage: not started");
3732 }
3733 pagedep->pd_state &= ~IOSTARTED;
3734 /*
3735 * Process any directory removals that have been committed.
3736 */
3737 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3738 LIST_REMOVE(dirrem, dm_next);
3739 dirrem->dm_dirinum = pagedep->pd_ino;
3740 add_to_worklist(&dirrem->dm_list);
3741 }
3742 /*
3743 * Free any directory additions that have been committed.
3744 */
3745 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3746 free_diradd(dap);
3747 /*
3748 * Uncommitted directory entries must be restored.
3749 */
3750 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3751 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3752 dap = nextdap) {
3753 nextdap = LIST_NEXT(dap, da_pdlist);
3754 if (dap->da_state & ATTACHED) {
3755 lk.lkt_held = -1;
3756 panic("handle_written_filepage: attached");
3757 }
3758 ep = (struct direct *)
3759 ((char *)bp->b_data + dap->da_offset);
3760 ep->d_ino = dap->da_newinum;
3761 dap->da_state &= ~UNDONE;
3762 dap->da_state |= ATTACHED;
3763 chgs = 1;
3764 /*
3765 * If the inode referenced by the directory has
3766 * been written out, then the dependency can be
3767 * moved to the pending list.
3768 */
3769 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3770 LIST_REMOVE(dap, da_pdlist);
3771 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3772 da_pdlist);
3773 }
3774 }
3775 }
3776 /*
3777 * If there were any rollbacks in the directory, then it must be
3778 * marked dirty so that it will eventually get written back in
3779 * its correct form.
3780 */
3781 if (chgs) {
3782 if ((bp->b_flags & B_DELWRI) == 0)
3783 stat_dir_entry++;
3784 bdirty(bp);
3785 }
3786 /*
3787 * If no dependencies remain, the pagedep will be freed.
3788 * Otherwise it will remain to update the page before it
3789 * is written back to disk.
3790 */
3791 if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3792 for (i = 0; i < DAHASHSZ; i++)
3793 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3794 break;
3795 if (i == DAHASHSZ) {
3796 LIST_REMOVE(pagedep, pd_hash);
3797 WORKITEM_FREE(pagedep, D_PAGEDEP);
3798 return (0);
3799 }
3800 }
3801 return (1);
3802}
3803
3804/*
3805 * Writing back in-core inode structures.
3806 *
3807 * The file system only accesses an inode's contents when it occupies an
3808 * "in-core" inode structure. These "in-core" structures are separate from
3809 * the page frames used to cache inode blocks. Only the latter are
3810 * transferred to/from the disk. So, when the updated contents of the
3811 * "in-core" inode structure are copied to the corresponding in-memory inode
3812 * block, the dependencies are also transferred. The following procedure is
3813 * called when copying a dirty "in-core" inode to a cached inode block.
3814 */
3815
3816/*
3817 * Called when an inode is loaded from disk. If the effective link count
3818 * differed from the actual link count when it was last flushed, then we
3819 * need to ensure that the correct effective link count is put back.
3820 */
3821void
3822softdep_load_inodeblock(ip)
3823 struct inode *ip; /* the "in_core" copy of the inode */
3824{
3825 struct inodedep *inodedep;
3826
3827 /*
3828 * Check for alternate nlink count.
3829 */
3830 ip->i_effnlink = ip->i_nlink;
3831 ACQUIRE_LOCK(&lk);
3832 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3833 FREE_LOCK(&lk);
3834 return;
3835 }
3836 ip->i_effnlink -= inodedep->id_nlinkdelta;
3837 if (inodedep->id_state & SPACECOUNTED)
3838 ip->i_flag |= IN_SPACECOUNTED;
3782 FREE_LOCK(&lk);
3783}
3784
3785/*
3786 * This routine is called just before the "in-core" inode
3787 * information is to be copied to the in-memory inode block.
3788 * Recall that an inode block contains several inodes. If
3789 * the force flag is set, then the dependencies will be
3790 * cleared so that the update can always be made. Note that
3791 * the buffer is locked when this routine is called, so we
3792 * will never be in the middle of writing the inode block
3793 * to disk.
3794 */
3795void
3796softdep_update_inodeblock(ip, bp, waitfor)
3797 struct inode *ip; /* the "in_core" copy of the inode */
3798 struct buf *bp; /* the buffer containing the inode block */
3799 int waitfor; /* nonzero => update must be allowed */
3800{
3801 struct inodedep *inodedep;
3802 struct worklist *wk;
3803 int error, gotit;
3804
3805 /*
3806 * If the effective link count is not equal to the actual link
3807 * count, then we must track the difference in an inodedep while
3808 * the inode is (potentially) tossed out of the cache. Otherwise,
3809 * if there is no existing inodedep, then there are no dependencies
3810 * to track.
3811 */
3812 ACQUIRE_LOCK(&lk);
3813 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3814 FREE_LOCK(&lk);
3815 if (ip->i_effnlink != ip->i_nlink)
3816 panic("softdep_update_inodeblock: bad link count");
3817 return;
3818 }
3819 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
3820 FREE_LOCK(&lk);
3821 panic("softdep_update_inodeblock: bad delta");
3822 }
3823 /*
3824 * Changes have been initiated. Anything depending on these
3825 * changes cannot occur until this inode has been written.
3826 */
3827 inodedep->id_state &= ~COMPLETE;
3828 if ((inodedep->id_state & ONWORKLIST) == 0)
3829 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3830 /*
3831 * Any new dependencies associated with the incore inode must
3832 * now be moved to the list associated with the buffer holding
3833 * the in-memory copy of the inode. Once merged, process any
3834 * allocdirects that are completed by the merger.
3835 */
3836 merge_inode_lists(inodedep);
3837 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3838 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3839 /*
3840 * Now that the inode has been pushed into the buffer, the
3841 * operations dependent on the inode being written to disk
3842 * can be moved to the id_bufwait so that they will be
3843 * processed when the buffer I/O completes.
3844 */
3845 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3846 WORKLIST_REMOVE(wk);
3847 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3848 }
3849 /*
3850 * Newly allocated inodes cannot be written until the bitmap
3851 * that allocates them has been written (indicated by
3852 * DEPCOMPLETE being set in id_state). If we are doing a
3853 * forced sync (e.g., an fsync on a file), we force the bitmap
3854 * to be written so that the update can be done.
3855 */
3856 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3857 FREE_LOCK(&lk);
3858 return;
3859 }
3860 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3861 FREE_LOCK(&lk);
3862 if (gotit &&
3863 (error = BUF_WRITE(inodedep->id_buf)) != 0)
3864 softdep_error("softdep_update_inodeblock: bwrite", error);
3865 if ((inodedep->id_state & DEPCOMPLETE) == 0)
3866 panic("softdep_update_inodeblock: update failed");
3867}
3868
3869/*
3870 * Merge the new inode dependency list (id_newinoupdt) into the old
3871 * inode dependency list (id_inoupdt). This routine must be called
3872 * with splbio interrupts blocked.
3873 */
3874static void
3875merge_inode_lists(inodedep)
3876 struct inodedep *inodedep;
3877{
3878 struct allocdirect *listadp, *newadp;
3879
3880 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3881 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3882 if (listadp->ad_lbn < newadp->ad_lbn) {
3883 listadp = TAILQ_NEXT(listadp, ad_next);
3884 continue;
3885 }
3886 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3887 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3888 if (listadp->ad_lbn == newadp->ad_lbn) {
3889 allocdirect_merge(&inodedep->id_inoupdt, newadp,
3890 listadp);
3891 listadp = newadp;
3892 }
3893 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3894 }
3895 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3896 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3897 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3898 }
3899}
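/*
 * Illustrative sketch (hypothetical types, not kernel code): the routine
 * above is a standard merge of two lbn-sorted lists, with the extra step
 * that entries for the same lbn are collapsed via allocdirect_merge().
 * Omitting that collapse step, the shape of the merge is:
 */
struct lbnode {
	int		 lbn;
	struct lbnode	*next;
};

/* Merge "newlist" into "oldlist"; both are assumed sorted by lbn. */
static struct lbnode *
example_merge_sorted(struct lbnode *oldlist, struct lbnode *newlist)
{
	struct lbnode head, *tail;

	head.next = NULL;
	tail = &head;
	while (oldlist != NULL && newlist != NULL) {
		if (oldlist->lbn <= newlist->lbn) {
			tail->next = oldlist;
			oldlist = oldlist->next;
		} else {
			tail->next = newlist;
			newlist = newlist->next;
		}
		tail = tail->next;
	}
	tail->next = (oldlist != NULL) ? oldlist : newlist;
	return (head.next);
}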
3900
3901/*
3902 * If we are doing an fsync, then we must ensure that any directory
3903 * entries for the inode have been written after the inode gets to disk.
3904 */
3905int
3906softdep_fsync(vp)
3907 struct vnode *vp; /* the "in_core" copy of the inode */
3908{
3909 struct inodedep *inodedep;
3910 struct pagedep *pagedep;
3911 struct worklist *wk;
3912 struct diradd *dap;
3913 struct mount *mnt;
3914 struct vnode *pvp;
3915 struct inode *ip;
3916 struct buf *bp;
3917 struct fs *fs;
3918 struct proc *p = CURPROC; /* XXX */
3919 int error, flushparent;
3920 ino_t parentino;
3921 ufs_lbn_t lbn;
3922
3923 ip = VTOI(vp);
3924 fs = ip->i_fs;
3925 ACQUIRE_LOCK(&lk);
3926 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3927 FREE_LOCK(&lk);
3928 return (0);
3929 }
3930 if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3931 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3932 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3933 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
3934 FREE_LOCK(&lk);
3935 panic("softdep_fsync: pending ops");
3936 }
3937 for (error = 0, flushparent = 0; ; ) {
3938 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3939 break;
3940 if (wk->wk_type != D_DIRADD) {
3941 FREE_LOCK(&lk);
3942 panic("softdep_fsync: Unexpected type %s",
3943 TYPENAME(wk->wk_type));
3944 }
3945 dap = WK_DIRADD(wk);
3946 /*
3947 * Flush our parent if this directory entry
3948 * has a MKDIR_PARENT dependency.
3949 */
3950 if (dap->da_state & DIRCHG)
3951 pagedep = dap->da_previous->dm_pagedep;
3952 else
3953 pagedep = dap->da_pagedep;
3954 mnt = pagedep->pd_mnt;
3955 parentino = pagedep->pd_ino;
3956 lbn = pagedep->pd_lbn;
3957 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
3958 FREE_LOCK(&lk);
3959 panic("softdep_fsync: dirty");
3960 }
3961 flushparent = dap->da_state & MKDIR_PARENT;
3962 /*
3963 * If we are being fsync'ed as part of vgone'ing this vnode,
3964 * then we will not be able to release and recover the
3965 * vnode below, so we just have to give up on writing its
3966 * directory entry out. It will eventually be written, just
3967 * not now, but then the user was not asking to have it
3968 * written, so we are not breaking any promises.
3969 */
3970 if (vp->v_flag & VXLOCK)
3971 break;
3972 /*
3973 * We prevent deadlock by always fetching inodes from the
3974 * root, moving down the directory tree. Thus, when fetching
3975 * our parent directory, we must unlock ourselves before
3976 * requesting the lock on our parent. See the comment in
3977 * ufs_lookup for details on possible races.
3978 */
3979 FREE_LOCK(&lk);
3980 VOP_UNLOCK(vp, 0, p);
3981 error = VFS_VGET(mnt, parentino, &pvp);
3982 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3983 if (error != 0)
3984 return (error);
3985 if (flushparent) {
3986 if ((error = UFS_UPDATE(pvp, 1)) != 0) {
3987 vput(pvp);
3988 return (error);
3989 }
3990 }
3991 /*
3992 * Flush directory page containing the inode's name.
3993 */
3994 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3995 &bp);
3996 if (error == 0)
3997 error = BUF_WRITE(bp);
3998 vput(pvp);
3999 if (error != 0)
4000 return (error);
4001 ACQUIRE_LOCK(&lk);
4002 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4003 break;
4004 }
4005 FREE_LOCK(&lk);
4006 return (0);
4007}
4008
4009/*
4010 * Flush all the dirty bitmaps associated with the block device
4011 * before flushing the rest of the dirty blocks so as to reduce
4012 * the number of dependencies that will have to be rolled back.
4013 */
4014void
4015softdep_fsync_mountdev(vp)
4016 struct vnode *vp;
4017{
4018 struct buf *bp, *nbp;
4019 struct worklist *wk;
4020
4021 if (!vn_isdisk(vp, NULL))
4022 panic("softdep_fsync_mountdev: vnode not a disk");
4023 ACQUIRE_LOCK(&lk);
4024 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4025 nbp = TAILQ_NEXT(bp, b_vnbufs);
4026 /*
4027 * If it is already scheduled, skip to the next buffer.
4028 */
4029 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
4030 continue;
4031 if ((bp->b_flags & B_DELWRI) == 0) {
4032 FREE_LOCK(&lk);
4033 panic("softdep_fsync_mountdev: not dirty");
4034 }
4035 /*
4036 * We are only interested in bitmaps with outstanding
4037 * dependencies.
4038 */
4039 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4040 wk->wk_type != D_BMSAFEMAP ||
4041 (bp->b_xflags & BX_BKGRDINPROG)) {
4042 BUF_UNLOCK(bp);
4043 continue;
4044 }
4045 bremfree(bp);
4046 FREE_LOCK(&lk);
4047 (void) bawrite(bp);
4048 ACQUIRE_LOCK(&lk);
4049 /*
4050 * Since we may have slept during the I/O, we need
4051 * to start from a known point.
4052 */
4053 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4054 }
4055 drain_output(vp, 1);
4056 FREE_LOCK(&lk);
4057}
4058
4059/*
4060 * This routine is called when we are trying to synchronously flush a
4061 * file. This routine must eliminate any filesystem metadata dependencies
4062 * so that the syncing routine can succeed by pushing the dirty blocks
4063 * associated with the file. If any I/O errors occur, they are returned.
4064 */
4065int
4066softdep_sync_metadata(ap)
4067 struct vop_fsync_args /* {
4068 struct vnode *a_vp;
4069 struct ucred *a_cred;
4070 int a_waitfor;
4071 struct proc *a_p;
4072 } */ *ap;
4073{
4074 struct vnode *vp = ap->a_vp;
4075 struct pagedep *pagedep;
4076 struct allocdirect *adp;
4077 struct allocindir *aip;
4078 struct buf *bp, *nbp;
4079 struct worklist *wk;
4080 int i, error, waitfor;
4081
4082 /*
4083 * Check whether this vnode is involved in a filesystem
4084 * that is doing soft dependency processing.
4085 */
4086 if (!vn_isdisk(vp, NULL)) {
4087 if (!DOINGSOFTDEP(vp))
4088 return (0);
4089 } else
4090 if (vp->v_rdev->si_mountpoint == NULL ||
4091 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4092 return (0);
4093 /*
4094 * Ensure that any direct block dependencies have been cleared.
4095 */
4096 ACQUIRE_LOCK(&lk);
4097 if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4098 FREE_LOCK(&lk);
4099 return (error);
4100 }
4101 /*
4102 * For most files, the only metadata dependencies are the
4103 * cylinder group maps that allocate their inode or blocks.
4104 * The block allocation dependencies can be found by traversing
4105 * the dependency lists for any buffers that remain on their
4106 * dirty buffer list. The inode allocation dependency will
4107 * be resolved when the inode is updated with MNT_WAIT.
4108 * This work is done in two passes. The first pass grabs most
4109 * of the buffers and begins asynchronously writing them. The
4110 * only way to wait for these asynchronous writes is to sleep
4111 * on the filesystem vnode which may stay busy for a long time
4112 * if the filesystem is active. So, instead, we make a second
4113 * pass over the dependencies blocking on each write. In the
4114 * usual case we will be blocking against a write that we
4115 * initiated, so when it is done the dependency will have been
4116 * resolved. Thus the second pass is expected to end quickly.
4117 */
4118 waitfor = MNT_NOWAIT;
4119top:
4120 if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
4121 FREE_LOCK(&lk);
4122 return (0);
4123 }
4124 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4125 /* While syncing snapshots, we must allow recursive lookups */
4126 bp->b_lock.lk_flags |= LK_CANRECURSE;
4127loop:
4128 /*
4129 * As we hold the buffer locked, none of its dependencies
4130 * will disappear.
4131 */
4132 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4133 switch (wk->wk_type) {
4134
4135 case D_ALLOCDIRECT:
4136 adp = WK_ALLOCDIRECT(wk);
4137 if (adp->ad_state & DEPCOMPLETE)
4138 continue;
4139 nbp = adp->ad_buf;
4140 if (getdirtybuf(&nbp, waitfor) == 0)
4141 continue;
4142 FREE_LOCK(&lk);
4143 if (waitfor == MNT_NOWAIT) {
4144 bawrite(nbp);
4145 } else if ((error = BUF_WRITE(nbp)) != 0) {
4146 break;
4147 }
4148 ACQUIRE_LOCK(&lk);
4149 continue;
4150
4151 case D_ALLOCINDIR:
4152 aip = WK_ALLOCINDIR(wk);
4153 if (aip->ai_state & DEPCOMPLETE)
4154 continue;
4155 nbp = aip->ai_buf;
4156 if (getdirtybuf(&nbp, waitfor) == 0)
4157 continue;
4158 FREE_LOCK(&lk);
4159 if (waitfor == MNT_NOWAIT) {
4160 bawrite(nbp);
4161 } else if ((error = BUF_WRITE(nbp)) != 0) {
4162 break;
4163 }
4164 ACQUIRE_LOCK(&lk);
4165 continue;
4166
4167 case D_INDIRDEP:
4168 restart:
4169
4170 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4171 if (aip->ai_state & DEPCOMPLETE)
4172 continue;
4173 nbp = aip->ai_buf;
4174 if (getdirtybuf(&nbp, MNT_WAIT) == 0)
4175 goto restart;
4176 FREE_LOCK(&lk);
4177 if ((error = BUF_WRITE(nbp)) != 0) {
4178 break;
4179 }
4180 ACQUIRE_LOCK(&lk);
4181 goto restart;
4182 }
4183 continue;
4184
4185 case D_INODEDEP:
4186 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4187 WK_INODEDEP(wk)->id_ino)) != 0) {
4188 FREE_LOCK(&lk);
4189 break;
4190 }
4191 continue;
4192
4193 case D_PAGEDEP:
4194 /*
4195 * We are trying to sync a directory that may
4196 * have dependencies on both its own metadata
4197 * and on the inodes of any
4198 * recently allocated files. We walk its diradd
4199 * lists pushing out the associated inode.
4200 */
4201 pagedep = WK_PAGEDEP(wk);
4202 for (i = 0; i < DAHASHSZ; i++) {
4203 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4204 continue;
4205 if ((error =
4206 flush_pagedep_deps(vp, pagedep->pd_mnt,
4207 &pagedep->pd_diraddhd[i]))) {
4208 FREE_LOCK(&lk);
4209 break;
4210 }
4211 }
4212 continue;
4213
4214 case D_MKDIR:
4215 /*
4216 * This case should never happen if the vnode has
4217 * been properly sync'ed. However, if this function
4218 * is used at a place where the vnode has not yet
4219 * been sync'ed, this dependency can show up. So,
4220 * rather than panic, just flush it.
4221 */
4222 nbp = WK_MKDIR(wk)->md_buf;
4223 if (getdirtybuf(&nbp, waitfor) == 0)
4224 continue;
4225 FREE_LOCK(&lk);
4226 if (waitfor == MNT_NOWAIT) {
4227 bawrite(nbp);
4228 } else if ((error = BUF_WRITE(nbp)) != 0) {
4229 break;
4230 }
4231 ACQUIRE_LOCK(&lk);
4232 continue;
4233
4234 case D_BMSAFEMAP:
4235 /*
4236 * This case should never happen if the vnode has
4237 * been properly sync'ed. However, if this function
4238 * is used at a place where the vnode has not yet
4239 * been sync'ed, this dependency can show up. So,
4240 * rather than panic, just flush it.
4241 */
4242 nbp = WK_BMSAFEMAP(wk)->sm_buf;
4243 if (getdirtybuf(&nbp, waitfor) == 0)
4244 continue;
4245 FREE_LOCK(&lk);
4246 if (waitfor == MNT_NOWAIT) {
4247 bawrite(nbp);
4248 } else if ((error = BUF_WRITE(nbp)) != 0) {
4249 break;
4250 }
4251 ACQUIRE_LOCK(&lk);
4252 continue;
4253
4254 default:
4255 FREE_LOCK(&lk);
4256 panic("softdep_sync_metadata: Unknown type %s",
4257 TYPENAME(wk->wk_type));
4258 /* NOTREACHED */
4259 }
4260 /* We reach here only on error, with the lock already released */
4261 if (error == 0)
4262 panic("softdep_sync_metadata: zero error");
4263 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4264 bawrite(bp);
4265 return (error);
4266 }
4267 (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4268 nbp = TAILQ_NEXT(bp, b_vnbufs);
4269 FREE_LOCK(&lk);
4270 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4271 bawrite(bp);
4272 ACQUIRE_LOCK(&lk);
4273 if (nbp != NULL) {
4274 bp = nbp;
4275 goto loop;
4276 }
4277 /*
4278 * We must wait for any I/O in progress to finish so that
4279 * all potential buffers on the dirty list will be visible.
4280 * Once they are all there, proceed with the second pass
4281 * which will wait for the I/O as per above.
4282 */
4283 drain_output(vp, 1);
4284 /*
4285 * The brief unlock is to allow any pent up dependency
4286 * processing to be done.
4287 */
4288 if (waitfor == MNT_NOWAIT) {
4289 waitfor = MNT_WAIT;
4290 FREE_LOCK(&lk);
4291 ACQUIRE_LOCK(&lk);
4292 goto top;
4293 }
4294
4295 /*
4296 * If we have managed to get rid of all the dirty buffers,
4297 * then we are done. For certain directories and block
4298 * devices, we may need to do further work.
4299 */
4300 if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4301 FREE_LOCK(&lk);
4302 return (0);
4303 }
4304
4305 FREE_LOCK(&lk);
4306 /*
4307 * If we are trying to sync a block device, some of its buffers may
4308 * contain metadata that cannot be written until the contents of some
4309 * partially written files have been written to disk. The only easy
4310 * way to accomplish this is to sync the entire filesystem (luckily
4311 * this happens rarely).
4312 */
4313 if (vn_isdisk(vp, NULL) &&
4314 vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
4315 (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
4316 ap->a_p)) != 0)
4317 return (error);
4318 return (0);
4319}
4320
4321/*
4322 * Flush the dependencies associated with an inodedep.
4323 * Called with splbio blocked.
4324 */
4325static int
4326flush_inodedep_deps(fs, ino)
4327 struct fs *fs;
4328 ino_t ino;
4329{
4330 struct inodedep *inodedep;
4331 struct allocdirect *adp;
4332 int error, waitfor;
4333 struct buf *bp;
4334
4335 /*
4336 * This work is done in two passes. The first pass grabs most
4337 * of the buffers and begins asynchronously writing them. The
4338 * only way to wait for these asynchronous writes is to sleep
4339 * on the filesystem vnode which may stay busy for a long time
4340 * if the filesystem is active. So, instead, we make a second
4341 * pass over the dependencies blocking on each write. In the
4342 * usual case we will be blocking against a write that we
4343 * initiated, so when it is done the dependency will have been
4344 * resolved. Thus the second pass is expected to end quickly.
4345 * We give a brief window at the top of the loop to allow
4346 * any pending I/O to complete.
4347 */
4348 for (waitfor = MNT_NOWAIT; ; ) {
4349 FREE_LOCK(&lk);
4350 ACQUIRE_LOCK(&lk);
4351 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4352 return (0);
4353 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4354 if (adp->ad_state & DEPCOMPLETE)
4355 continue;
4356 bp = adp->ad_buf;
4357 if (getdirtybuf(&bp, waitfor) == 0) {
4358 if (waitfor == MNT_NOWAIT)
4359 continue;
4360 break;
4361 }
4362 FREE_LOCK(&lk);
4363 if (waitfor == MNT_NOWAIT) {
4364 bawrite(bp);
4365 } else if ((error = BUF_WRITE(bp)) != 0) {
4366 ACQUIRE_LOCK(&lk);
4367 return (error);
4368 }
4369 ACQUIRE_LOCK(&lk);
4370 break;
4371 }
4372 if (adp != NULL)
4373 continue;
4374 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
4375 if (adp->ad_state & DEPCOMPLETE)
4376 continue;
4377 bp = adp->ad_buf;
4378 if (getdirtybuf(&bp, waitfor) == 0) {
4379 if (waitfor == MNT_NOWAIT)
4380 continue;
4381 break;
4382 }
4383 FREE_LOCK(&lk);
4384 if (waitfor == MNT_NOWAIT) {
4385 bawrite(bp);
4386 } else if ((error = BUF_WRITE(bp)) != 0) {
4387 ACQUIRE_LOCK(&lk);
4388 return (error);
4389 }
4390 ACQUIRE_LOCK(&lk);
4391 break;
4392 }
4393 if (adp != NULL)
4394 continue;
4395 /*
4396 * If we just completed pass 2, we are done; otherwise, start pass 2.
4397 */
4398 if (waitfor == MNT_WAIT)
4399 break;
4400 waitfor = MNT_WAIT;
4401 }
4402 /*
4403 * Try freeing inodedep in case all dependencies have been removed.
4404 */
4405 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4406 (void) free_inodedep(inodedep);
4407 return (0);
4408}
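/*
 * Illustrative sketch (hypothetical primitives, not kernel code): the
 * two-pass scheme used above and in softdep_sync_metadata() issues
 * asynchronous writes on the first pass and then repeats the scan blocking
 * on each remaining write.  Its skeleton, with stand-ins for
 * getdirtybuf()/bawrite()/BUF_WRITE(), is roughly:
 */
struct depbuf;					/* hypothetical buffer handle */
/* Assumed to yield each buffer with an unresolved dependency once per pass. */
extern struct depbuf	*example_next_dirty(void);
extern void		 example_start_async(struct depbuf *);
extern int		 example_write_wait(struct depbuf *);

static int
example_flush_two_pass(void)
{
	struct depbuf *bp;
	int blocking, error;

	for (blocking = 0;; blocking = 1) {
		while ((bp = example_next_dirty()) != NULL) {
			if (blocking == 0)
				example_start_async(bp);	/* pass 1 */
			else if ((error = example_write_wait(bp)) != 0)
				return (error);			/* pass 2 */
		}
		if (blocking)
			return (0);		/* both passes completed */
	}
}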
4409
4410/*
4411 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4412 * Called with splbio blocked.
4413 */
4414static int
4415flush_pagedep_deps(pvp, mp, diraddhdp)
4416 struct vnode *pvp;
4417 struct mount *mp;
4418 struct diraddhd *diraddhdp;
4419{
4420 struct proc *p = CURPROC; /* XXX */
4421 struct inodedep *inodedep;
4422 struct ufsmount *ump;
4423 struct diradd *dap;
4424 struct vnode *vp;
4425 int gotit, error = 0;
4426 struct buf *bp;
4427 ino_t inum;
4428
4429 ump = VFSTOUFS(mp);
4430 while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4431 /*
4432 * Flush ourselves if this directory entry
4433 * has a MKDIR_PARENT dependency.
4434 */
4435 if (dap->da_state & MKDIR_PARENT) {
4436 FREE_LOCK(&lk);
4437 if ((error = UFS_UPDATE(pvp, 1)) != 0)
4438 break;
4439 ACQUIRE_LOCK(&lk);
4440 /*
4441 * If that cleared dependencies, go on to next.
4442 */
4443 if (dap != LIST_FIRST(diraddhdp))
4444 continue;
4445 if (dap->da_state & MKDIR_PARENT) {
4446 FREE_LOCK(&lk);
4447 panic("flush_pagedep_deps: MKDIR_PARENT");
4448 }
4449 }
4450 /*
4451 * A newly allocated directory must have its "." and
4452 * ".." entries written out before its name can be
4453 * committed in its parent. We do not want or need
4454 * the full semantics of a synchronous VOP_FSYNC as
4455 * that may end up here again, once for each directory
4456 * level in the filesystem. Instead, we push the blocks
4457 * and wait for them to clear. We have to fsync twice
4458 * because the first call may choose to defer blocks
4459 * that still have dependencies, but deferral will
4460 * happen at most once.
4461 */
4462 inum = dap->da_newinum;
4463 if (dap->da_state & MKDIR_BODY) {
4464 FREE_LOCK(&lk);
4465 if ((error = VFS_VGET(mp, inum, &vp)) != 0)
4466 break;
4467 if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
4468 (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4469 vput(vp);
4470 break;
4471 }
4472 drain_output(vp, 0);
4473 vput(vp);
4474 ACQUIRE_LOCK(&lk);
4475 /*
4476 * If that cleared dependencies, go on to next.
4477 */
4478 if (dap != LIST_FIRST(diraddhdp))
4479 continue;
4480 if (dap->da_state & MKDIR_BODY) {
4481 FREE_LOCK(&lk);
4482 panic("flush_pagedep_deps: MKDIR_BODY");
4483 }
4484 }
4485 /*
4486 * Flush the inode on which the directory entry depends.
4487 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4488 * the only remaining dependency is that the updated inode
4489 * count must get pushed to disk. The inode has already
4490 * been pushed into its inode buffer (via VOP_UPDATE) at
4491 * the time of the reference count change. So we need only
4492 * locate that buffer, ensure that there will be no rollback
4493 * caused by a bitmap dependency, then write the inode buffer.
4494 */
4495 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
4496 FREE_LOCK(&lk);
4497 panic("flush_pagedep_deps: lost inode");
4498 }
4499 /*
4500 * If the inode still has bitmap dependencies,
4501 * push them to disk.
4502 */
4503 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4504 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4505 FREE_LOCK(&lk);
4506 if (gotit &&
4507 (error = BUF_WRITE(inodedep->id_buf)) != 0)
4508 break;
4509 ACQUIRE_LOCK(&lk);
4510 if (dap != LIST_FIRST(diraddhdp))
4511 continue;
4512 }
4513 /*
4514 * If the inode is still sitting in a buffer waiting
4515 * to be written, push it to disk.
4516 */
4517 FREE_LOCK(&lk);
4518 if ((error = bread(ump->um_devvp,
4519 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4520 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4521 break;
4522 if ((error = BUF_WRITE(bp)) != 0)
4523 break;
4524 ACQUIRE_LOCK(&lk);
4525 /*
4526 * If we have failed to get rid of all the dependencies
4527 * then something is seriously wrong.
4528 */
4529 if (dap == LIST_FIRST(diraddhdp)) {
4530 FREE_LOCK(&lk);
4531 panic("flush_pagedep_deps: flush failed");
4532 }
4533 }
4534 if (error)
4535 ACQUIRE_LOCK(&lk);
4536 return (error);
4537}
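/*
 * Note the pattern used repeatedly above: after each blocking flush the
 * routine re-checks whether dap is still at the head of diraddhdp.  The
 * lock is dropped around every write, so the flush itself (or concurrent
 * activity) may already have retired the entry; only if the same entry is
 * still pending is the corresponding state bit re-examined or, at the end,
 * a "flush failed" panic raised.
 */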
4538
4539/*
4540 * A large burst of file addition or deletion activity can drive the
4541 * memory load excessively high. First attempt to slow things down
4542 * using the techniques below. If that fails, this routine requests
4543 * the offending operations to fall back to running synchronously
4544 * until the memory load returns to a reasonable level.
4545 */
4546int
4547softdep_slowdown(vp)
4548 struct vnode *vp;
4549{
4550 int max_softdeps_hard;
4551
4552 max_softdeps_hard = max_softdeps * 11 / 10;
4553 if (num_dirrem < max_softdeps_hard / 2 &&
4554 num_inodedep < max_softdeps_hard)
4555 return (0);
4556 stat_sync_limit_hit += 1;
4557 return (1);
4558}
4559
4560/*
4561 * If memory utilization has gotten too high, deliberately slow things
4562 * down and speed up the I/O processing.
4563 */
4564static int
4565request_cleanup(resource, islocked)
4566 int resource;
4567 int islocked;
4568{
4569 struct proc *p = CURPROC;
4570
4571 /*
4572 * We never hold up the filesystem syncer process.
4573 */
4574 if (p == filesys_syncer)
4575 return (0);
4576 /*
4577 * First check to see if the work list has gotten backlogged.
4578 * If it has, co-opt this process to help clean up two entries.
4579 * Because this process may hold inodes locked, we cannot
4580 * handle any remove requests that might block on a locked
4581 * inode as that could lead to deadlock.
4582 */
4583 if (num_on_worklist > max_softdeps / 10) {
4584 if (islocked)
4585 FREE_LOCK(&lk);
4586 process_worklist_item(NULL, LK_NOWAIT);
4587 process_worklist_item(NULL, LK_NOWAIT);
4588 stat_worklist_push += 2;
4589 if (islocked)
4590 ACQUIRE_LOCK(&lk);
4591 return(1);
4592 }
4593 /*
4594 * Next, we attempt to speed up the syncer process. If that
4595 * is successful, then we allow the process to continue.
4596 */
4597 if (speedup_syncer())
4598 return(0);
4599 /*
4600 * If we are resource constrained on inode dependencies, try
4601 * flushing some dirty inodes. Otherwise, we are constrained
4602 * by file deletions, so try accelerating flushes of directories
4603 * with removal dependencies. We would like to do the cleanup
4604 * here, but we probably hold an inode locked at this point and
4605 * that might deadlock against one that we try to clean. So,
4606 * the best that we can do is request the syncer daemon to do
4607 * the cleanup for us.
4608 */
4609 switch (resource) {
4610
4611 case FLUSH_INODES:
4612 stat_ino_limit_push += 1;
4613 req_clear_inodedeps += 1;
4614 stat_countp = &stat_ino_limit_hit;
4615 break;
4616
4617 case FLUSH_REMOVE:
4618 stat_blk_limit_push += 1;
4619 req_clear_remove += 1;
4620 stat_countp = &stat_blk_limit_hit;
4621 break;
4622
4623 default:
4624 if (islocked)
4625 FREE_LOCK(&lk);
4626 panic("request_cleanup: unknown type");
4627 }
4628 /*
4629 * Hopefully the syncer daemon will catch up and awaken us.
4630 * We wait at most tickdelay before proceeding in any case.
4631 */
4632 if (islocked == 0)
4633 ACQUIRE_LOCK(&lk);
4634 proc_waiting += 1;
4635 if (handle.callout == NULL)
4636 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4637 FREE_LOCK_INTERLOCKED(&lk);
4638 (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4639 ACQUIRE_LOCK_INTERLOCKED(&lk);
4640 proc_waiting -= 1;
4641 if (islocked == 0)
4642 FREE_LOCK(&lk);
4643 return (1);
4644}
4645
4646/*
4647 * Awaken processes pausing in request_cleanup and clear proc_waiting
4648 * to indicate that there is no longer a timer running.
4649 */
4650void
4651pause_timer(arg)
4652 void *arg;
4653{
4654
4655 *stat_countp += 1;
4656 wakeup_one(&proc_waiting);
4657 if (proc_waiting > 0)
4658 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4659 else
4660 handle.callout = NULL;
4661}
4662
4663/*
4664 * Flush out a directory with at least one removal dependency in an effort to
4665 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4666 */
4667static void
4668clear_remove(p)
4669 struct proc *p;
4670{
4671 struct pagedep_hashhead *pagedephd;
4672 struct pagedep *pagedep;
4673 static int next = 0;
4674 struct mount *mp;
4675 struct vnode *vp;
4676 int error, cnt;
4677 ino_t ino;
4678
4679 ACQUIRE_LOCK(&lk);
4680 for (cnt = 0; cnt < pagedep_hash; cnt++) {
4681 pagedephd = &pagedep_hashtbl[next++];
4682 if (next >= pagedep_hash)
4683 next = 0;
4684 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
4685 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4686 continue;
4687 mp = pagedep->pd_mnt;
4688 ino = pagedep->pd_ino;
4689 FREE_LOCK(&lk);
4690 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4691 continue;
4692 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4693 softdep_error("clear_remove: vget", error);
4694 vn_finished_write(mp);
4695 return;
4696 }
4697 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4698 softdep_error("clear_remove: fsync", error);
4699 drain_output(vp, 0);
4700 vput(vp);
4701 vn_finished_write(mp);
4702 return;
4703 }
4704 }
4705 FREE_LOCK(&lk);
4706}
4707
4708/*
4709 * Clear out a block of dirty inodes in an effort to reduce
4710 * the number of inodedep dependency structures.
4711 */
4712static void
4713clear_inodedeps(p)
4714 struct proc *p;
4715{
4716 struct inodedep_hashhead *inodedephd;
4717 struct inodedep *inodedep;
4718 static int next = 0;
4719 struct mount *mp;
4720 struct vnode *vp;
4721 struct fs *fs;
4722 int error, cnt;
4723 ino_t firstino, lastino, ino;
4724
4725 ACQUIRE_LOCK(&lk);
4726 /*
4727 * Pick a random inode dependency to be cleared.
4728 * We will then gather up all the inodes in its block
4729 * that have dependencies and flush them out.
4730 */
4731 for (cnt = 0; cnt < inodedep_hash; cnt++) {
4732 inodedephd = &inodedep_hashtbl[next++];
4733 if (next >= inodedep_hash)
4734 next = 0;
4735 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4736 break;
4737 }
4738 if (inodedep == NULL)
4739 return;
4740 /*
4741 * Ugly code to find mount point given pointer to superblock.
4742 */
4743 fs = inodedep->id_fs;
4744 TAILQ_FOREACH(mp, &mountlist, mnt_list)
4745 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4746 break;
4747 /*
4748 * Find the last inode in the block with dependencies.
4749 */
4750 firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4751 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4752 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4753 break;
4754 /*
4755 * Asynchronously push all but the last inode with dependencies.
4756 * Synchronously push the last inode with dependencies to ensure
4757 * that the inode block gets written to free up the inodedeps.
4758 */
4759 for (ino = firstino; ino <= lastino; ino++) {
4760 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4761 continue;
4762 FREE_LOCK(&lk);
4763 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4764 continue;
4765 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4766 softdep_error("clear_inodedeps: vget", error);
4767 vn_finished_write(mp);
4768 return;
4769 }
4770 if (ino == lastino) {
4771 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
4772 softdep_error("clear_inodedeps: fsync1", error);
4773 } else {
4774 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4775 softdep_error("clear_inodedeps: fsync2", error);
4776 drain_output(vp, 0);
4777 }
4778 vput(vp);
4779 vn_finished_write(mp);
4780 ACQUIRE_LOCK(&lk);
4781 }
4782 FREE_LOCK(&lk);
4783}
4784
4785/*
4786 * Function to determine if the buffer has outstanding dependencies
4787 * that will cause a roll-back if the buffer is written. If wantcount
4788 * is set, return number of dependencies, otherwise just yes or no.
4789 */
4790static int
4791softdep_count_dependencies(bp, wantcount)
4792 struct buf *bp;
4793 int wantcount;
4794{
4795 struct worklist *wk;
4796 struct inodedep *inodedep;
4797 struct indirdep *indirdep;
4798 struct allocindir *aip;
4799 struct pagedep *pagedep;
4800 struct diradd *dap;
4801 int i, retval;
4802
4803 retval = 0;
4804 ACQUIRE_LOCK(&lk);
4805 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4806 switch (wk->wk_type) {
4807
4808 case D_INODEDEP:
4809 inodedep = WK_INODEDEP(wk);
4810 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4811 /* bitmap allocation dependency */
4812 retval += 1;
4813 if (!wantcount)
4814 goto out;
4815 }
4816 if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4817 /* direct block pointer dependency */
4818 retval += 1;
4819 if (!wantcount)
4820 goto out;
4821 }
4822 continue;
4823
4824 case D_INDIRDEP:
4825 indirdep = WK_INDIRDEP(wk);
4826
4827 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
4828 /* indirect block pointer dependency */
4829 retval += 1;
4830 if (!wantcount)
4831 goto out;
4832 }
4833 continue;
4834
4835 case D_PAGEDEP:
4836 pagedep = WK_PAGEDEP(wk);
4837 for (i = 0; i < DAHASHSZ; i++) {
4838
4839 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
4840 /* directory entry dependency */
4841 retval += 1;
4842 if (!wantcount)
4843 goto out;
4844 }
4845 }
4846 continue;
4847
4848 case D_BMSAFEMAP:
4849 case D_ALLOCDIRECT:
4850 case D_ALLOCINDIR:
4851 case D_MKDIR:
4852 /* never a dependency on these blocks */
4853 continue;
4854
4855 default:
4856 FREE_LOCK(&lk);
4857 panic("softdep_check_for_rollback: Unexpected type %s",
4858 TYPENAME(wk->wk_type));
4859 /* NOTREACHED */
4860 }
4861 }
4862out:
4863 FREE_LOCK(&lk);
4864 return retval;
4865}
4866
4867/*
4868 * Acquire exclusive access to a buffer.
4869 * Must be called with splbio blocked.
4870 * Return 1 if buffer was acquired.
4871 */
4872static int
4873getdirtybuf(bpp, waitfor)
4874 struct buf **bpp;
4875 int waitfor;
4876{
4877 struct buf *bp;
4878
4879 for (;;) {
4880 if ((bp = *bpp) == NULL)
4881 return (0);
4882 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4883 if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4884 break;
4885 BUF_UNLOCK(bp);
4886 if (waitfor != MNT_WAIT)
4887 return (0);
4888 bp->b_xflags |= BX_BKGRDWAIT;
4889 FREE_LOCK_INTERLOCKED(&lk);
4890 tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
4891 ACQUIRE_LOCK_INTERLOCKED(&lk);
4892 continue;
4893 }
4894 if (waitfor != MNT_WAIT)
4895 return (0);
4896 FREE_LOCK_INTERLOCKED(&lk);
4897 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
4898 panic("getdirtybuf: inconsistent lock");
4899 ACQUIRE_LOCK_INTERLOCKED(&lk);
4900 }
4901 if ((bp->b_flags & B_DELWRI) == 0) {
4902 BUF_UNLOCK(bp);
4903 return (0);
4904 }
4905 bremfree(bp);
4906 return (1);
4907}
4908
4909/*
4910 * Wait for pending output on a vnode to complete.
4911 * Must be called with vnode locked.
4912 */
4913static void
4914drain_output(vp, islocked)
4915 struct vnode *vp;
4916 int islocked;
4917{
4918
4919 if (!islocked)
4920 ACQUIRE_LOCK(&lk);
4921 while (vp->v_numoutput) {
4922 vp->v_flag |= VBWAIT;
4923 FREE_LOCK_INTERLOCKED(&lk);
4924 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
4925 ACQUIRE_LOCK_INTERLOCKED(&lk);
4926 }
4927 if (!islocked)
4928 FREE_LOCK(&lk);
4929}
4930
4931/*
4932 * Called whenever a buffer that is being invalidated or reallocated
4933 * contains dependencies. This should only happen if an I/O error has
4934 * occurred. The routine is called with the buffer locked.
4935 */
4936static void
4937softdep_deallocate_dependencies(bp)
4938 struct buf *bp;
4939{
4940
4941 if ((bp->b_ioflags & BIO_ERROR) == 0)
4942 panic("softdep_deallocate_dependencies: dangling deps");
4943 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4944 panic("softdep_deallocate_dependencies: unrecovered I/O error");
4945}
4946
4947/*
4948 * Function to handle asynchronous write errors in the filesystem.
4949 */
4950void
4951softdep_error(func, error)
4952 char *func;
4953 int error;
4954{
4955
4956 /* XXX should do something better! */
4957 printf("%s: got error %d while accessing filesystem\n", func, error);
4958}
3839 FREE_LOCK(&lk);
3840}
3841
3842/*
3843 * This routine is called just before the "in-core" inode
3844 * information is to be copied to the in-memory inode block.
3845 * Recall that an inode block contains several inodes. If
3846 * the force flag is set, then the dependencies will be
3847 * cleared so that the update can always be made. Note that
3848 * the buffer is locked when this routine is called, so we
3849 * will never be in the middle of writing the inode block
3850 * to disk.
3851 */
3852void
3853softdep_update_inodeblock(ip, bp, waitfor)
3854 struct inode *ip; /* the "in_core" copy of the inode */
3855 struct buf *bp; /* the buffer containing the inode block */
3856 int waitfor; /* nonzero => update must be allowed */
3857{
3858 struct inodedep *inodedep;
3859 struct worklist *wk;
3860 int error, gotit;
3861
3862 /*
3863 * If the effective link count is not equal to the actual link
3864 * count, then we must track the difference in an inodedep while
3865 * the inode is (potentially) tossed out of the cache. Otherwise,
3866 * if there is no existing inodedep, then there are no dependencies
3867 * to track.
3868 */
3869 ACQUIRE_LOCK(&lk);
3870 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3871 FREE_LOCK(&lk);
3872 if (ip->i_effnlink != ip->i_nlink)
3873 panic("softdep_update_inodeblock: bad link count");
3874 return;
3875 }
3876 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
3877 FREE_LOCK(&lk);
3878 panic("softdep_update_inodeblock: bad delta");
3879 }
3880 /*
3881 * Changes have been initiated. Anything depending on these
3882 * changes cannot occur until this inode has been written.
3883 */
3884 inodedep->id_state &= ~COMPLETE;
3885 if ((inodedep->id_state & ONWORKLIST) == 0)
3886 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3887 /*
3888 * Any new dependencies associated with the incore inode must
3889 * now be moved to the list associated with the buffer holding
3890 * the in-memory copy of the inode. Once merged, process any
3891 * allocdirects that are completed by the merge.
3892 */
3893 merge_inode_lists(inodedep);
3894 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3895 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3896 /*
3897 * Now that the inode has been pushed into the buffer, the
3898 * operations dependent on the inode being written to disk
3899 * can be moved to the id_bufwait so that they will be
3900 * processed when the buffer I/O completes.
3901 */
3902 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3903 WORKLIST_REMOVE(wk);
3904 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3905 }
3906 /*
3907 * Newly allocated inodes cannot be written until the bitmap
3908 * that allocates them has been written (indicated by
3909 * DEPCOMPLETE being set in id_state). If we are doing a
3910 * forced sync (e.g., an fsync on a file), we force the bitmap
3911 * to be written so that the update can be done.
3912 */
3913 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3914 FREE_LOCK(&lk);
3915 return;
3916 }
3917 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3918 FREE_LOCK(&lk);
3919 if (gotit &&
3920 (error = BUF_WRITE(inodedep->id_buf)) != 0)
3921 softdep_error("softdep_update_inodeblock: bwrite", error);
3922 if ((inodedep->id_state & DEPCOMPLETE) == 0)
3923 panic("softdep_update_inodeblock: update failed");
3924}
3925
3926/*
3927 * Merge the new inode dependency list (id_newinoupdt) into the old
3928 * inode dependency list (id_inoupdt). This routine must be called
3929 * with splbio interrupts blocked.
3930 */
3931static void
3932merge_inode_lists(inodedep)
3933 struct inodedep *inodedep;
3934{
3935 struct allocdirect *listadp, *newadp;
3936
3937 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3938 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3939 if (listadp->ad_lbn < newadp->ad_lbn) {
3940 listadp = TAILQ_NEXT(listadp, ad_next);
3941 continue;
3942 }
3943 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3944 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3945 if (listadp->ad_lbn == newadp->ad_lbn) {
3946 allocdirect_merge(&inodedep->id_inoupdt, newadp,
3947 listadp);
3948 listadp = newadp;
3949 }
3950 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3951 }
3952 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3953 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3954 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3955 }
3956}
3957
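/*
 * A minimal, self-contained sketch (illustrative only; the example_dep type
 * and function name are invented for this sketch and are not part of the
 * soft updates code) of the merge performed by merge_inode_lists() above:
 * two lists kept sorted by logical block number are combined, and when both
 * lists carry an entry for the same block the entry from the new list
 * supersedes the old one.  The code above calls allocdirect_merge() at that
 * point to combine the two entries rather than simply dropping one.
 */
#if 0
#include <stddef.h>

struct example_dep {
	long lbn;			/* logical block number, the sort key */
	struct example_dep *next;
};

static struct example_dep *
example_merge_sorted(struct example_dep *oldlist, struct example_dep *newlist)
{
	struct example_dep head, *tail;

	head.next = NULL;
	tail = &head;
	while (oldlist != NULL && newlist != NULL) {
		if (oldlist->lbn < newlist->lbn) {
			tail->next = oldlist;
			oldlist = oldlist->next;
		} else if (newlist->lbn < oldlist->lbn) {
			tail->next = newlist;
			newlist = newlist->next;
		} else {
			/* Same block in both lists: keep the newer entry. */
			oldlist = oldlist->next;
			tail->next = newlist;
			newlist = newlist->next;
		}
		tail = tail->next;
	}
	tail->next = (oldlist != NULL) ? oldlist : newlist;
	return (head.next);
}
#endif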
3958/*
3959 * If we are doing an fsync, then we must ensure that any directory
3960 * entries for the inode have been written after the inode gets to disk.
3961 */
3962int
3963softdep_fsync(vp)
3964 struct vnode *vp; /* the "in_core" copy of the inode */
3965{
3966 struct inodedep *inodedep;
3967 struct pagedep *pagedep;
3968 struct worklist *wk;
3969 struct diradd *dap;
3970 struct mount *mnt;
3971 struct vnode *pvp;
3972 struct inode *ip;
3973 struct buf *bp;
3974 struct fs *fs;
3975 struct proc *p = CURPROC; /* XXX */
3976 int error, flushparent;
3977 ino_t parentino;
3978 ufs_lbn_t lbn;
3979
3980 ip = VTOI(vp);
3981 fs = ip->i_fs;
3982 ACQUIRE_LOCK(&lk);
3983 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3984 FREE_LOCK(&lk);
3985 return (0);
3986 }
3987 if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3988 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3989 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3990 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
3991 FREE_LOCK(&lk);
3992 panic("softdep_fsync: pending ops");
3993 }
3994 for (error = 0, flushparent = 0; ; ) {
3995 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3996 break;
3997 if (wk->wk_type != D_DIRADD) {
3998 FREE_LOCK(&lk);
3999 panic("softdep_fsync: Unexpected type %s",
4000 TYPENAME(wk->wk_type));
4001 }
4002 dap = WK_DIRADD(wk);
4003 /*
4004 * Flush our parent if this directory entry
4005 * has a MKDIR_PARENT dependency.
4006 */
4007 if (dap->da_state & DIRCHG)
4008 pagedep = dap->da_previous->dm_pagedep;
4009 else
4010 pagedep = dap->da_pagedep;
4011 mnt = pagedep->pd_mnt;
4012 parentino = pagedep->pd_ino;
4013 lbn = pagedep->pd_lbn;
4014 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4015 FREE_LOCK(&lk);
4016 panic("softdep_fsync: dirty");
4017 }
4018 flushparent = dap->da_state & MKDIR_PARENT;
4019 /*
4020 * If we are being fsync'ed as part of vgone'ing this vnode,
4021 * then we will not be able to release and recover the
4022 * vnode below, so we just have to give up on writing its
4023 * directory entry out. It will eventually be written, just
4024 * not now, but then the user was not asking to have it
4025 * written, so we are not breaking any promises.
4026 */
4027 if (vp->v_flag & VXLOCK)
4028 break;
4029 /*
4030 * We prevent deadlock by always fetching inodes from the
4031 * root, moving down the directory tree. Thus, when fetching
4032 * our parent directory, we must unlock ourselves before
4033 * requesting the lock on our parent. See the comment in
4034 * ufs_lookup for details on possible races.
4035 */
4036 FREE_LOCK(&lk);
4037 VOP_UNLOCK(vp, 0, p);
4038 error = VFS_VGET(mnt, parentino, &pvp);
4039 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
4040 if (error != 0)
4041 return (error);
4042 if (flushparent) {
4043 if ((error = UFS_UPDATE(pvp, 1)) != 0) {
4044 vput(pvp);
4045 return (error);
4046 }
4047 }
4048 /*
4049 * Flush directory page containing the inode's name.
4050 */
4051 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
4052 &bp);
4053 if (error == 0)
4054 error = BUF_WRITE(bp);
4055 vput(pvp);
4056 if (error != 0)
4057 return (error);
4058 ACQUIRE_LOCK(&lk);
4059 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4060 break;
4061 }
4062 FREE_LOCK(&lk);
4063 return (0);
4064}
4065
4066/*
4067 * Flush all the dirty bitmaps associated with the block device
4068 * before flushing the rest of the dirty blocks so as to reduce
4069 * the number of dependencies that will have to be rolled back.
4070 */
4071void
4072softdep_fsync_mountdev(vp)
4073 struct vnode *vp;
4074{
4075 struct buf *bp, *nbp;
4076 struct worklist *wk;
4077
4078 if (!vn_isdisk(vp, NULL))
4079 panic("softdep_fsync_mountdev: vnode not a disk");
4080 ACQUIRE_LOCK(&lk);
4081 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4082 nbp = TAILQ_NEXT(bp, b_vnbufs);
4083 /*
4084 * If it is already scheduled, skip to the next buffer.
4085 */
4086 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
4087 continue;
4088 if ((bp->b_flags & B_DELWRI) == 0) {
4089 FREE_LOCK(&lk);
4090 panic("softdep_fsync_mountdev: not dirty");
4091 }
4092 /*
4093 * We are only interested in bitmaps with outstanding
4094 * dependencies.
4095 */
4096 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4097 wk->wk_type != D_BMSAFEMAP ||
4098 (bp->b_xflags & BX_BKGRDINPROG)) {
4099 BUF_UNLOCK(bp);
4100 continue;
4101 }
4102 bremfree(bp);
4103 FREE_LOCK(&lk);
4104 (void) bawrite(bp);
4105 ACQUIRE_LOCK(&lk);
4106 /*
4107 * Since we may have slept during the I/O, we need
4108 * to start from a known point.
4109 */
4110 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4111 }
4112 drain_output(vp, 1);
4113 FREE_LOCK(&lk);
4114}
4115
4116/*
4117 * This routine is called when we are trying to synchronously flush a
4118 * file. It must eliminate any filesystem metadata dependencies
4119 * so that the syncing routine can succeed by pushing the dirty blocks
4120 * associated with the file. If any I/O errors occur, they are returned.
4121 */
4122int
4123softdep_sync_metadata(ap)
4124 struct vop_fsync_args /* {
4125 struct vnode *a_vp;
4126 struct ucred *a_cred;
4127 int a_waitfor;
4128 struct proc *a_p;
4129 } */ *ap;
4130{
4131 struct vnode *vp = ap->a_vp;
4132 struct pagedep *pagedep;
4133 struct allocdirect *adp;
4134 struct allocindir *aip;
4135 struct buf *bp, *nbp;
4136 struct worklist *wk;
4137 int i, error, waitfor;
4138
4139 /*
4140 * Check whether this vnode is involved in a filesystem
4141 * that is doing soft dependency processing.
4142 */
4143 if (!vn_isdisk(vp, NULL)) {
4144 if (!DOINGSOFTDEP(vp))
4145 return (0);
4146 } else
4147 if (vp->v_rdev->si_mountpoint == NULL ||
4148 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4149 return (0);
4150 /*
4151 * Ensure that any direct block dependencies have been cleared.
4152 */
4153 ACQUIRE_LOCK(&lk);
4154 if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4155 FREE_LOCK(&lk);
4156 return (error);
4157 }
4158 /*
4159 * For most files, the only metadata dependencies are the
4160 * cylinder group maps that allocate their inode or blocks.
4161 * The block allocation dependencies can be found by traversing
4162 * the dependency lists for any buffers that remain on their
4163 * dirty buffer list. The inode allocation dependency will
4164 * be resolved when the inode is updated with MNT_WAIT.
4165 * This work is done in two passes. The first pass grabs most
4166 * of the buffers and begins asynchronously writing them. The
4167 * only way to wait for these asynchronous writes is to sleep
4168 * on the filesystem vnode which may stay busy for a long time
4169 * if the filesystem is active. So, instead, we make a second
4170 * pass over the dependencies blocking on each write. In the
4171 * usual case we will be blocking against a write that we
4172 * initiated, so when it is done the dependency will have been
4173 * resolved. Thus the second pass is expected to end quickly.
4174 */
4175 waitfor = MNT_NOWAIT;
4176top:
4177 if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
4178 FREE_LOCK(&lk);
4179 return (0);
4180 }
4181 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4182 /* While syncing snapshots, we must allow recursive lookups */
4183 bp->b_lock.lk_flags |= LK_CANRECURSE;
4184loop:
4185 /*
4186 * As we hold the buffer locked, none of its dependencies
4187 * will disappear.
4188 */
4189 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4190 switch (wk->wk_type) {
4191
4192 case D_ALLOCDIRECT:
4193 adp = WK_ALLOCDIRECT(wk);
4194 if (adp->ad_state & DEPCOMPLETE)
4195 continue;
4196 nbp = adp->ad_buf;
4197 if (getdirtybuf(&nbp, waitfor) == 0)
4198 continue;
4199 FREE_LOCK(&lk);
4200 if (waitfor == MNT_NOWAIT) {
4201 bawrite(nbp);
4202 } else if ((error = BUF_WRITE(nbp)) != 0) {
4203 break;
4204 }
4205 ACQUIRE_LOCK(&lk);
4206 continue;
4207
4208 case D_ALLOCINDIR:
4209 aip = WK_ALLOCINDIR(wk);
4210 if (aip->ai_state & DEPCOMPLETE)
4211 continue;
4212 nbp = aip->ai_buf;
4213 if (getdirtybuf(&nbp, waitfor) == 0)
4214 continue;
4215 FREE_LOCK(&lk);
4216 if (waitfor == MNT_NOWAIT) {
4217 bawrite(nbp);
4218 } else if ((error = BUF_WRITE(nbp)) != 0) {
4219 break;
4220 }
4221 ACQUIRE_LOCK(&lk);
4222 continue;
4223
4224 case D_INDIRDEP:
4225 restart:
4226
4227 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4228 if (aip->ai_state & DEPCOMPLETE)
4229 continue;
4230 nbp = aip->ai_buf;
4231 if (getdirtybuf(&nbp, MNT_WAIT) == 0)
4232 goto restart;
4233 FREE_LOCK(&lk);
4234 if ((error = BUF_WRITE(nbp)) != 0) {
4235 break;
4236 }
4237 ACQUIRE_LOCK(&lk);
4238 goto restart;
4239 }
4240 continue;
4241
4242 case D_INODEDEP:
4243 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4244 WK_INODEDEP(wk)->id_ino)) != 0) {
4245 FREE_LOCK(&lk);
4246 break;
4247 }
4248 continue;
4249
4250 case D_PAGEDEP:
4251 /*
4252 * We are trying to sync a directory that may
4253 * have dependencies on its own metadata and/or
4254 * on the inodes of any recently allocated files.
4255 * We walk its diradd lists, pushing out the
4256 * associated inode.
4257 */
4258 pagedep = WK_PAGEDEP(wk);
4259 for (i = 0; i < DAHASHSZ; i++) {
4260 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4261 continue;
4262 if ((error =
4263 flush_pagedep_deps(vp, pagedep->pd_mnt,
4264 &pagedep->pd_diraddhd[i]))) {
4265 FREE_LOCK(&lk);
4266 break;
4267 }
4268 }
4269 continue;
4270
4271 case D_MKDIR:
4272 /*
4273 * This case should never happen if the vnode has
4274 * been properly sync'ed. However, if this function
4275 * is used at a place where the vnode has not yet
4276 * been sync'ed, this dependency can show up. So,
4277 * rather than panic, just flush it.
4278 */
4279 nbp = WK_MKDIR(wk)->md_buf;
4280 if (getdirtybuf(&nbp, waitfor) == 0)
4281 continue;
4282 FREE_LOCK(&lk);
4283 if (waitfor == MNT_NOWAIT) {
4284 bawrite(nbp);
4285 } else if ((error = BUF_WRITE(nbp)) != 0) {
4286 break;
4287 }
4288 ACQUIRE_LOCK(&lk);
4289 continue;
4290
4291 case D_BMSAFEMAP:
4292 /*
4293 * This case should never happen if the vnode has
4294 * been properly sync'ed. However, if this function
4295 * is used at a place where the vnode has not yet
4296 * been sync'ed, this dependency can show up. So,
4297 * rather than panic, just flush it.
4298 */
4299 nbp = WK_BMSAFEMAP(wk)->sm_buf;
4300 if (getdirtybuf(&nbp, waitfor) == 0)
4301 continue;
4302 FREE_LOCK(&lk);
4303 if (waitfor == MNT_NOWAIT) {
4304 bawrite(nbp);
4305 } else if ((error = BUF_WRITE(nbp)) != 0) {
4306 break;
4307 }
4308 ACQUIRE_LOCK(&lk);
4309 continue;
4310
4311 default:
4312 FREE_LOCK(&lk);
4313 panic("softdep_sync_metadata: Unknown type %s",
4314 TYPENAME(wk->wk_type));
4315 /* NOTREACHED */
4316 }
4317 /* We reach here only on error, with the lock released */
4318 if (error == 0)
4319 panic("softdep_sync_metadata: zero error");
4320 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4321 bawrite(bp);
4322 return (error);
4323 }
4324 (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4325 nbp = TAILQ_NEXT(bp, b_vnbufs);
4326 FREE_LOCK(&lk);
4327 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4328 bawrite(bp);
4329 ACQUIRE_LOCK(&lk);
4330 if (nbp != NULL) {
4331 bp = nbp;
4332 goto loop;
4333 }
4334 /*
4335 * We must wait for any I/O in progress to finish so that
4336 * all potential buffers on the dirty list will be visible.
4337 * Once they are all there, proceed with the second pass
4338 * which will wait for the I/O as per above.
4339 */
4340 drain_output(vp, 1);
4341 /*
4342 * The brief unlock is to allow any pent-up dependency
4343 * processing to be done.
4344 */
4345 if (waitfor == MNT_NOWAIT) {
4346 waitfor = MNT_WAIT;
4347 FREE_LOCK(&lk);
4348 ACQUIRE_LOCK(&lk);
4349 goto top;
4350 }
4351
4352 /*
4353 * If we have managed to get rid of all the dirty buffers,
4354 * then we are done. For certain directories and block
4355 * devices, we may need to do further work.
4356 */
4357 if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4358 FREE_LOCK(&lk);
4359 return (0);
4360 }
4361
4362 FREE_LOCK(&lk);
4363 /*
4364 * If we are trying to sync a block device, some of its buffers may
4365 * contain metadata that cannot be written until the contents of some
4366 * partially written files have been written to disk. The only easy
4367 * way to accomplish this is to sync the entire filesystem (luckily
4368 * this happens rarely).
4369 */
4370 if (vn_isdisk(vp, NULL) &&
4371 vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
4372 (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
4373 ap->a_p)) != 0)
4374 return (error);
4375 return (0);
4376}
4377
4378/*
4379 * Flush the dependencies associated with an inodedep.
4380 * Called with splbio blocked.
4381 */
4382static int
4383flush_inodedep_deps(fs, ino)
4384 struct fs *fs;
4385 ino_t ino;
4386{
4387 struct inodedep *inodedep;
4388 struct allocdirect *adp;
4389 int error, waitfor;
4390 struct buf *bp;
4391
4392 /*
4393 * This work is done in two passes. The first pass grabs most
4394 * of the buffers and begins asynchronously writing them. The
4395 * only way to wait for these asynchronous writes is to sleep
4396 * on the filesystem vnode which may stay busy for a long time
4397 * if the filesystem is active. So, instead, we make a second
4398 * pass over the dependencies blocking on each write. In the
4399 * usual case we will be blocking against a write that we
4400 * initiated, so when it is done the dependency will have been
4401 * resolved. Thus the second pass is expected to end quickly.
4402 * We give a brief window at the top of the loop to allow
4403 * any pending I/O to complete.
4404 */
4405 for (waitfor = MNT_NOWAIT; ; ) {
4406 FREE_LOCK(&lk);
4407 ACQUIRE_LOCK(&lk);
4408 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4409 return (0);
4410 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4411 if (adp->ad_state & DEPCOMPLETE)
4412 continue;
4413 bp = adp->ad_buf;
4414 if (getdirtybuf(&bp, waitfor) == 0) {
4415 if (waitfor == MNT_NOWAIT)
4416 continue;
4417 break;
4418 }
4419 FREE_LOCK(&lk);
4420 if (waitfor == MNT_NOWAIT) {
4421 bawrite(bp);
4422 } else if ((error = BUF_WRITE(bp)) != 0) {
4423 ACQUIRE_LOCK(&lk);
4424 return (error);
4425 }
4426 ACQUIRE_LOCK(&lk);
4427 break;
4428 }
4429 if (adp != NULL)
4430 continue;
4431 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
4432 if (adp->ad_state & DEPCOMPLETE)
4433 continue;
4434 bp = adp->ad_buf;
4435 if (getdirtybuf(&bp, waitfor) == 0) {
4436 if (waitfor == MNT_NOWAIT)
4437 continue;
4438 break;
4439 }
4440 FREE_LOCK(&lk);
4441 if (waitfor == MNT_NOWAIT) {
4442 bawrite(bp);
4443 } else if ((error = BUF_WRITE(bp)) != 0) {
4444 ACQUIRE_LOCK(&lk);
4445 return (error);
4446 }
4447 ACQUIRE_LOCK(&lk);
4448 break;
4449 }
4450 if (adp != NULL)
4451 continue;
4452 /*
4453 * If this was the second pass, we are done; otherwise, do pass 2.
4454 */
4455 if (waitfor == MNT_WAIT)
4456 break;
4457 waitfor = MNT_WAIT;
4458 }
4459 /*
4460 * Try freeing inodedep in case all dependencies have been removed.
4461 */
4462 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4463 (void) free_inodedep(inodedep);
4464 return (0);
4465}
4466
4467/*
4468 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4469 * Called with splbio blocked.
4470 */
4471static int
4472flush_pagedep_deps(pvp, mp, diraddhdp)
4473 struct vnode *pvp;
4474 struct mount *mp;
4475 struct diraddhd *diraddhdp;
4476{
4477 struct proc *p = CURPROC; /* XXX */
4478 struct inodedep *inodedep;
4479 struct ufsmount *ump;
4480 struct diradd *dap;
4481 struct vnode *vp;
4482 int gotit, error = 0;
4483 struct buf *bp;
4484 ino_t inum;
4485
4486 ump = VFSTOUFS(mp);
4487 while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4488 /*
4489 * Flush ourselves if this directory entry
4490 * has a MKDIR_PARENT dependency.
4491 */
4492 if (dap->da_state & MKDIR_PARENT) {
4493 FREE_LOCK(&lk);
4494 if ((error = UFS_UPDATE(pvp, 1)) != 0)
4495 break;
4496 ACQUIRE_LOCK(&lk);
4497 /*
4498 * If that cleared dependencies, go on to next.
4499 */
4500 if (dap != LIST_FIRST(diraddhdp))
4501 continue;
4502 if (dap->da_state & MKDIR_PARENT) {
4503 FREE_LOCK(&lk);
4504 panic("flush_pagedep_deps: MKDIR_PARENT");
4505 }
4506 }
4507 /*
4508 * A newly allocated directory must have its "." and
4509 * ".." entries written out before its name can be
4510 * committed in its parent. We do not want or need
4511 * the full semantics of a synchronous VOP_FSYNC as
4512 * that may end up here again, once for each directory
4513 * level in the filesystem. Instead, we push the blocks
4514 * and wait for them to clear. We have to fsync twice
4515 * because the first call may choose to defer blocks
4516 * that still have dependencies, but deferral will
4517 * happen at most once.
4518 */
4519 inum = dap->da_newinum;
4520 if (dap->da_state & MKDIR_BODY) {
4521 FREE_LOCK(&lk);
4522 if ((error = VFS_VGET(mp, inum, &vp)) != 0)
4523 break;
4524 if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
4525 (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4526 vput(vp);
4527 break;
4528 }
4529 drain_output(vp, 0);
4530 vput(vp);
4531 ACQUIRE_LOCK(&lk);
4532 /*
4533 * If that cleared dependencies, go on to next.
4534 */
4535 if (dap != LIST_FIRST(diraddhdp))
4536 continue;
4537 if (dap->da_state & MKDIR_BODY) {
4538 FREE_LOCK(&lk);
4539 panic("flush_pagedep_deps: MKDIR_BODY");
4540 }
4541 }
4542 /*
4543 * Flush the inode on which the directory entry depends.
4544 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4545 * the only remaining dependency is that the updated inode
4546 * count must get pushed to disk. The inode has already
4547 * been pushed into its inode buffer (via VOP_UPDATE) at
4548 * the time of the reference count change. So we need only
4549 * locate that buffer, ensure that there will be no rollback
4550 * caused by a bitmap dependency, then write the inode buffer.
4551 */
4552 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
4553 FREE_LOCK(&lk);
4554 panic("flush_pagedep_deps: lost inode");
4555 }
4556 /*
4557 * If the inode still has bitmap dependencies,
4558 * push them to disk.
4559 */
4560 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4561 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4562 FREE_LOCK(&lk);
4563 if (gotit &&
4564 (error = BUF_WRITE(inodedep->id_buf)) != 0)
4565 break;
4566 ACQUIRE_LOCK(&lk);
4567 if (dap != LIST_FIRST(diraddhdp))
4568 continue;
4569 }
4570 /*
4571 * If the inode is still sitting in a buffer waiting
4572 * to be written, push it to disk.
4573 */
4574 FREE_LOCK(&lk);
4575 if ((error = bread(ump->um_devvp,
4576 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4577 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4578 break;
4579 if ((error = BUF_WRITE(bp)) != 0)
4580 break;
4581 ACQUIRE_LOCK(&lk);
4582 /*
4583 * If we have failed to get rid of all the dependencies
4584 * then something is seriously wrong.
4585 */
4586 if (dap == LIST_FIRST(diraddhdp)) {
4587 FREE_LOCK(&lk);
4588 panic("flush_pagedep_deps: flush failed");
4589 }
4590 }
4591 if (error)
4592 ACQUIRE_LOCK(&lk);
4593 return (error);
4594}
4595
4596/*
4597 * A large burst of file addition or deletion activity can drive the
4598 * memory load excessively high. First attempt to slow things down
4599 * using the techniques below. If that fails, this routine requests
4600 * the offending operations to fall back to running synchronously
4601 * until the memory load returns to a reasonable level.
4602 */
4603int
4604softdep_slowdown(vp)
4605 struct vnode *vp;
4606{
4607 int max_softdeps_hard;
4608
4609 max_softdeps_hard = max_softdeps * 11 / 10;
4610 if (num_dirrem < max_softdeps_hard / 2 &&
4611 num_inodedep < max_softdeps_hard)
4612 return (0);
4613 stat_sync_limit_hit += 1;
4614 return (1);
4615}
4616
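/*
 * Worked example of the thresholds in softdep_slowdown() above, using a
 * purely illustrative max_softdeps of 8000: max_softdeps_hard becomes
 * 8000 * 11 / 10 = 8800, so the routine starts returning 1 (asking its
 * caller to fall back to synchronous operation) once num_dirrem reaches
 * 4400 or num_inodedep reaches 8800.
 */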
4617/*
4618 * If memory utilization has gotten too high, deliberately slow things
4619 * down and speed up the I/O processing.
4620 */
4621static int
4622request_cleanup(resource, islocked)
4623 int resource;
4624 int islocked;
4625{
4626 struct proc *p = CURPROC;
4627
4628 /*
4629 * We never hold up the filesystem syncer process.
4630 */
4631 if (p == filesys_syncer)
4632 return (0);
4633 /*
4634 * First check to see if the work list has gotten backlogged.
4635 * If it has, co-opt this process to help clean up two entries.
4636 * Because this process may hold inodes locked, we cannot
4637 * handle any remove requests that might block on a locked
4638 * inode as that could lead to deadlock.
4639 */
4640 if (num_on_worklist > max_softdeps / 10) {
4641 if (islocked)
4642 FREE_LOCK(&lk);
4643 process_worklist_item(NULL, LK_NOWAIT);
4644 process_worklist_item(NULL, LK_NOWAIT);
4645 stat_worklist_push += 2;
4646 if (islocked)
4647 ACQUIRE_LOCK(&lk);
4648 return(1);
4649 }
4650 /*
4651 * Next, we attempt to speed up the syncer process. If that
4652 * is successful, then we allow the process to continue.
4653 */
4654 if (speedup_syncer())
4655 return(0);
4656 /*
4657 * If we are resource constrained on inode dependencies, try
4658 * flushing some dirty inodes. Otherwise, we are constrained
4659 * by file deletions, so try accelerating flushes of directories
4660 * with removal dependencies. We would like to do the cleanup
4661 * here, but we probably hold an inode locked at this point and
4662 * that might deadlock against one that we try to clean. So,
4663 * the best that we can do is request the syncer daemon to do
4664 * the cleanup for us.
4665 */
4666 switch (resource) {
4667
4668 case FLUSH_INODES:
4669 stat_ino_limit_push += 1;
4670 req_clear_inodedeps += 1;
4671 stat_countp = &stat_ino_limit_hit;
4672 break;
4673
4674 case FLUSH_REMOVE:
4675 stat_blk_limit_push += 1;
4676 req_clear_remove += 1;
4677 stat_countp = &stat_blk_limit_hit;
4678 break;
4679
4680 default:
4681 if (islocked)
4682 FREE_LOCK(&lk);
4683 panic("request_cleanup: unknown type");
4684 }
4685 /*
4686 * Hopefully the syncer daemon will catch up and awaken us.
4687 * We wait at most tickdelay before proceeding in any case.
4688 */
4689 if (islocked == 0)
4690 ACQUIRE_LOCK(&lk);
4691 proc_waiting += 1;
4692 if (handle.callout == NULL)
4693 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4694 FREE_LOCK_INTERLOCKED(&lk);
4695 (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4696 ACQUIRE_LOCK_INTERLOCKED(&lk);
4697 proc_waiting -= 1;
4698 if (islocked == 0)
4699 FREE_LOCK(&lk);
4700 return (1);
4701}
4702
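/*
 * A small worked note on the pause in request_cleanup() above: the callout
 * is always scheduled at least two ticks out ("tickdelay > 2 ? tickdelay :
 * 2"), so even with tickdelay tuned down to 0 or 1 the sleeping process
 * still gives the syncer a brief window before it is awakened, at the
 * latest by pause_timer() below.
 */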
4703/*
4704 * Awaken processes pausing in request_cleanup and clear the callout
4705 * handle to indicate that there is no longer a timer running.
4706 */
4707void
4708pause_timer(arg)
4709 void *arg;
4710{
4711
4712 *stat_countp += 1;
4713 wakeup_one(&proc_waiting);
4714 if (proc_waiting > 0)
4715 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4716 else
4717 handle.callout = NULL;
4718}
4719
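/*
 * Note on pause_timer() above: wakeup_one() releases a single sleeper per
 * expiry, and the callout re-arms itself while proc_waiting is still
 * positive, so a burst of paused processes is released one per timeout
 * interval rather than all at once.
 */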
4720/*
4721 * Flush out a directory with at least one removal dependency in an effort to
4722 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4723 */
4724static void
4725clear_remove(p)
4726 struct proc *p;
4727{
4728 struct pagedep_hashhead *pagedephd;
4729 struct pagedep *pagedep;
4730 static int next = 0;
4731 struct mount *mp;
4732 struct vnode *vp;
4733 int error, cnt;
4734 ino_t ino;
4735
4736 ACQUIRE_LOCK(&lk);
4737 for (cnt = 0; cnt < pagedep_hash; cnt++) {
4738 pagedephd = &pagedep_hashtbl[next++];
4739 if (next >= pagedep_hash)
4740 next = 0;
4741 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
4742 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4743 continue;
4744 mp = pagedep->pd_mnt;
4745 ino = pagedep->pd_ino;
4746 FREE_LOCK(&lk);
4747 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4748 continue;
4749 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4750 softdep_error("clear_remove: vget", error);
4751 vn_finished_write(mp);
4752 return;
4753 }
4754 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4755 softdep_error("clear_remove: fsync", error);
4756 drain_output(vp, 0);
4757 vput(vp);
4758 vn_finished_write(mp);
4759 return;
4760 }
4761 }
4762 FREE_LOCK(&lk);
4763}
4764
4765/*
4766 * Clear out a block of dirty inodes in an effort to reduce
4767 * the number of inodedep dependency structures.
4768 */
4769static void
4770clear_inodedeps(p)
4771 struct proc *p;
4772{
4773 struct inodedep_hashhead *inodedephd;
4774 struct inodedep *inodedep;
4775 static int next = 0;
4776 struct mount *mp;
4777 struct vnode *vp;
4778 struct fs *fs;
4779 int error, cnt;
4780 ino_t firstino, lastino, ino;
4781
4782 ACQUIRE_LOCK(&lk);
4783 /*
4784 * Pick an arbitrary inode dependency to be cleared.
4785 * We will then gather up all the inodes in its block
4786 * that have dependencies and flush them out.
4787 */
4788 for (cnt = 0; cnt < inodedep_hash; cnt++) {
4789 inodedephd = &inodedep_hashtbl[next++];
4790 if (next >= inodedep_hash)
4791 next = 0;
4792 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4793 break;
4794 }
4795 if (inodedep == NULL)
4796 return;
4797 /*
4798 * Ugly code to find mount point given pointer to superblock.
4799 */
4800 fs = inodedep->id_fs;
4801 TAILQ_FOREACH(mp, &mountlist, mnt_list)
4802 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4803 break;
4804 /*
4805 * Find the last inode in the block with dependencies.
4806 */
4807 firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4808 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4809 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4810 break;
4811 /*
4812 * Asynchronously push all but the last inode with dependencies.
4813 * Synchronously push the last inode with dependencies to ensure
4814 * that the inode block gets written to free up the inodedeps.
4815 */
4816 for (ino = firstino; ino <= lastino; ino++) {
4817 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4818 continue;
4819 FREE_LOCK(&lk);
4820 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4821 continue;
4822 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4823 softdep_error("clear_inodedeps: vget", error);
4824 vn_finished_write(mp);
4825 return;
4826 }
4827 if (ino == lastino) {
4828 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
4829 softdep_error("clear_inodedeps: fsync1", error);
4830 } else {
4831 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4832 softdep_error("clear_inodedeps: fsync2", error);
4833 drain_output(vp, 0);
4834 }
4835 vput(vp);
4836 vn_finished_write(mp);
4837 ACQUIRE_LOCK(&lk);
4838 }
4839 FREE_LOCK(&lk);
4840}
4841
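/*
 * Worked example of the block-of-inodes arithmetic in clear_inodedeps()
 * above, assuming an illustrative INOPB(fs) of 64 (a power of two, as the
 * mask trick requires): for id_ino = 1234, firstino = 1234 & ~63 = 1216,
 * the scan for lastino starts at 1216 + 63 = 1279, and inodes 1216..1279,
 * which share one inode block, are flushed together.
 */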
4842/*
4843 * Function to determine if the buffer has outstanding dependencies
4844 * that will cause a roll-back if the buffer is written. If wantcount
4845 * is set, return number of dependencies, otherwise just yes or no.
4846 */
4847static int
4848softdep_count_dependencies(bp, wantcount)
4849 struct buf *bp;
4850 int wantcount;
4851{
4852 struct worklist *wk;
4853 struct inodedep *inodedep;
4854 struct indirdep *indirdep;
4855 struct allocindir *aip;
4856 struct pagedep *pagedep;
4857 struct diradd *dap;
4858 int i, retval;
4859
4860 retval = 0;
4861 ACQUIRE_LOCK(&lk);
4862 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4863 switch (wk->wk_type) {
4864
4865 case D_INODEDEP:
4866 inodedep = WK_INODEDEP(wk);
4867 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4868 /* bitmap allocation dependency */
4869 retval += 1;
4870 if (!wantcount)
4871 goto out;
4872 }
4873 if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4874 /* direct block pointer dependency */
4875 retval += 1;
4876 if (!wantcount)
4877 goto out;
4878 }
4879 continue;
4880
4881 case D_INDIRDEP:
4882 indirdep = WK_INDIRDEP(wk);
4883
4884 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
4885 /* indirect block pointer dependency */
4886 retval += 1;
4887 if (!wantcount)
4888 goto out;
4889 }
4890 continue;
4891
4892 case D_PAGEDEP:
4893 pagedep = WK_PAGEDEP(wk);
4894 for (i = 0; i < DAHASHSZ; i++) {
4895
4896 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
4897 /* directory entry dependency */
4898 retval += 1;
4899 if (!wantcount)
4900 goto out;
4901 }
4902 }
4903 continue;
4904
4905 case D_BMSAFEMAP:
4906 case D_ALLOCDIRECT:
4907 case D_ALLOCINDIR:
4908 case D_MKDIR:
4909 /* never a dependency on these blocks */
4910 continue;
4911
4912 default:
4913 FREE_LOCK(&lk);
4914 panic("softdep_count_dependencies: Unexpected type %s",
4915 TYPENAME(wk->wk_type));
4916 /* NOTREACHED */
4917 }
4918 }
4919out:
4920 FREE_LOCK(&lk);
4921 return (retval);
4922}
4923
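/*
 * Usage note for softdep_count_dependencies() above: with wantcount == 0
 * the scan stops at the first dependency found, so the return value is
 * simply zero or nonzero; with wantcount != 0 every dependency hanging off
 * the buffer is walked and the total number of would-be rollbacks is
 * returned.
 */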
4924/*
4925 * Acquire exclusive access to a buffer.
4926 * Must be called with splbio blocked.
4927 * Return 1 if buffer was acquired.
4928 */
4929static int
4930getdirtybuf(bpp, waitfor)
4931 struct buf **bpp;
4932 int waitfor;
4933{
4934 struct buf *bp;
4935
4936 for (;;) {
4937 if ((bp = *bpp) == NULL)
4938 return (0);
4939 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4940 if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4941 break;
4942 BUF_UNLOCK(bp);
4943 if (waitfor != MNT_WAIT)
4944 return (0);
4945 bp->b_xflags |= BX_BKGRDWAIT;
4946 FREE_LOCK_INTERLOCKED(&lk);
4947 tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
4948 ACQUIRE_LOCK_INTERLOCKED(&lk);
4949 continue;
4950 }
4951 if (waitfor != MNT_WAIT)
4952 return (0);
4953 FREE_LOCK_INTERLOCKED(&lk);
4954 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
4955 panic("getdirtybuf: inconsistent lock");
4956 ACQUIRE_LOCK_INTERLOCKED(&lk);
4957 }
4958 if ((bp->b_flags & B_DELWRI) == 0) {
4959 BUF_UNLOCK(bp);
4960 return (0);
4961 }
4962 bremfree(bp);
4963 return (1);
4964}
4965
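/*
 * A condensed sketch of how getdirtybuf() is used elsewhere in this file
 * (see softdep_update_inodeblock() and flush_pagedep_deps() above): the
 * soft updates lock is held across the lookup and dropped around the
 * actual write of the acquired buffer.
 */
#if 0
	ACQUIRE_LOCK(&lk);
	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	FREE_LOCK(&lk);
	if (gotit && (error = BUF_WRITE(inodedep->id_buf)) != 0)
		softdep_error("example caller: bwrite", error);
#endif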
4966/*
4967 * Wait for pending output on a vnode to complete.
4968 * Must be called with vnode locked.
4969 */
4970static void
4971drain_output(vp, islocked)
4972 struct vnode *vp;
4973 int islocked;
4974{
4975
4976 if (!islocked)
4977 ACQUIRE_LOCK(&lk);
4978 while (vp->v_numoutput) {
4979 vp->v_flag |= VBWAIT;
4980 FREE_LOCK_INTERLOCKED(&lk);
4981 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
4982 ACQUIRE_LOCK_INTERLOCKED(&lk);
4983 }
4984 if (!islocked)
4985 FREE_LOCK(&lk);
4986}
4987
4988/*
4989 * Called whenever a buffer that is being invalidated or reallocated
4990 * contains dependencies. This should only happen if an I/O error has
4991 * occurred. The routine is called with the buffer locked.
4992 */
4993static void
4994softdep_deallocate_dependencies(bp)
4995 struct buf *bp;
4996{
4997
4998 if ((bp->b_ioflags & BIO_ERROR) == 0)
4999 panic("softdep_deallocate_dependencies: dangling deps");
5000 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5001 panic("softdep_deallocate_dependencies: unrecovered I/O error");
5002}
5003
5004/*
5005 * Function to handle asynchronous write errors in the filesystem.
5006 */
5007void
5008softdep_error(func, error)
5009 char *func;
5010 int error;
5011{
5012
5013 /* XXX should do something better! */
5014 printf("%s: got error %d while accessing filesystem\n", func, error);
5015}