1/*	$NetBSD: vfs_wapbl.c,v 1.51.2.1 2012/05/07 03:01:13 riz Exp $	*/
2
3/*-
4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * This implements file system independent write ahead filesystem logging.
34 */
35
36#define WAPBL_INTERNAL
37
38#include <sys/cdefs.h>
39__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.51.2.1 2012/05/07 03:01:13 riz Exp $");
40
41#include <sys/param.h>
42#include <sys/bitops.h>
43
44#ifdef _KERNEL
45#include <sys/param.h>
46#include <sys/namei.h>
47#include <sys/proc.h>
48#include <sys/sysctl.h>
49#include <sys/uio.h>
50#include <sys/vnode.h>
51#include <sys/file.h>
52#include <sys/module.h>
53#include <sys/resourcevar.h>
54#include <sys/conf.h>
55#include <sys/mount.h>
56#include <sys/kernel.h>
57#include <sys/kauth.h>
58#include <sys/mutex.h>
59#include <sys/atomic.h>
60#include <sys/wapbl.h>
61#include <sys/wapbl_replay.h>
62
63#include <miscfs/specfs/specdev.h>
64
65#define	wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
66#define	wapbl_free(a, s) kmem_free((a), (s))
67#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
68
69static struct sysctllog *wapbl_sysctl;
70static int wapbl_flush_disk_cache = 1;
71static int wapbl_verbose_commit = 0;
72
73#else /* !_KERNEL */
74#include <assert.h>
75#include <errno.h>
76#include <stdio.h>
77#include <stdbool.h>
78#include <stdlib.h>
79#include <string.h>
80
81#include <sys/time.h>
82#include <sys/wapbl.h>
83#include <sys/wapbl_replay.h>
84
85#define	KDASSERT(x) assert(x)
86#define	KASSERT(x) assert(x)
87#define	wapbl_alloc(s) malloc(s)
88#define	wapbl_free(a, s) free(a)
89#define	wapbl_calloc(n, s) calloc((n), (s))
90
91#endif /* !_KERNEL */
92
93/*
94 * INTERNAL DATA STRUCTURES
95 */
96
97/*
98 * This structure holds per-mount log information.
99 *
100 * Legend:	a = atomic access only
101 *		r = read-only after init
102 *		l = rwlock held
103 *		m = mutex held
104 *		lm = rwlock held writing or mutex held
105 *		u = unlocked access ok
106 *		b = bufcache_lock held
107 */
108struct wapbl {
109	struct vnode *wl_logvp;	/* r:	log here */
110	struct vnode *wl_devvp;	/* r:	log on this device */
111	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
112	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
113	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
114					device */
115	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
116					filesystem device */
117
118	unsigned wl_lock_count;	/* m:	Count of transactions in progress */
119
120	size_t wl_circ_size; 	/* r:	Number of bytes in buffer of log */
121	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */
122
123	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
124	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */
125
126	off_t wl_head;		/* l:	Byte offset of log head */
127	off_t wl_tail;		/* l:	Byte offset of log tail */
128	/*
129	 * head == tail == 0 means log is empty
130	 * head == tail != 0 means log is full
131	 * see assertions in wapbl_advance() for other boundary conditions.
132	 * only truncate moves the tail, except when flush sets it to
133	 * wl_header_size only flush moves the head, except when truncate
134	 * sets it to 0.
135	 */
136
137	struct wapbl_wc_header *wl_wc_header;	/* l	*/
138	void *wl_wc_scratch;	/* l:	scratch space (XXX: por que?!?) */
139
140	kmutex_t wl_mtx;	/* u:	short-term lock */
141	krwlock_t wl_rwlock;	/* u:	File system transaction lock */
142
143	/*
144	 * Must be held while accessing
145	 * wl_count or wl_bufs or head or tail
146	 */
147
148	/*
149	 * Callback called from within the flush routine to flush any extra
150	 * bits.  Note that flush may be skipped without calling this if
151	 * there are no outstanding buffers in the transaction.
152	 */
153#if _KERNEL
154	wapbl_flush_fn_t wl_flush;	/* r	*/
155	wapbl_flush_fn_t wl_flush_abort;/* r	*/
156#endif
157
158	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
159	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
160	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */
161
162	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */
163
164	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
165	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
166						reclamation by truncate */
167	int wl_error_count;	/* m:	# of wl_entries with errors */
168	size_t wl_reserved_bytes; /* never truncate log smaller than this */
169
170#ifdef WAPBL_DEBUG_BUFBYTES
171	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
172#endif
173
174	daddr_t *wl_deallocblks;/* lm:	address of block */
175	int *wl_dealloclens;	/* lm:	size of block */
176	int wl_dealloccnt;	/* lm:	total count */
177	int wl_dealloclim;	/* l:	max count */
178
179	/* hashtable of inode numbers for allocated but unlinked inodes */
180	/* synch ??? */
181	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
182	u_long wl_inohashmask;
183	int wl_inohashcnt;
184
185	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
186						   accounting */
187
188	u_char *wl_buffer;	/* l:   buffer for wapbl_buffered_write() */
189	daddr_t wl_buffer_dblk;	/* l:   buffer disk block address */
190	size_t wl_buffer_used;	/* l:   buffer current use */
191};
192
193#ifdef WAPBL_DEBUG_PRINT
194int wapbl_debug_print = WAPBL_DEBUG_PRINT;
195#endif
196
197/****************************************************************/
198#ifdef _KERNEL
199
200#ifdef WAPBL_DEBUG
201struct wapbl *wapbl_debug_wl;
202#endif
203
204static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
205static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
206static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
207static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
208#endif /* _KERNEL */
209
210static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
211
212static inline size_t wapbl_space_free(size_t avail, off_t head,
213	off_t tail);
214static inline size_t wapbl_space_used(size_t avail, off_t head,
215	off_t tail);
216
217#ifdef _KERNEL
218
219static struct pool wapbl_entry_pool;
220
221#define	WAPBL_INODETRK_SIZE 83
222static int wapbl_ino_pool_refcount;
223static struct pool wapbl_ino_pool;
224struct wapbl_ino {
225	LIST_ENTRY(wapbl_ino) wi_hash;
226	ino_t wi_ino;
227	mode_t wi_mode;
228};
229
230static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
231static void wapbl_inodetrk_free(struct wapbl *wl);
232static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
233
234static size_t wapbl_transaction_len(struct wapbl *wl);
235static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
236
237#if 0
238int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
239#endif
240
241static int wapbl_replay_isopen1(struct wapbl_replay *);
242
243/*
244 * This is useful for debugging.  If set, the log will
245 * only be truncated when necessary.
246 */
247int wapbl_lazy_truncate = 0;
248
249struct wapbl_ops wapbl_ops = {
250	.wo_wapbl_discard	= wapbl_discard,
251	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
252	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
253	.wo_wapbl_replay_read	= wapbl_replay_read,
254	.wo_wapbl_add_buf	= wapbl_add_buf,
255	.wo_wapbl_remove_buf	= wapbl_remove_buf,
256	.wo_wapbl_resize_buf	= wapbl_resize_buf,
257	.wo_wapbl_begin		= wapbl_begin,
258	.wo_wapbl_end		= wapbl_end,
259	.wo_wapbl_junlock_assert= wapbl_junlock_assert,
260
261	/* XXX: the following is only used to say "this is a wapbl buf" */
262	.wo_wapbl_biodone	= wapbl_biodone,
263};
264
265static int
266wapbl_sysctl_init(void)
267{
268	int rv;
269	const struct sysctlnode *rnode, *cnode;
270
271	wapbl_sysctl = NULL;
272
273	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
274		       CTLFLAG_PERMANENT,
275		       CTLTYPE_NODE, "vfs", NULL,
276		       NULL, 0, NULL, 0,
277		       CTL_VFS, CTL_EOL);
278	if (rv)
279		return rv;
280
281	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
282		       CTLFLAG_PERMANENT,
283		       CTLTYPE_NODE, "wapbl",
284		       SYSCTL_DESCR("WAPBL journaling options"),
285		       NULL, 0, NULL, 0,
286		       CTL_CREATE, CTL_EOL);
287	if (rv)
288		return rv;
289
290	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
291		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
292		       CTLTYPE_INT, "flush_disk_cache",
293		       SYSCTL_DESCR("flush disk cache"),
294		       NULL, 0, &wapbl_flush_disk_cache, 0,
295		       CTL_CREATE, CTL_EOL);
296	if (rv)
297		return rv;
298
299	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
300		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
301		       CTLTYPE_INT, "verbose_commit",
302		       SYSCTL_DESCR("show time and size of wapbl log commits"),
303		       NULL, 0, &wapbl_verbose_commit, 0,
304		       CTL_CREATE, CTL_EOL);
305	return rv;
306}
307
308static void
309wapbl_init(void)
310{
311
312	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
313	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
314
315	wapbl_sysctl_init();
316}
317
318#ifdef notyet
319static int
320wapbl_fini(bool interface)
321{
322
323	if (aio_sysctl != NULL)
324		 sysctl_teardown(&aio_sysctl);
325
326	pool_destroy(&wapbl_entry_pool);
327
328	return 0;
329}
330#endif
331
332static int
333wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
334{
335	int error, i;
336
337	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
338	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
339
340	/*
341	 * Its only valid to reuse the replay log if its
342	 * the same as the new log we just opened.
343	 */
344	KDASSERT(!wapbl_replay_isopen(wr));
345	KASSERT(wl->wl_devvp->v_type == VBLK);
346	KASSERT(wr->wr_devvp->v_type == VBLK);
347	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
348	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
349	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
350	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
351	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
352	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
353
354	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
355
356	for (i = 0; i < wr->wr_inodescnt; i++)
357		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
358		    wr->wr_inodes[i].wr_imode);
359
360	/* Make sure new transaction won't overwrite old inodes list */
361	KDASSERT(wapbl_transaction_len(wl) <=
362	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
363	    wr->wr_inodestail));
364
365	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
366	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
367	    wapbl_transaction_len(wl);
368
369	error = wapbl_write_inodes(wl, &wl->wl_head);
370	if (error)
371		return error;
372
373	KASSERT(wl->wl_head != wl->wl_tail);
374	KASSERT(wl->wl_head != 0);
375
376	return 0;
377}
378
379int
380wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
381	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
382	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
383{
384	struct wapbl *wl;
385	struct vnode *devvp;
386	daddr_t logpbn;
387	int error;
388	int log_dev_bshift = ilog2(blksize);
389	int fs_dev_bshift = log_dev_bshift;
390	int run;
391
392	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
393	    " count=%zu blksize=%zu\n", vp, off, count, blksize));
394
395	if (log_dev_bshift > fs_dev_bshift) {
396		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
397			("wapbl: log device's block size cannot be larger "
398			 "than filesystem's\n"));
399		/*
400		 * Not currently implemented, although it could be if
401		 * needed someday.
402		 */
403		return ENOSYS;
404	}
405
406	if (off < 0)
407		return EINVAL;
408
409	if (blksize < DEV_BSIZE)
410		return EINVAL;
411	if (blksize % DEV_BSIZE)
412		return EINVAL;
413
414	/* XXXTODO: verify that the full load is writable */
415
416	/*
417	 * XXX check for minimum log size
418	 * minimum is governed by minimum amount of space
419	 * to complete a transaction. (probably truncate)
420	 */
421	/* XXX for now pick something minimal */
422	if ((count * blksize) < MAXPHYS) {
423		return ENOSPC;
424	}
425
426	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
427		return error;
428	}
429
430	wl = wapbl_calloc(1, sizeof(*wl));
431	rw_init(&wl->wl_rwlock);
432	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
433	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
434	LIST_INIT(&wl->wl_bufs);
435	SIMPLEQ_INIT(&wl->wl_entries);
436
437	wl->wl_logvp = vp;
438	wl->wl_devvp = devvp;
439	wl->wl_mount = mp;
440	wl->wl_logpbn = logpbn;
441	wl->wl_log_dev_bshift = log_dev_bshift;
442	wl->wl_fs_dev_bshift = fs_dev_bshift;
443
444	wl->wl_flush = flushfn;
445	wl->wl_flush_abort = flushabortfn;
446
447	/* Reserve two log device blocks for the commit headers */
448	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
449	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
450	/* truncate the log usage to a multiple of log_dev_bshift */
451	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
452	wl->wl_circ_size <<= wl->wl_log_dev_bshift;
453
454	/*
455	 * wl_bufbytes_max limits the size of the in memory transaction space.
456	 * - Since buffers are allocated and accounted for in units of
457	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
458	 *   (i.e. 1<<PAGE_SHIFT)
459	 * - Since the log device has to be written in units of
460	 *   1<<wl_log_dev_bshift it is required to be a mulitple of
461	 *   1<<wl_log_dev_bshift.
462	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
463	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
464	 * Therefore it must be multiple of the least common multiple of those
465	 * three quantities.  Fortunately, all of those quantities are
466	 * guaranteed to be a power of two, and the least common multiple of
467	 * a set of numbers which are all powers of two is simply the maximum
468	 * of those numbers.  Finally, the maximum logarithm of a power of two
469	 * is the same as the log of the maximum power of two.  So we can do
470	 * the following operations to size wl_bufbytes_max:
471	 */
472
473	/* XXX fix actual number of pages reserved per filesystem. */
474	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
475
476	/* Round wl_bufbytes_max to the largest power of two constraint */
477	wl->wl_bufbytes_max >>= PAGE_SHIFT;
478	wl->wl_bufbytes_max <<= PAGE_SHIFT;
479	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
480	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
481	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
482	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
483
484	/* XXX maybe use filesystem fragment size instead of 1024 */
485	/* XXX fix actual number of buffers reserved per filesystem. */
486	wl->wl_bufcount_max = (nbuf / 2) * 1024;
487
488	/* XXX tie this into resource estimation */
489	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
490
491	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
492	    wl->wl_dealloclim);
493	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
494	    wl->wl_dealloclim);
495
496	wl->wl_buffer = wapbl_alloc(MAXPHYS);
497	wl->wl_buffer_used = 0;
498
499	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
500
501	/* Initialize the commit header */
502	{
503		struct wapbl_wc_header *wc;
504		size_t len = 1 << wl->wl_log_dev_bshift;
505		wc = wapbl_calloc(1, len);
506		wc->wc_type = WAPBL_WC_HEADER;
507		wc->wc_len = len;
508		wc->wc_circ_off = wl->wl_circ_off;
509		wc->wc_circ_size = wl->wl_circ_size;
510		/* XXX wc->wc_fsid */
511		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
512		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
513		wl->wl_wc_header = wc;
514		wl->wl_wc_scratch = wapbl_alloc(len);
515	}
516
517	/*
518	 * if there was an existing set of unlinked but
519	 * allocated inodes, preserve it in the new
520	 * log.
521	 */
522	if (wr && wr->wr_inodescnt) {
523		error = wapbl_start_flush_inodes(wl, wr);
524		if (error)
525			goto errout;
526	}
527
528	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
529	if (error) {
530		goto errout;
531	}
532
533	*wlp = wl;
534#if defined(WAPBL_DEBUG)
535	wapbl_debug_wl = wl;
536#endif
537
538	return 0;
539 errout:
540	wapbl_discard(wl);
541	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
542	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
543	wapbl_free(wl->wl_deallocblks,
544	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
545	wapbl_free(wl->wl_dealloclens,
546	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
547	wapbl_free(wl->wl_buffer, MAXPHYS);
548	wapbl_inodetrk_free(wl);
549	wapbl_free(wl, sizeof(*wl));
550
551	return error;
552}
553
554/*
555 * Like wapbl_flush, only discards the transaction
556 * completely
557 */
558
559void
560wapbl_discard(struct wapbl *wl)
561{
562	struct wapbl_entry *we;
563	struct buf *bp;
564	int i;
565
566	/*
567	 * XXX we may consider using upgrade here
568	 * if we want to call flush from inside a transaction
569	 */
570	rw_enter(&wl->wl_rwlock, RW_WRITER);
571	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
572	    wl->wl_dealloccnt);
573
574#ifdef WAPBL_DEBUG_PRINT
575	{
576		pid_t pid = -1;
577		lwpid_t lid = -1;
578		if (curproc)
579			pid = curproc->p_pid;
580		if (curlwp)
581			lid = curlwp->l_lid;
582#ifdef WAPBL_DEBUG_BUFBYTES
583		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
584		    ("wapbl_discard: thread %d.%d discarding "
585		    "transaction\n"
586		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
587		    "deallocs=%d inodes=%d\n"
588		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
589		    "unsynced=%zu\n",
590		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
591		    wl->wl_bcount, wl->wl_dealloccnt,
592		    wl->wl_inohashcnt, wl->wl_error_count,
593		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
594		    wl->wl_unsynced_bufbytes));
595		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
596			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
597			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
598			     "error = %d, unsynced = %zu\n",
599			     we->we_bufcount, we->we_reclaimable_bytes,
600			     we->we_error, we->we_unsynced_bufbytes));
601		}
602#else /* !WAPBL_DEBUG_BUFBYTES */
603		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
604		    ("wapbl_discard: thread %d.%d discarding transaction\n"
605		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
606		    "deallocs=%d inodes=%d\n"
607		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
608		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
609		    wl->wl_bcount, wl->wl_dealloccnt,
610		    wl->wl_inohashcnt, wl->wl_error_count,
611		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
612		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
613			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
614			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
615			     "error = %d\n",
616			     we->we_bufcount, we->we_reclaimable_bytes,
617			     we->we_error));
618		}
619#endif /* !WAPBL_DEBUG_BUFBYTES */
620	}
621#endif /* WAPBL_DEBUG_PRINT */
622
623	for (i = 0; i <= wl->wl_inohashmask; i++) {
624		struct wapbl_ino_head *wih;
625		struct wapbl_ino *wi;
626
627		wih = &wl->wl_inohash[i];
628		while ((wi = LIST_FIRST(wih)) != NULL) {
629			LIST_REMOVE(wi, wi_hash);
630			pool_put(&wapbl_ino_pool, wi);
631			KASSERT(wl->wl_inohashcnt > 0);
632			wl->wl_inohashcnt--;
633		}
634	}
635
636	/*
637	 * clean buffer list
638	 */
639	mutex_enter(&bufcache_lock);
640	mutex_enter(&wl->wl_mtx);
641	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
642		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
643			/*
644			 * The buffer will be unlocked and
645			 * removed from the transaction in brelse
646			 */
647			mutex_exit(&wl->wl_mtx);
648			brelsel(bp, 0);
649			mutex_enter(&wl->wl_mtx);
650		}
651	}
652	mutex_exit(&wl->wl_mtx);
653	mutex_exit(&bufcache_lock);
654
655	/*
656	 * Remove references to this wl from wl_entries, free any which
657	 * no longer have buffers, others will be freed in wapbl_biodone
658	 * when they no longer have any buffers.
659	 */
660	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
661		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
662		/* XXX should we be accumulating wl_error_count
663		 * and increasing reclaimable bytes ? */
664		we->we_wapbl = NULL;
665		if (we->we_bufcount == 0) {
666#ifdef WAPBL_DEBUG_BUFBYTES
667			KASSERT(we->we_unsynced_bufbytes == 0);
668#endif
669			pool_put(&wapbl_entry_pool, we);
670		}
671	}
672
673	/* Discard list of deallocs */
674	wl->wl_dealloccnt = 0;
675	/* XXX should we clear wl_reserved_bytes? */
676
677	KASSERT(wl->wl_bufbytes == 0);
678	KASSERT(wl->wl_bcount == 0);
679	KASSERT(wl->wl_bufcount == 0);
680	KASSERT(LIST_EMPTY(&wl->wl_bufs));
681	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
682	KASSERT(wl->wl_inohashcnt == 0);
683
684	rw_exit(&wl->wl_rwlock);
685}
686
687int
688wapbl_stop(struct wapbl *wl, int force)
689{
690	struct vnode *vp;
691	int error;
692
693	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
694	error = wapbl_flush(wl, 1);
695	if (error) {
696		if (force)
697			wapbl_discard(wl);
698		else
699			return error;
700	}
701
702	/* Unlinked inodes persist after a flush */
703	if (wl->wl_inohashcnt) {
704		if (force) {
705			wapbl_discard(wl);
706		} else {
707			return EBUSY;
708		}
709	}
710
711	KASSERT(wl->wl_bufbytes == 0);
712	KASSERT(wl->wl_bcount == 0);
713	KASSERT(wl->wl_bufcount == 0);
714	KASSERT(LIST_EMPTY(&wl->wl_bufs));
715	KASSERT(wl->wl_dealloccnt == 0);
716	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
717	KASSERT(wl->wl_inohashcnt == 0);
718
719	vp = wl->wl_logvp;
720
721	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
722	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
723	wapbl_free(wl->wl_deallocblks,
724	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
725	wapbl_free(wl->wl_dealloclens,
726	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
727	wapbl_free(wl->wl_buffer, MAXPHYS);
728	wapbl_inodetrk_free(wl);
729
730	cv_destroy(&wl->wl_reclaimable_cv);
731	mutex_destroy(&wl->wl_mtx);
732	rw_destroy(&wl->wl_rwlock);
733	wapbl_free(wl, sizeof(*wl));
734
735	return 0;
736}
737
738static int
739wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
740{
741	struct pstats *pstats = curlwp->l_proc->p_stats;
742	struct buf *bp;
743	int error;
744
745	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
746	KASSERT(devvp->v_type == VBLK);
747
748	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
749		mutex_enter(devvp->v_interlock);
750		devvp->v_numoutput++;
751		mutex_exit(devvp->v_interlock);
752		pstats->p_ru.ru_oublock++;
753	} else {
754		pstats->p_ru.ru_inblock++;
755	}
756
757	bp = getiobuf(devvp, true);
758	bp->b_flags = flags;
759	bp->b_cflags = BC_BUSY; /* silly & dubious */
760	bp->b_dev = devvp->v_rdev;
761	bp->b_data = data;
762	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
763	bp->b_blkno = pbn;
764	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
765
766	WAPBL_PRINTF(WAPBL_PRINT_IO,
767	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
768	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
769	    bp->b_blkno, bp->b_dev));
770
771	VOP_STRATEGY(devvp, bp);
772
773	error = biowait(bp);
774	putiobuf(bp);
775
776	if (error) {
777		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
778		    ("wapbl_doio: %s %zu bytes at block %" PRId64
779		    " on dev 0x%"PRIx64" failed with error %d\n",
780		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
781		     "write" : "read"),
782		    len, pbn, devvp->v_rdev, error));
783	}
784
785	return error;
786}
787
788int
789wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
790{
791
792	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
793}
794
795int
796wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
797{
798
799	return wapbl_doio(data, len, devvp, pbn, B_READ);
800}
801
802/*
803 * Flush buffered data if any.
804 */
805static int
806wapbl_buffered_flush(struct wapbl *wl)
807{
808	int error;
809
810	if (wl->wl_buffer_used == 0)
811		return 0;
812
813	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
814	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
815	wl->wl_buffer_used = 0;
816
817	return error;
818}
819
820/*
821 * Write data to the log.
822 * Try to coalesce writes and emit MAXPHYS aligned blocks.
823 */
824static int
825wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
826{
827	int error;
828	size_t resid;
829
830	/*
831	 * If not adjacent to buffered data flush first.  Disk block
832	 * address is always valid for non-empty buffer.
833	 */
834	if (wl->wl_buffer_used > 0 &&
835	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
836		error = wapbl_buffered_flush(wl);
837		if (error)
838			return error;
839	}
840	/*
841	 * If this write goes to an empty buffer we have to
842	 * save the disk block address first.
843	 */
844	if (wl->wl_buffer_used == 0)
845		wl->wl_buffer_dblk = pbn;
846	/*
847	 * Remaining space so this buffer ends on a MAXPHYS boundary.
848	 *
849	 * Cannot become less or equal zero as the buffer would have been
850	 * flushed on the last call then.
851	 */
852	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
853	    wl->wl_buffer_used;
854	KASSERT(resid > 0);
855	KASSERT(dbtob(btodb(resid)) == resid);
856	if (len >= resid) {
857		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
858		wl->wl_buffer_used += resid;
859		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
860		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
861		data = (uint8_t *)data + resid;
862		len -= resid;
863		wl->wl_buffer_dblk = pbn + btodb(resid);
864		wl->wl_buffer_used = 0;
865		if (error)
866			return error;
867	}
868	KASSERT(len < MAXPHYS);
869	if (len > 0) {
870		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
871		wl->wl_buffer_used += len;
872	}
873
874	return 0;
875}
876
877/*
878 * Off is byte offset returns new offset for next write
879 * handles log wraparound
880 */
881static int
882wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
883{
884	size_t slen;
885	off_t off = *offp;
886	int error;
887	daddr_t pbn;
888
889	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
890	    wl->wl_log_dev_bshift) == len);
891
892	if (off < wl->wl_circ_off)
893		off = wl->wl_circ_off;
894	slen = wl->wl_circ_off + wl->wl_circ_size - off;
895	if (slen < len) {
896		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
897#ifdef _KERNEL
898		pbn = btodb(pbn << wl->wl_log_dev_bshift);
899#endif
900		error = wapbl_buffered_write(data, slen, wl, pbn);
901		if (error)
902			return error;
903		data = (uint8_t *)data + slen;
904		len -= slen;
905		off = wl->wl_circ_off;
906	}
907	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
908#ifdef _KERNEL
909	pbn = btodb(pbn << wl->wl_log_dev_bshift);
910#endif
911	error = wapbl_buffered_write(data, len, wl, pbn);
912	if (error)
913		return error;
914	off += len;
915	if (off >= wl->wl_circ_off + wl->wl_circ_size)
916		off = wl->wl_circ_off;
917	*offp = off;
918	return 0;
919}
920
921/****************************************************************/
922
923int
924wapbl_begin(struct wapbl *wl, const char *file, int line)
925{
926	int doflush;
927	unsigned lockcount;
928
929	KDASSERT(wl);
930
931	/*
932	 * XXX this needs to be made much more sophisticated.
933	 * perhaps each wapbl_begin could reserve a specified
934	 * number of buffers and bytes.
935	 */
936	mutex_enter(&wl->wl_mtx);
937	lockcount = wl->wl_lock_count;
938	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
939		   wl->wl_bufbytes_max / 2) ||
940		  ((wl->wl_bufcount + (lockcount * 10)) >
941		   wl->wl_bufcount_max / 2) ||
942		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
943		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
944	mutex_exit(&wl->wl_mtx);
945
946	if (doflush) {
947		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
948		    ("force flush lockcnt=%d bufbytes=%zu "
949		    "(max=%zu) bufcount=%zu (max=%zu) "
950		    "dealloccnt %d (lim=%d)\n",
951		    lockcount, wl->wl_bufbytes,
952		    wl->wl_bufbytes_max, wl->wl_bufcount,
953		    wl->wl_bufcount_max,
954		    wl->wl_dealloccnt, wl->wl_dealloclim));
955	}
956
957	if (doflush) {
958		int error = wapbl_flush(wl, 0);
959		if (error)
960			return error;
961	}
962
963	rw_enter(&wl->wl_rwlock, RW_READER);
964	mutex_enter(&wl->wl_mtx);
965	wl->wl_lock_count++;
966	mutex_exit(&wl->wl_mtx);
967
968#if defined(WAPBL_DEBUG_PRINT)
969	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
970	    ("wapbl_begin thread %d.%d with bufcount=%zu "
971	    "bufbytes=%zu bcount=%zu at %s:%d\n",
972	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
973	    wl->wl_bufbytes, wl->wl_bcount, file, line));
974#endif
975
976	return 0;
977}
978
979void
980wapbl_end(struct wapbl *wl)
981{
982
983#if defined(WAPBL_DEBUG_PRINT)
984	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
985	     ("wapbl_end thread %d.%d with bufcount=%zu "
986	      "bufbytes=%zu bcount=%zu\n",
987	      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
988	      wl->wl_bufbytes, wl->wl_bcount));
989#endif
990
991#ifdef DIAGNOSTIC
992	size_t flushsize = wapbl_transaction_len(wl);
993	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
994		/*
995		 * XXX this could be handled more gracefully, perhaps place
996		 * only a partial transaction in the log and allow the
997		 * remaining to flush without the protection of the journal.
998		 */
999		panic("wapbl_end: current transaction too big to flush\n");
1000	}
1001#endif
1002
1003	mutex_enter(&wl->wl_mtx);
1004	KASSERT(wl->wl_lock_count > 0);
1005	wl->wl_lock_count--;
1006	mutex_exit(&wl->wl_mtx);
1007
1008	rw_exit(&wl->wl_rwlock);
1009}
1010
1011void
1012wapbl_add_buf(struct wapbl *wl, struct buf * bp)
1013{
1014
1015	KASSERT(bp->b_cflags & BC_BUSY);
1016	KASSERT(bp->b_vp);
1017
1018	wapbl_jlock_assert(wl);
1019
1020#if 0
1021	/*
1022	 * XXX this might be an issue for swapfiles.
1023	 * see uvm_swap.c:1702
1024	 *
1025	 * XXX2 why require it then?  leap of semantics?
1026	 */
1027	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
1028#endif
1029
1030	mutex_enter(&wl->wl_mtx);
1031	if (bp->b_flags & B_LOCKED) {
1032		LIST_REMOVE(bp, b_wapbllist);
1033		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
1034		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
1035		    "with %d bytes %d bcount\n",
1036		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1037		    bp->b_bcount));
1038	} else {
1039		/* unlocked by dirty buffers shouldn't exist */
1040		KASSERT(!(bp->b_oflags & BO_DELWRI));
1041		wl->wl_bufbytes += bp->b_bufsize;
1042		wl->wl_bcount += bp->b_bcount;
1043		wl->wl_bufcount++;
1044		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1045		   ("wapbl_add_buf thread %d.%d adding buf %p "
1046		    "with %d bytes %d bcount\n",
1047		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1048		    bp->b_bcount));
1049	}
1050	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
1051	mutex_exit(&wl->wl_mtx);
1052
1053	bp->b_flags |= B_LOCKED;
1054}
1055
1056static void
1057wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
1058{
1059
1060	KASSERT(mutex_owned(&wl->wl_mtx));
1061	KASSERT(bp->b_cflags & BC_BUSY);
1062	wapbl_jlock_assert(wl);
1063
1064#if 0
1065	/*
1066	 * XXX this might be an issue for swapfiles.
1067	 * see uvm_swap.c:1725
1068	 *
1069	 * XXXdeux: see above
1070	 */
1071	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
1072#endif
1073	KASSERT(bp->b_flags & B_LOCKED);
1074
1075	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1076	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
1077	    "%d bytes %d bcount\n",
1078	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
1079
1080	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
1081	wl->wl_bufbytes -= bp->b_bufsize;
1082	KASSERT(wl->wl_bcount >= bp->b_bcount);
1083	wl->wl_bcount -= bp->b_bcount;
1084	KASSERT(wl->wl_bufcount > 0);
1085	wl->wl_bufcount--;
1086	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1087	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1088	LIST_REMOVE(bp, b_wapbllist);
1089
1090	bp->b_flags &= ~B_LOCKED;
1091}
1092
1093/* called from brelsel() in vfs_bio among other places */
1094void
1095wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
1096{
1097
1098	mutex_enter(&wl->wl_mtx);
1099	wapbl_remove_buf_locked(wl, bp);
1100	mutex_exit(&wl->wl_mtx);
1101}
1102
1103void
1104wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
1105{
1106
1107	KASSERT(bp->b_cflags & BC_BUSY);
1108
1109	/*
1110	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
1111	 * is not for a transaction?  if so, why is this called in the
1112	 * first place?
1113	 */
1114	if (bp->b_flags & B_LOCKED) {
1115		mutex_enter(&wl->wl_mtx);
1116		wl->wl_bufbytes += bp->b_bufsize - oldsz;
1117		wl->wl_bcount += bp->b_bcount - oldcnt;
1118		mutex_exit(&wl->wl_mtx);
1119	}
1120}
1121
1122#endif /* _KERNEL */
1123
1124/****************************************************************/
1125/* Some utility inlines */
1126
1127/* This is used to advance the pointer at old to new value at old+delta */
1128static inline off_t
1129wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
1130{
1131	off_t new;
1132
1133	/* Define acceptable ranges for inputs. */
1134	KASSERT(delta <= (size_t)size);
1135	KASSERT((old == 0) || ((size_t)old >= off));
1136	KASSERT(old < (off_t)(size + off));
1137
1138	if ((old == 0) && (delta != 0))
1139		new = off + delta;
1140	else if ((old + delta) < (size + off))
1141		new = old + delta;
1142	else
1143		new = (old + delta) - size;
1144
1145	/* Note some interesting axioms */
1146	KASSERT((delta != 0) || (new == old));
1147	KASSERT((delta == 0) || (new != 0));
1148	KASSERT((delta != (size)) || (new == old));
1149
1150	/* Define acceptable ranges for output. */
1151	KASSERT((new == 0) || ((size_t)new >= off));
1152	KASSERT((size_t)new < (size + off));
1153	return new;
1154}
1155
1156static inline size_t
1157wapbl_space_used(size_t avail, off_t head, off_t tail)
1158{
1159
1160	if (tail == 0) {
1161		KASSERT(head == 0);
1162		return 0;
1163	}
1164	return ((head + (avail - 1) - tail) % avail) + 1;
1165}
1166
1167static inline size_t
1168wapbl_space_free(size_t avail, off_t head, off_t tail)
1169{
1170
1171	return avail - wapbl_space_used(avail, head, tail);
1172}
1173
1174static inline void
1175wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1176		   off_t *tailp)
1177{
1178	off_t head = *headp;
1179	off_t tail = *tailp;
1180
1181	KASSERT(delta <= wapbl_space_free(size, head, tail));
1182	head = wapbl_advance(size, off, head, delta);
1183	if ((tail == 0) && (head != 0))
1184		tail = off;
1185	*headp = head;
1186	*tailp = tail;
1187}
1188
1189static inline void
1190wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1191		   off_t *tailp)
1192{
1193	off_t head = *headp;
1194	off_t tail = *tailp;
1195
1196	KASSERT(delta <= wapbl_space_used(size, head, tail));
1197	tail = wapbl_advance(size, off, tail, delta);
1198	if (head == tail) {
1199		head = tail = 0;
1200	}
1201	*headp = head;
1202	*tailp = tail;
1203}
1204
1205#ifdef _KERNEL
1206
1207/****************************************************************/
1208
1209/*
1210 * Remove transactions whose buffers are completely flushed to disk.
1211 * Will block until at least minfree space is available.
1212 * only intended to be called from inside wapbl_flush and therefore
1213 * does not protect against commit races with itself or with flush.
1214 */
1215static int
1216wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
1217{
1218	size_t delta;
1219	size_t avail;
1220	off_t head;
1221	off_t tail;
1222	int error = 0;
1223
1224	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1225	KASSERT(rw_write_held(&wl->wl_rwlock));
1226
1227	mutex_enter(&wl->wl_mtx);
1228
1229	/*
1230	 * First check to see if we have to do a commit
1231	 * at all.
1232	 */
1233	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1234	if (minfree < avail) {
1235		mutex_exit(&wl->wl_mtx);
1236		return 0;
1237	}
1238	minfree -= avail;
1239	while ((wl->wl_error_count == 0) &&
1240	    (wl->wl_reclaimable_bytes < minfree)) {
1241        	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1242                   ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1243		    "minfree=%zd\n",
1244                    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1245		    minfree));
1246
1247		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1248	}
1249	if (wl->wl_reclaimable_bytes < minfree) {
1250		KASSERT(wl->wl_error_count);
1251		/* XXX maybe get actual error from buffer instead someday? */
1252		error = EIO;
1253	}
1254	head = wl->wl_head;
1255	tail = wl->wl_tail;
1256	delta = wl->wl_reclaimable_bytes;
1257
1258	/* If all of of the entries are flushed, then be sure to keep
1259	 * the reserved bytes reserved.  Watch out for discarded transactions,
1260	 * which could leave more bytes reserved than are reclaimable.
1261	 */
1262	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1263	    (delta >= wl->wl_reserved_bytes)) {
1264		delta -= wl->wl_reserved_bytes;
1265	}
1266	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1267			   &tail);
1268	KDASSERT(wl->wl_reserved_bytes <=
1269		wapbl_space_used(wl->wl_circ_size, head, tail));
1270	mutex_exit(&wl->wl_mtx);
1271
1272	if (error)
1273		return error;
1274
1275	if (waitonly)
1276		return 0;
1277
1278	/*
1279	 * This is where head, tail and delta are unprotected
1280	 * from races against itself or flush.  This is ok since
1281	 * we only call this routine from inside flush itself.
1282	 *
1283	 * XXX: how can it race against itself when accessed only
1284	 * from behind the write-locked rwlock?
1285	 */
1286	error = wapbl_write_commit(wl, head, tail);
1287	if (error)
1288		return error;
1289
1290	wl->wl_head = head;
1291	wl->wl_tail = tail;
1292
1293	mutex_enter(&wl->wl_mtx);
1294	KASSERT(wl->wl_reclaimable_bytes >= delta);
1295	wl->wl_reclaimable_bytes -= delta;
1296	mutex_exit(&wl->wl_mtx);
1297	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1298	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1299	    curproc->p_pid, curlwp->l_lid, delta));
1300
1301	return 0;
1302}
1303
1304/****************************************************************/
1305
1306void
1307wapbl_biodone(struct buf *bp)
1308{
1309	struct wapbl_entry *we = bp->b_private;
1310	struct wapbl *wl = we->we_wapbl;
1311#ifdef WAPBL_DEBUG_BUFBYTES
1312	const int bufsize = bp->b_bufsize;
1313#endif
1314
1315	/*
1316	 * Handle possible flushing of buffers after log has been
1317	 * decomissioned.
1318	 */
1319	if (!wl) {
1320		KASSERT(we->we_bufcount > 0);
1321		we->we_bufcount--;
1322#ifdef WAPBL_DEBUG_BUFBYTES
1323		KASSERT(we->we_unsynced_bufbytes >= bufsize);
1324		we->we_unsynced_bufbytes -= bufsize;
1325#endif
1326
1327		if (we->we_bufcount == 0) {
1328#ifdef WAPBL_DEBUG_BUFBYTES
1329			KASSERT(we->we_unsynced_bufbytes == 0);
1330#endif
1331			pool_put(&wapbl_entry_pool, we);
1332		}
1333
1334		brelse(bp, 0);
1335		return;
1336	}
1337
1338#ifdef ohbother
1339	KDASSERT(bp->b_oflags & BO_DONE);
1340	KDASSERT(!(bp->b_oflags & BO_DELWRI));
1341	KDASSERT(bp->b_flags & B_ASYNC);
1342	KDASSERT(bp->b_cflags & BC_BUSY);
1343	KDASSERT(!(bp->b_flags & B_LOCKED));
1344	KDASSERT(!(bp->b_flags & B_READ));
1345	KDASSERT(!(bp->b_cflags & BC_INVAL));
1346	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
1347#endif
1348
1349	if (bp->b_error) {
1350#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1351		/*
1352		 * XXXpooka: interfaces not fully updated
1353		 * Note: this was not enabled in the original patch
1354		 * against netbsd4 either.  I don't know if comment
1355		 * above is true or not.
1356		 */
1357
1358		/*
1359		 * If an error occurs, report the error and leave the
1360		 * buffer as a delayed write on the LRU queue.
1361		 * restarting the write would likely result in
1362		 * an error spinloop, so let it be done harmlessly
1363		 * by the syncer.
1364		 */
1365		bp->b_flags &= ~(B_DONE);
1366		simple_unlock(&bp->b_interlock);
1367
1368		if (we->we_error == 0) {
1369			mutex_enter(&wl->wl_mtx);
1370			wl->wl_error_count++;
1371			mutex_exit(&wl->wl_mtx);
1372			cv_broadcast(&wl->wl_reclaimable_cv);
1373		}
1374		we->we_error = bp->b_error;
1375		bp->b_error = 0;
1376		brelse(bp);
1377		return;
1378#else
1379		/* For now, just mark the log permanently errored out */
1380
1381		mutex_enter(&wl->wl_mtx);
1382		if (wl->wl_error_count == 0) {
1383			wl->wl_error_count++;
1384			cv_broadcast(&wl->wl_reclaimable_cv);
1385		}
1386		mutex_exit(&wl->wl_mtx);
1387#endif
1388	}
1389
1390	/*
1391	 * Release the buffer here. wapbl_flush() may wait for the
1392	 * log to become empty and we better unbusy the buffer before
1393	 * wapbl_flush() returns.
1394	 */
1395	brelse(bp, 0);
1396
1397	mutex_enter(&wl->wl_mtx);
1398
1399	KASSERT(we->we_bufcount > 0);
1400	we->we_bufcount--;
1401#ifdef WAPBL_DEBUG_BUFBYTES
1402	KASSERT(we->we_unsynced_bufbytes >= bufsize);
1403	we->we_unsynced_bufbytes -= bufsize;
1404	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
1405	wl->wl_unsynced_bufbytes -= bufsize;
1406#endif
1407
1408	/*
1409	 * If the current transaction can be reclaimed, start
1410	 * at the beginning and reclaim any consecutive reclaimable
1411	 * transactions.  If we successfully reclaim anything,
1412	 * then wakeup anyone waiting for the reclaim.
1413	 */
1414	if (we->we_bufcount == 0) {
1415		size_t delta = 0;
1416		int errcnt = 0;
1417#ifdef WAPBL_DEBUG_BUFBYTES
1418		KDASSERT(we->we_unsynced_bufbytes == 0);
1419#endif
1420		/*
1421		 * clear any posted error, since the buffer it came from
1422		 * has successfully flushed by now
1423		 */
1424		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1425		       (we->we_bufcount == 0)) {
1426			delta += we->we_reclaimable_bytes;
1427			if (we->we_error)
1428				errcnt++;
1429			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1430			pool_put(&wapbl_entry_pool, we);
1431		}
1432
1433		if (delta) {
1434			wl->wl_reclaimable_bytes += delta;
1435			KASSERT(wl->wl_error_count >= errcnt);
1436			wl->wl_error_count -= errcnt;
1437			cv_broadcast(&wl->wl_reclaimable_cv);
1438		}
1439	}
1440
1441	mutex_exit(&wl->wl_mtx);
1442}
1443
1444/*
1445 * Write transactions to disk + start I/O for contents
1446 */
1447int
1448wapbl_flush(struct wapbl *wl, int waitfor)
1449{
1450	struct buf *bp;
1451	struct wapbl_entry *we;
1452	off_t off;
1453	off_t head;
1454	off_t tail;
1455	size_t delta = 0;
1456	size_t flushsize;
1457	size_t reserved;
1458	int error = 0;
1459
1460	/*
1461	 * Do a quick check to see if a full flush can be skipped
1462	 * This assumes that the flush callback does not need to be called
1463	 * unless there are other outstanding bufs.
1464	 */
1465	if (!waitfor) {
1466		size_t nbufs;
1467		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
1468						   protect the KASSERTS */
1469		nbufs = wl->wl_bufcount;
1470		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1471		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1472		mutex_exit(&wl->wl_mtx);
1473		if (nbufs == 0)
1474			return 0;
1475	}
1476
1477	/*
1478	 * XXX we may consider using LK_UPGRADE here
1479	 * if we want to call flush from inside a transaction
1480	 */
1481	rw_enter(&wl->wl_rwlock, RW_WRITER);
1482	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
1483	    wl->wl_dealloccnt);
1484
1485	/*
1486	 * Now that we are fully locked and flushed,
1487	 * do another check for nothing to do.
1488	 */
1489	if (wl->wl_bufcount == 0) {
1490		goto out;
1491	}
1492
1493#if 0
1494	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1495		     ("wapbl_flush thread %d.%d flushing entries with "
1496		      "bufcount=%zu bufbytes=%zu\n",
1497		      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1498		      wl->wl_bufbytes));
1499#endif
1500
1501	/* Calculate amount of space needed to flush */
1502	flushsize = wapbl_transaction_len(wl);
1503	if (wapbl_verbose_commit) {
1504		struct timespec ts;
1505		getnanotime(&ts);
1506		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1507		    __func__, (long long)ts.tv_sec,
1508		    (long)ts.tv_nsec, flushsize);
1509	}
1510
1511	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1512		/*
1513		 * XXX this could be handled more gracefully, perhaps place
1514		 * only a partial transaction in the log and allow the
1515		 * remaining to flush without the protection of the journal.
1516		 */
1517		panic("wapbl_flush: current transaction too big to flush\n");
1518	}
1519
1520	error = wapbl_truncate(wl, flushsize, 0);
1521	if (error)
1522		goto out2;
1523
1524	off = wl->wl_head;
1525	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
1526	                      (off < wl->wl_circ_off + wl->wl_circ_size)));
1527	error = wapbl_write_blocks(wl, &off);
1528	if (error)
1529		goto out2;
1530	error = wapbl_write_revocations(wl, &off);
1531	if (error)
1532		goto out2;
1533	error = wapbl_write_inodes(wl, &off);
1534	if (error)
1535		goto out2;
1536
1537	reserved = 0;
1538	if (wl->wl_inohashcnt)
1539		reserved = wapbl_transaction_inodes_len(wl);
1540
1541	head = wl->wl_head;
1542	tail = wl->wl_tail;
1543
1544	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1545	    &head, &tail);
1546#ifdef WAPBL_DEBUG
1547	if (head != off) {
1548		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1549		      " off=%"PRIdMAX" flush=%zu\n",
1550		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1551		      flushsize);
1552	}
1553#else
1554	KASSERT(head == off);
1555#endif
1556
1557	/* Opportunistically move the tail forward if we can */
1558	if (!wapbl_lazy_truncate) {
1559		mutex_enter(&wl->wl_mtx);
1560		delta = wl->wl_reclaimable_bytes;
1561		mutex_exit(&wl->wl_mtx);
1562		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1563		    &head, &tail);
1564	}
1565
1566	error = wapbl_write_commit(wl, head, tail);
1567	if (error)
1568		goto out2;
1569
1570	we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1571
1572#ifdef WAPBL_DEBUG_BUFBYTES
1573	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1574		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1575		 " unsynced=%zu"
1576		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1577		 "inodes=%d\n",
1578		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1579		 wapbl_space_used(wl->wl_circ_size, head, tail),
1580		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1581		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1582		 wl->wl_inohashcnt));
1583#else
1584	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1585		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1586		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1587		 "inodes=%d\n",
1588		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1589		 wapbl_space_used(wl->wl_circ_size, head, tail),
1590		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1591		 wl->wl_dealloccnt, wl->wl_inohashcnt));
1592#endif
1593
1594
1595	mutex_enter(&bufcache_lock);
1596	mutex_enter(&wl->wl_mtx);
1597
1598	wl->wl_reserved_bytes = reserved;
1599	wl->wl_head = head;
1600	wl->wl_tail = tail;
1601	KASSERT(wl->wl_reclaimable_bytes >= delta);
1602	wl->wl_reclaimable_bytes -= delta;
1603	wl->wl_dealloccnt = 0;
1604#ifdef WAPBL_DEBUG_BUFBYTES
1605	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1606#endif
1607
1608	we->we_wapbl = wl;
1609	we->we_bufcount = wl->wl_bufcount;
1610#ifdef WAPBL_DEBUG_BUFBYTES
1611	we->we_unsynced_bufbytes = wl->wl_bufbytes;
1612#endif
1613	we->we_reclaimable_bytes = flushsize;
1614	we->we_error = 0;
1615	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1616
1617	/*
1618	 * this flushes bufs in reverse order than they were queued
1619	 * it shouldn't matter, but if we care we could use TAILQ instead.
1620	 * XXX Note they will get put on the lru queue when they flush
1621	 * so we might actually want to change this to preserve order.
1622	 */
1623	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1624		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1625			continue;
1626		}
1627		bp->b_iodone = wapbl_biodone;
1628		bp->b_private = we;
1629		bremfree(bp);
1630		wapbl_remove_buf_locked(wl, bp);
1631		mutex_exit(&wl->wl_mtx);
1632		mutex_exit(&bufcache_lock);
1633		bawrite(bp);
1634		mutex_enter(&bufcache_lock);
1635		mutex_enter(&wl->wl_mtx);
1636	}
1637	mutex_exit(&wl->wl_mtx);
1638	mutex_exit(&bufcache_lock);
1639
1640#if 0
1641	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1642		     ("wapbl_flush thread %d.%d done flushing entries...\n",
1643		     curproc->p_pid, curlwp->l_lid));
1644#endif
1645
1646 out:
1647
1648	/*
1649	 * If the waitfor flag is set, don't return until everything is
1650	 * fully flushed and the on disk log is empty.
1651	 */
1652	if (waitfor) {
1653		error = wapbl_truncate(wl, wl->wl_circ_size -
1654			wl->wl_reserved_bytes, wapbl_lazy_truncate);
1655	}
1656
1657 out2:
1658	if (error) {
1659		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
1660		    wl->wl_dealloclens, wl->wl_dealloccnt);
1661	}
1662
1663#ifdef WAPBL_DEBUG_PRINT
1664	if (error) {
1665		pid_t pid = -1;
1666		lwpid_t lid = -1;
1667		if (curproc)
1668			pid = curproc->p_pid;
1669		if (curlwp)
1670			lid = curlwp->l_lid;
1671		mutex_enter(&wl->wl_mtx);
1672#ifdef WAPBL_DEBUG_BUFBYTES
1673		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1674		    ("wapbl_flush: thread %d.%d aborted flush: "
1675		    "error = %d\n"
1676		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1677		    "deallocs=%d inodes=%d\n"
1678		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1679		    "unsynced=%zu\n",
1680		    pid, lid, error, wl->wl_bufcount,
1681		    wl->wl_bufbytes, wl->wl_bcount,
1682		    wl->wl_dealloccnt, wl->wl_inohashcnt,
1683		    wl->wl_error_count, wl->wl_reclaimable_bytes,
1684		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1685		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1686			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1687			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
1688			     "error = %d, unsynced = %zu\n",
1689			     we->we_bufcount, we->we_reclaimable_bytes,
1690			     we->we_error, we->we_unsynced_bufbytes));
1691		}
1692#else
1693		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1694		    ("wapbl_flush: thread %d.%d aborted flush: "
1695		     "error = %d\n"
1696		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1697		     "deallocs=%d inodes=%d\n"
1698		     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1699		     pid, lid, error, wl->wl_bufcount,
1700		     wl->wl_bufbytes, wl->wl_bcount,
1701		     wl->wl_dealloccnt, wl->wl_inohashcnt,
1702		     wl->wl_error_count, wl->wl_reclaimable_bytes,
1703		     wl->wl_reserved_bytes));
1704		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1705			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1706			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
1707			     "error = %d\n", we->we_bufcount,
1708			     we->we_reclaimable_bytes, we->we_error));
1709		}
1710#endif
1711		mutex_exit(&wl->wl_mtx);
1712	}
1713#endif
1714
1715	rw_exit(&wl->wl_rwlock);
1716	return error;
1717}
1718
1719/****************************************************************/
1720
1721void
1722wapbl_jlock_assert(struct wapbl *wl)
1723{
1724
1725	KASSERT(rw_lock_held(&wl->wl_rwlock));
1726}
1727
1728void
1729wapbl_junlock_assert(struct wapbl *wl)
1730{
1731
1732	KASSERT(!rw_write_held(&wl->wl_rwlock));
1733}
1734
1735/****************************************************************/
1736
1737/* locks missing */
1738void
1739wapbl_print(struct wapbl *wl,
1740		int full,
1741		void (*pr)(const char *, ...))
1742{
1743	struct buf *bp;
1744	struct wapbl_entry *we;
1745	(*pr)("wapbl %p", wl);
1746	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1747	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1748	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1749	      wl->wl_circ_size, wl->wl_circ_off,
1750	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1751	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1752	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1753#ifdef WAPBL_DEBUG_BUFBYTES
1754	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1755	      "reserved = %zu errcnt = %d unsynced = %zu\n",
1756	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1757	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1758				wl->wl_error_count, wl->wl_unsynced_bufbytes);
1759#else
1760	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1761	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1762	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1763				wl->wl_error_count);
1764#endif
1765	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1766	      wl->wl_dealloccnt, wl->wl_dealloclim);
1767	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1768	      wl->wl_inohashcnt, wl->wl_inohashmask);
1769	(*pr)("entries:\n");
1770	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1771#ifdef WAPBL_DEBUG_BUFBYTES
1772		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1773		      "unsynced = %zu\n",
1774		      we->we_bufcount, we->we_reclaimable_bytes,
1775		      we->we_error, we->we_unsynced_bufbytes);
1776#else
1777		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1778		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1779#endif
1780	}
1781	if (full) {
1782		int cnt = 0;
1783		(*pr)("bufs =");
1784		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1785			if (!LIST_NEXT(bp, b_wapbllist)) {
1786				(*pr)(" %p", bp);
1787			} else if ((++cnt % 6) == 0) {
1788				(*pr)(" %p,\n\t", bp);
1789			} else {
1790				(*pr)(" %p,", bp);
1791			}
1792		}
1793		(*pr)("\n");
1794
1795		(*pr)("dealloced blks = ");
1796		{
1797			int i;
1798			cnt = 0;
1799			for (i = 0; i < wl->wl_dealloccnt; i++) {
1800				(*pr)(" %"PRId64":%d,",
1801				      wl->wl_deallocblks[i],
1802				      wl->wl_dealloclens[i]);
1803				if ((++cnt % 4) == 0) {
1804					(*pr)("\n\t");
1805				}
1806			}
1807		}
1808		(*pr)("\n");
1809
1810		(*pr)("registered inodes = ");
1811		{
1812			int i;
1813			cnt = 0;
1814			for (i = 0; i <= wl->wl_inohashmask; i++) {
1815				struct wapbl_ino_head *wih;
1816				struct wapbl_ino *wi;
1817
1818				wih = &wl->wl_inohash[i];
1819				LIST_FOREACH(wi, wih, wi_hash) {
1820					if (wi->wi_ino == 0)
1821						continue;
1822					(*pr)(" %"PRId32"/0%06"PRIo32",",
1823					    wi->wi_ino, wi->wi_mode);
1824					if ((++cnt % 4) == 0) {
1825						(*pr)("\n\t");
1826					}
1827				}
1828			}
1829			(*pr)("\n");
1830		}
1831	}
1832}
1833
1834#if defined(WAPBL_DEBUG) || defined(DDB)
1835void
1836wapbl_dump(struct wapbl *wl)
1837{
1838#if defined(WAPBL_DEBUG)
1839	if (!wl)
1840		wl = wapbl_debug_wl;
1841#endif
1842	if (!wl)
1843		return;
1844	wapbl_print(wl, 1, printf);
1845}
1846#endif
1847
1848/****************************************************************/
1849
1850void
1851wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
1852{
1853
1854	wapbl_jlock_assert(wl);
1855
1856	mutex_enter(&wl->wl_mtx);
1857	/* XXX should eventually instead tie this into resource estimation */
1858	/*
1859	 * XXX this panic needs locking/mutex analysis and the
1860	 * ability to cope with the failure.
1861	 */
1862	/* XXX this XXX doesn't have enough XXX */
1863	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
1864		panic("wapbl_register_deallocation: out of resources");
1865
1866	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
1867	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
1868	wl->wl_dealloccnt++;
1869	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1870	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
1871	mutex_exit(&wl->wl_mtx);
1872}
1873
1874/****************************************************************/
1875
1876static void
1877wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1878{
1879
1880	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1881	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1882		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1883		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1884	}
1885}
1886
1887static void
1888wapbl_inodetrk_free(struct wapbl *wl)
1889{
1890
1891	/* XXX this KASSERT needs locking/mutex analysis */
1892	KASSERT(wl->wl_inohashcnt == 0);
1893	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1894	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1895		pool_destroy(&wapbl_ino_pool);
1896	}
1897}
1898
1899static struct wapbl_ino *
1900wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1901{
1902	struct wapbl_ino_head *wih;
1903	struct wapbl_ino *wi;
1904
1905	KASSERT(mutex_owned(&wl->wl_mtx));
1906
1907	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1908	LIST_FOREACH(wi, wih, wi_hash) {
1909		if (ino == wi->wi_ino)
1910			return wi;
1911	}
1912	return 0;
1913}
1914
1915void
1916wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1917{
1918	struct wapbl_ino_head *wih;
1919	struct wapbl_ino *wi;
1920
1921	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1922
1923	mutex_enter(&wl->wl_mtx);
1924	if (wapbl_inodetrk_get(wl, ino) == NULL) {
1925		wi->wi_ino = ino;
1926		wi->wi_mode = mode;
1927		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1928		LIST_INSERT_HEAD(wih, wi, wi_hash);
1929		wl->wl_inohashcnt++;
1930		WAPBL_PRINTF(WAPBL_PRINT_INODE,
1931		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
1932		mutex_exit(&wl->wl_mtx);
1933	} else {
1934		mutex_exit(&wl->wl_mtx);
1935		pool_put(&wapbl_ino_pool, wi);
1936	}
1937}
1938
1939void
1940wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1941{
1942	struct wapbl_ino *wi;
1943
1944	mutex_enter(&wl->wl_mtx);
1945	wi = wapbl_inodetrk_get(wl, ino);
1946	if (wi) {
1947		WAPBL_PRINTF(WAPBL_PRINT_INODE,
1948		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1949		KASSERT(wl->wl_inohashcnt > 0);
1950		wl->wl_inohashcnt--;
1951		LIST_REMOVE(wi, wi_hash);
1952		mutex_exit(&wl->wl_mtx);
1953
1954		pool_put(&wapbl_ino_pool, wi);
1955	} else {
1956		mutex_exit(&wl->wl_mtx);
1957	}
1958}
1959
1960/****************************************************************/
1961
1962static inline size_t
1963wapbl_transaction_inodes_len(struct wapbl *wl)
1964{
1965	int blocklen = 1<<wl->wl_log_dev_bshift;
1966	int iph;
1967
1968	/* Calculate number of inodes described in a inodelist header */
1969	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1970	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1971
1972	KASSERT(iph > 0);
1973
1974	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1975}
1976
1977
1978/* Calculate amount of space a transaction will take on disk */
1979static size_t
1980wapbl_transaction_len(struct wapbl *wl)
1981{
1982	int blocklen = 1<<wl->wl_log_dev_bshift;
1983	size_t len;
1984	int bph;
1985
1986	/* Calculate number of blocks described in a blocklist header */
1987	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1988	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1989
1990	KASSERT(bph > 0);
1991
1992	len = wl->wl_bcount;
1993	len += howmany(wl->wl_bufcount, bph) * blocklen;
1994	len += howmany(wl->wl_dealloccnt, bph) * blocklen;
1995	len += wapbl_transaction_inodes_len(wl);
1996
1997	return len;
1998}
1999
2000/*
2001 * wapbl_cache_sync: issue DIOCCACHESYNC
2002 */
2003static int
2004wapbl_cache_sync(struct wapbl *wl, const char *msg)
2005{
2006	const bool verbose = wapbl_verbose_commit >= 2;
2007	struct bintime start_time;
2008	int force = 1;
2009	int error;
2010
2011	if (!wapbl_flush_disk_cache) {
2012		return 0;
2013	}
2014	if (verbose) {
2015		bintime(&start_time);
2016	}
2017	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2018	    FWRITE, FSCRED);
2019	if (error) {
2020		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2021		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
2022		    "returned %d\n", wl->wl_devvp->v_rdev, error));
2023	}
2024	if (verbose) {
2025		struct bintime d;
2026		struct timespec ts;
2027
2028		bintime(&d);
2029		bintime_sub(&d, &start_time);
2030		bintime2timespec(&d, &ts);
2031		printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2032		    msg, (uintmax_t)wl->wl_devvp->v_rdev,
2033		    (uintmax_t)ts.tv_sec, ts.tv_nsec);
2034	}
2035	return error;
2036}
2037
2038/*
2039 * Perform commit operation
2040 *
2041 * Note that generation number incrementation needs to
2042 * be protected against racing with other invocations
2043 * of wapbl_write_commit.  This is ok since this routine
2044 * is only invoked from wapbl_flush
2045 */
2046static int
2047wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2048{
2049	struct wapbl_wc_header *wc = wl->wl_wc_header;
2050	struct timespec ts;
2051	int error;
2052	daddr_t pbn;
2053
2054	error = wapbl_buffered_flush(wl);
2055	if (error)
2056		return error;
2057	/*
2058	 * flush disk cache to ensure that blocks we've written are actually
2059	 * written to the stable storage before the commit header.
2060	 *
2061	 * XXX Calc checksum here, instead we do this for now
2062	 */
2063	wapbl_cache_sync(wl, "1");
2064
2065	wc->wc_head = head;
2066	wc->wc_tail = tail;
2067	wc->wc_checksum = 0;
2068	wc->wc_version = 1;
2069	getnanotime(&ts);
2070	wc->wc_time = ts.tv_sec;
2071	wc->wc_timensec = ts.tv_nsec;
2072
2073	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2074	    ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2075	    (intmax_t)head, (intmax_t)tail));
2076
2077	/*
2078	 * write the commit header.
2079	 *
2080	 * XXX if generation will rollover, then first zero
2081	 * over second commit header before trying to write both headers.
2082	 */
2083
2084	pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2085#ifdef _KERNEL
2086	pbn = btodb(pbn << wc->wc_log_dev_bshift);
2087#endif
2088	error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2089	if (error)
2090		return error;
2091	error = wapbl_buffered_flush(wl);
2092	if (error)
2093		return error;
2094
2095	/*
2096	 * flush disk cache to ensure that the commit header is actually
2097	 * written before meta data blocks.
2098	 */
2099	wapbl_cache_sync(wl, "2");
2100
2101	/*
2102	 * If the generation number was zero, write it out a second time.
2103	 * This handles initialization and generation number rollover
2104	 */
2105	if (wc->wc_generation++ == 0) {
2106		error = wapbl_write_commit(wl, head, tail);
2107		/*
2108		 * This panic should be able to be removed if we do the
2109		 * zero'ing mentioned above, and we are certain to roll
2110		 * back generation number on failure.
2111		 */
2112		if (error)
2113			panic("wapbl_write_commit: error writing duplicate "
2114			      "log header: %d\n", error);
2115	}
2116	return 0;
2117}
2118
2119/* Returns new offset value */
2120static int
2121wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2122{
2123	struct wapbl_wc_blocklist *wc =
2124	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2125	int blocklen = 1<<wl->wl_log_dev_bshift;
2126	int bph;
2127	struct buf *bp;
2128	off_t off = *offp;
2129	int error;
2130	size_t padding;
2131
2132	KASSERT(rw_write_held(&wl->wl_rwlock));
2133
2134	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2135	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2136
2137	bp = LIST_FIRST(&wl->wl_bufs);
2138
2139	while (bp) {
2140		int cnt;
2141		struct buf *obp = bp;
2142
2143		KASSERT(bp->b_flags & B_LOCKED);
2144
2145		wc->wc_type = WAPBL_WC_BLOCKS;
2146		wc->wc_len = blocklen;
2147		wc->wc_blkcount = 0;
2148		while (bp && (wc->wc_blkcount < bph)) {
2149			/*
2150			 * Make sure all the physical block numbers are up to
2151			 * date.  If this is not always true on a given
2152			 * filesystem, then VOP_BMAP must be called.  We
2153			 * could call VOP_BMAP here, or else in the filesystem
2154			 * specific flush callback, although neither of those
2155			 * solutions allow us to take the vnode lock.  If a
2156			 * filesystem requires that we must take the vnode lock
2157			 * to call VOP_BMAP, then we can probably do it in
2158			 * bwrite when the vnode lock should already be held
2159			 * by the invoking code.
2160			 */
2161			KASSERT((bp->b_vp->v_type == VBLK) ||
2162				 (bp->b_blkno != bp->b_lblkno));
2163			KASSERT(bp->b_blkno > 0);
2164
2165			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2166			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2167			wc->wc_len += bp->b_bcount;
2168			wc->wc_blkcount++;
2169			bp = LIST_NEXT(bp, b_wapbllist);
2170		}
2171		if (wc->wc_len % blocklen != 0) {
2172			padding = blocklen - wc->wc_len % blocklen;
2173			wc->wc_len += padding;
2174		} else {
2175			padding = 0;
2176		}
2177
2178		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2179		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2180		    wc->wc_len, padding, (intmax_t)off));
2181
2182		error = wapbl_circ_write(wl, wc, blocklen, &off);
2183		if (error)
2184			return error;
2185		bp = obp;
2186		cnt = 0;
2187		while (bp && (cnt++ < bph)) {
2188			error = wapbl_circ_write(wl, bp->b_data,
2189			    bp->b_bcount, &off);
2190			if (error)
2191				return error;
2192			bp = LIST_NEXT(bp, b_wapbllist);
2193		}
2194		if (padding) {
2195			void *zero;
2196
2197			zero = wapbl_alloc(padding);
2198			memset(zero, 0, padding);
2199			error = wapbl_circ_write(wl, zero, padding, &off);
2200			wapbl_free(zero, padding);
2201			if (error)
2202				return error;
2203		}
2204	}
2205	*offp = off;
2206	return 0;
2207}
2208
2209static int
2210wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2211{
2212	struct wapbl_wc_blocklist *wc =
2213	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2214	int i;
2215	int blocklen = 1<<wl->wl_log_dev_bshift;
2216	int bph;
2217	off_t off = *offp;
2218	int error;
2219
2220	if (wl->wl_dealloccnt == 0)
2221		return 0;
2222
2223	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2224	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2225
2226	i = 0;
2227	while (i < wl->wl_dealloccnt) {
2228		wc->wc_type = WAPBL_WC_REVOCATIONS;
2229		wc->wc_len = blocklen;
2230		wc->wc_blkcount = 0;
2231		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2232			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2233			    wl->wl_deallocblks[i];
2234			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2235			    wl->wl_dealloclens[i];
2236			wc->wc_blkcount++;
2237			i++;
2238		}
2239		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2240		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2241		    wc->wc_len, (intmax_t)off));
2242		error = wapbl_circ_write(wl, wc, blocklen, &off);
2243		if (error)
2244			return error;
2245	}
2246	*offp = off;
2247	return 0;
2248}
2249
2250static int
2251wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2252{
2253	struct wapbl_wc_inodelist *wc =
2254	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2255	int i;
2256	int blocklen = 1 << wl->wl_log_dev_bshift;
2257	off_t off = *offp;
2258	int error;
2259
2260	struct wapbl_ino_head *wih;
2261	struct wapbl_ino *wi;
2262	int iph;
2263
2264	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2265	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2266
2267	i = 0;
2268	wih = &wl->wl_inohash[0];
2269	wi = 0;
2270	do {
2271		wc->wc_type = WAPBL_WC_INODES;
2272		wc->wc_len = blocklen;
2273		wc->wc_inocnt = 0;
2274		wc->wc_clear = (i == 0);
2275		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2276			while (!wi) {
2277				KASSERT((wih - &wl->wl_inohash[0])
2278				    <= wl->wl_inohashmask);
2279				wi = LIST_FIRST(wih++);
2280			}
2281			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2282			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2283			wc->wc_inocnt++;
2284			i++;
2285			wi = LIST_NEXT(wi, wi_hash);
2286		}
2287		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2288		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2289		    wc->wc_len, (intmax_t)off));
2290		error = wapbl_circ_write(wl, wc, blocklen, &off);
2291		if (error)
2292			return error;
2293	} while (i < wl->wl_inohashcnt);
2294
2295	*offp = off;
2296	return 0;
2297}
2298
2299#endif /* _KERNEL */
2300
2301/****************************************************************/
2302
2303struct wapbl_blk {
2304	LIST_ENTRY(wapbl_blk) wb_hash;
2305	daddr_t wb_blk;
2306	off_t wb_off; /* Offset of this block in the log */
2307};
2308#define	WAPBL_BLKPOOL_MIN 83
2309
2310static void
2311wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2312{
2313	if (size < WAPBL_BLKPOOL_MIN)
2314		size = WAPBL_BLKPOOL_MIN;
2315	KASSERT(wr->wr_blkhash == 0);
2316#ifdef _KERNEL
2317	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2318#else /* ! _KERNEL */
2319	/* Manually implement hashinit */
2320	{
2321		unsigned long i, hashsize;
2322		for (hashsize = 1; hashsize < size; hashsize <<= 1)
2323			continue;
2324		wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2325		for (i = 0; i < hashsize; i++)
2326			LIST_INIT(&wr->wr_blkhash[i]);
2327		wr->wr_blkhashmask = hashsize - 1;
2328	}
2329#endif /* ! _KERNEL */
2330}
2331
2332static void
2333wapbl_blkhash_free(struct wapbl_replay *wr)
2334{
2335	KASSERT(wr->wr_blkhashcnt == 0);
2336#ifdef _KERNEL
2337	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2338#else /* ! _KERNEL */
2339	wapbl_free(wr->wr_blkhash,
2340	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2341#endif /* ! _KERNEL */
2342}
2343
2344static struct wapbl_blk *
2345wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2346{
2347	struct wapbl_blk_head *wbh;
2348	struct wapbl_blk *wb;
2349	wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2350	LIST_FOREACH(wb, wbh, wb_hash) {
2351		if (blk == wb->wb_blk)
2352			return wb;
2353	}
2354	return 0;
2355}
2356
2357static void
2358wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2359{
2360	struct wapbl_blk_head *wbh;
2361	struct wapbl_blk *wb;
2362	wb = wapbl_blkhash_get(wr, blk);
2363	if (wb) {
2364		KASSERT(wb->wb_blk == blk);
2365		wb->wb_off = off;
2366	} else {
2367		wb = wapbl_alloc(sizeof(*wb));
2368		wb->wb_blk = blk;
2369		wb->wb_off = off;
2370		wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2371		LIST_INSERT_HEAD(wbh, wb, wb_hash);
2372		wr->wr_blkhashcnt++;
2373	}
2374}
2375
2376static void
2377wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2378{
2379	struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2380	if (wb) {
2381		KASSERT(wr->wr_blkhashcnt > 0);
2382		wr->wr_blkhashcnt--;
2383		LIST_REMOVE(wb, wb_hash);
2384		wapbl_free(wb, sizeof(*wb));
2385	}
2386}
2387
2388static void
2389wapbl_blkhash_clear(struct wapbl_replay *wr)
2390{
2391	unsigned long i;
2392	for (i = 0; i <= wr->wr_blkhashmask; i++) {
2393		struct wapbl_blk *wb;
2394
2395		while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2396			KASSERT(wr->wr_blkhashcnt > 0);
2397			wr->wr_blkhashcnt--;
2398			LIST_REMOVE(wb, wb_hash);
2399			wapbl_free(wb, sizeof(*wb));
2400		}
2401	}
2402	KASSERT(wr->wr_blkhashcnt == 0);
2403}
2404
2405/****************************************************************/
2406
2407static int
2408wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2409{
2410	size_t slen;
2411	off_t off = *offp;
2412	int error;
2413	daddr_t pbn;
2414
2415	KASSERT(((len >> wr->wr_log_dev_bshift) <<
2416	    wr->wr_log_dev_bshift) == len);
2417
2418	if (off < wr->wr_circ_off)
2419		off = wr->wr_circ_off;
2420	slen = wr->wr_circ_off + wr->wr_circ_size - off;
2421	if (slen < len) {
2422		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2423#ifdef _KERNEL
2424		pbn = btodb(pbn << wr->wr_log_dev_bshift);
2425#endif
2426		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2427		if (error)
2428			return error;
2429		data = (uint8_t *)data + slen;
2430		len -= slen;
2431		off = wr->wr_circ_off;
2432	}
2433	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2434#ifdef _KERNEL
2435	pbn = btodb(pbn << wr->wr_log_dev_bshift);
2436#endif
2437	error = wapbl_read(data, len, wr->wr_devvp, pbn);
2438	if (error)
2439		return error;
2440	off += len;
2441	if (off >= wr->wr_circ_off + wr->wr_circ_size)
2442		off = wr->wr_circ_off;
2443	*offp = off;
2444	return 0;
2445}
2446
2447static void
2448wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2449{
2450	size_t slen;
2451	off_t off = *offp;
2452
2453	KASSERT(((len >> wr->wr_log_dev_bshift) <<
2454	    wr->wr_log_dev_bshift) == len);
2455
2456	if (off < wr->wr_circ_off)
2457		off = wr->wr_circ_off;
2458	slen = wr->wr_circ_off + wr->wr_circ_size - off;
2459	if (slen < len) {
2460		len -= slen;
2461		off = wr->wr_circ_off;
2462	}
2463	off += len;
2464	if (off >= wr->wr_circ_off + wr->wr_circ_size)
2465		off = wr->wr_circ_off;
2466	*offp = off;
2467}
2468
2469/****************************************************************/
2470
2471int
2472wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2473	daddr_t off, size_t count, size_t blksize)
2474{
2475	struct wapbl_replay *wr;
2476	int error;
2477	struct vnode *devvp;
2478	daddr_t logpbn;
2479	uint8_t *scratch;
2480	struct wapbl_wc_header *wch;
2481	struct wapbl_wc_header *wch2;
2482	/* Use this until we read the actual log header */
2483	int log_dev_bshift = ilog2(blksize);
2484	size_t used;
2485	daddr_t pbn;
2486
2487	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2488	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2489	    vp, off, count, blksize));
2490
2491	if (off < 0)
2492		return EINVAL;
2493
2494	if (blksize < DEV_BSIZE)
2495		return EINVAL;
2496	if (blksize % DEV_BSIZE)
2497		return EINVAL;
2498
2499#ifdef _KERNEL
2500#if 0
2501	/* XXX vp->v_size isn't reliably set for VBLK devices,
2502	 * especially root.  However, we might still want to verify
2503	 * that the full load is readable */
2504	if ((off + count) * blksize > vp->v_size)
2505		return EINVAL;
2506#endif
2507	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2508		return error;
2509	}
2510#else /* ! _KERNEL */
2511	devvp = vp;
2512	logpbn = off;
2513#endif /* ! _KERNEL */
2514
2515	scratch = wapbl_alloc(MAXBSIZE);
2516
2517	pbn = logpbn;
2518#ifdef _KERNEL
2519	pbn = btodb(pbn << log_dev_bshift);
2520#endif
2521	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2522	if (error)
2523		goto errout;
2524
2525	wch = (struct wapbl_wc_header *)scratch;
2526	wch2 =
2527	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2528	/* XXX verify checksums and magic numbers */
2529	if (wch->wc_type != WAPBL_WC_HEADER) {
2530		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2531		error = EFTYPE;
2532		goto errout;
2533	}
2534
2535	if (wch2->wc_generation > wch->wc_generation)
2536		wch = wch2;
2537
2538	wr = wapbl_calloc(1, sizeof(*wr));
2539
2540	wr->wr_logvp = vp;
2541	wr->wr_devvp = devvp;
2542	wr->wr_logpbn = logpbn;
2543
2544	wr->wr_scratch = scratch;
2545
2546	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2547	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2548	wr->wr_circ_off = wch->wc_circ_off;
2549	wr->wr_circ_size = wch->wc_circ_size;
2550	wr->wr_generation = wch->wc_generation;
2551
2552	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2553
2554	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2555	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2556	    " len=%"PRId64" used=%zu\n",
2557	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2558	    wch->wc_circ_size, used));
2559
2560	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2561
2562	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2563	if (error) {
2564		wapbl_replay_stop(wr);
2565		wapbl_replay_free(wr);
2566		return error;
2567	}
2568
2569	*wrp = wr;
2570	return 0;
2571
2572 errout:
2573	wapbl_free(scratch, MAXBSIZE);
2574	return error;
2575}
2576
2577void
2578wapbl_replay_stop(struct wapbl_replay *wr)
2579{
2580
2581	if (!wapbl_replay_isopen(wr))
2582		return;
2583
2584	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2585
2586	wapbl_free(wr->wr_scratch, MAXBSIZE);
2587	wr->wr_scratch = NULL;
2588
2589	wr->wr_logvp = NULL;
2590
2591	wapbl_blkhash_clear(wr);
2592	wapbl_blkhash_free(wr);
2593}
2594
2595void
2596wapbl_replay_free(struct wapbl_replay *wr)
2597{
2598
2599	KDASSERT(!wapbl_replay_isopen(wr));
2600
2601	if (wr->wr_inodes)
2602		wapbl_free(wr->wr_inodes,
2603		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2604	wapbl_free(wr, sizeof(*wr));
2605}
2606
2607#ifdef _KERNEL
2608int
2609wapbl_replay_isopen1(struct wapbl_replay *wr)
2610{
2611
2612	return wapbl_replay_isopen(wr);
2613}
2614#endif
2615
2616static void
2617wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2618{
2619	struct wapbl_wc_blocklist *wc =
2620	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
2621	int fsblklen = 1 << wr->wr_fs_dev_bshift;
2622	int i, j, n;
2623
2624	for (i = 0; i < wc->wc_blkcount; i++) {
2625		/*
2626		 * Enter each physical block into the hashtable independently.
2627		 */
2628		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2629		for (j = 0; j < n; j++) {
2630			wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2631			    *offp);
2632			wapbl_circ_advance(wr, fsblklen, offp);
2633		}
2634	}
2635}
2636
2637static void
2638wapbl_replay_process_revocations(struct wapbl_replay *wr)
2639{
2640	struct wapbl_wc_blocklist *wc =
2641	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
2642	int fsblklen = 1 << wr->wr_fs_dev_bshift;
2643	int i, j, n;
2644
2645	for (i = 0; i < wc->wc_blkcount; i++) {
2646		/*
2647		 * Remove any blocks found from the hashtable.
2648		 */
2649		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2650		for (j = 0; j < n; j++)
2651			wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2652	}
2653}
2654
2655static void
2656wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2657{
2658	struct wapbl_wc_inodelist *wc =
2659	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
2660	void *new_inodes;
2661	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2662
2663	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2664
2665	/*
2666	 * Keep track of where we found this so location won't be
2667	 * overwritten.
2668	 */
2669	if (wc->wc_clear) {
2670		wr->wr_inodestail = oldoff;
2671		wr->wr_inodescnt = 0;
2672		if (wr->wr_inodes != NULL) {
2673			wapbl_free(wr->wr_inodes, oldsize);
2674			wr->wr_inodes = NULL;
2675		}
2676	}
2677	wr->wr_inodeshead = newoff;
2678	if (wc->wc_inocnt == 0)
2679		return;
2680
2681	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2682	    sizeof(wr->wr_inodes[0]));
2683	if (wr->wr_inodes != NULL) {
2684		memcpy(new_inodes, wr->wr_inodes, oldsize);
2685		wapbl_free(wr->wr_inodes, oldsize);
2686	}
2687	wr->wr_inodes = new_inodes;
2688	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2689	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2690	wr->wr_inodescnt += wc->wc_inocnt;
2691}
2692
2693static int
2694wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2695{
2696	off_t off;
2697	int error;
2698
2699	int logblklen = 1 << wr->wr_log_dev_bshift;
2700
2701	wapbl_blkhash_clear(wr);
2702
2703	off = tail;
2704	while (off != head) {
2705		struct wapbl_wc_null *wcn;
2706		off_t saveoff = off;
2707		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2708		if (error)
2709			goto errout;
2710		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2711		switch (wcn->wc_type) {
2712		case WAPBL_WC_BLOCKS:
2713			wapbl_replay_process_blocks(wr, &off);
2714			break;
2715
2716		case WAPBL_WC_REVOCATIONS:
2717			wapbl_replay_process_revocations(wr);
2718			break;
2719
2720		case WAPBL_WC_INODES:
2721			wapbl_replay_process_inodes(wr, saveoff, off);
2722			break;
2723
2724		default:
2725			printf("Unrecognized wapbl type: 0x%08x\n",
2726			       wcn->wc_type);
2727 			error = EFTYPE;
2728			goto errout;
2729		}
2730		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2731		if (off != saveoff) {
2732			printf("wapbl_replay: corrupted records\n");
2733			error = EFTYPE;
2734			goto errout;
2735		}
2736	}
2737	return 0;
2738
2739 errout:
2740	wapbl_blkhash_clear(wr);
2741	return error;
2742}
2743
2744#if 0
2745int
2746wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2747{
2748	off_t off;
2749	int mismatchcnt = 0;
2750	int logblklen = 1 << wr->wr_log_dev_bshift;
2751	int fsblklen = 1 << wr->wr_fs_dev_bshift;
2752	void *scratch1 = wapbl_alloc(MAXBSIZE);
2753	void *scratch2 = wapbl_alloc(MAXBSIZE);
2754	int error = 0;
2755
2756	KDASSERT(wapbl_replay_isopen(wr));
2757
2758	off = wch->wc_tail;
2759	while (off != wch->wc_head) {
2760		struct wapbl_wc_null *wcn;
2761#ifdef DEBUG
2762		off_t saveoff = off;
2763#endif
2764		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2765		if (error)
2766			goto out;
2767		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2768		switch (wcn->wc_type) {
2769		case WAPBL_WC_BLOCKS:
2770			{
2771				struct wapbl_wc_blocklist *wc =
2772				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
2773				int i;
2774				for (i = 0; i < wc->wc_blkcount; i++) {
2775					int foundcnt = 0;
2776					int dirtycnt = 0;
2777					int j, n;
2778					/*
2779					 * Check each physical block into the
2780					 * hashtable independently
2781					 */
2782					n = wc->wc_blocks[i].wc_dlen >>
2783					    wch->wc_fs_dev_bshift;
2784					for (j = 0; j < n; j++) {
2785						struct wapbl_blk *wb =
2786						   wapbl_blkhash_get(wr,
2787						   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2788						if (wb && (wb->wb_off == off)) {
2789							foundcnt++;
2790							error =
2791							    wapbl_circ_read(wr,
2792							    scratch1, fsblklen,
2793							    &off);
2794							if (error)
2795								goto out;
2796							error =
2797							    wapbl_read(scratch2,
2798							    fsblklen, fsdevvp,
2799							    wb->wb_blk);
2800							if (error)
2801								goto out;
2802							if (memcmp(scratch1,
2803								   scratch2,
2804								   fsblklen)) {
2805								printf(
2806		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2807		wb->wb_blk, (intmax_t)off);
2808								dirtycnt++;
2809								mismatchcnt++;
2810							}
2811						} else {
2812							wapbl_circ_advance(wr,
2813							    fsblklen, &off);
2814						}
2815					}
2816#if 0
2817					/*
2818					 * If all of the blocks in an entry
2819					 * are clean, then remove all of its
2820					 * blocks from the hashtable since they
2821					 * never will need replay.
2822					 */
2823					if ((foundcnt != 0) &&
2824					    (dirtycnt == 0)) {
2825						off = saveoff;
2826						wapbl_circ_advance(wr,
2827						    logblklen, &off);
2828						for (j = 0; j < n; j++) {
2829							struct wapbl_blk *wb =
2830							   wapbl_blkhash_get(wr,
2831							   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2832							if (wb &&
2833							  (wb->wb_off == off)) {
2834								wapbl_blkhash_rem(wr, wb->wb_blk);
2835							}
2836							wapbl_circ_advance(wr,
2837							    fsblklen, &off);
2838						}
2839					}
2840#endif
2841				}
2842			}
2843			break;
2844		case WAPBL_WC_REVOCATIONS:
2845		case WAPBL_WC_INODES:
2846			break;
2847		default:
2848			KASSERT(0);
2849		}
2850#ifdef DEBUG
2851		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2852		KASSERT(off == saveoff);
2853#endif
2854	}
2855 out:
2856	wapbl_free(scratch1, MAXBSIZE);
2857	wapbl_free(scratch2, MAXBSIZE);
2858	if (!error && mismatchcnt)
2859		error = EFTYPE;
2860	return error;
2861}
2862#endif
2863
2864int
2865wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2866{
2867	struct wapbl_blk *wb;
2868	size_t i;
2869	off_t off;
2870	void *scratch;
2871	int error = 0;
2872	int fsblklen = 1 << wr->wr_fs_dev_bshift;
2873
2874	KDASSERT(wapbl_replay_isopen(wr));
2875
2876	scratch = wapbl_alloc(MAXBSIZE);
2877
2878	for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2879		LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2880			off = wb->wb_off;
2881			error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2882			if (error)
2883				break;
2884			error = wapbl_write(scratch, fsblklen, fsdevvp,
2885			    wb->wb_blk);
2886			if (error)
2887				break;
2888		}
2889	}
2890
2891	wapbl_free(scratch, MAXBSIZE);
2892	return error;
2893}
2894
2895int
2896wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2897{
2898	int fsblklen = 1 << wr->wr_fs_dev_bshift;
2899
2900	KDASSERT(wapbl_replay_isopen(wr));
2901	KASSERT((len % fsblklen) == 0);
2902
2903	while (len != 0) {
2904		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2905		if (wb)
2906			return 1;
2907		len -= fsblklen;
2908	}
2909	return 0;
2910}
2911
2912int
2913wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2914{
2915	int fsblklen = 1 << wr->wr_fs_dev_bshift;
2916
2917	KDASSERT(wapbl_replay_isopen(wr));
2918
2919	KASSERT((len % fsblklen) == 0);
2920
2921	while (len != 0) {
2922		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2923		if (wb) {
2924			off_t off = wb->wb_off;
2925			int error;
2926			error = wapbl_circ_read(wr, data, fsblklen, &off);
2927			if (error)
2928				return error;
2929		}
2930		data = (uint8_t *)data + fsblklen;
2931		len -= fsblklen;
2932		blk++;
2933	}
2934	return 0;
2935}
2936
2937#ifdef _KERNEL
2938/*
2939 * This is not really a module now, but maybe on it's way to
2940 * being one some day.
2941 */
2942MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2943
2944static int
2945wapbl_modcmd(modcmd_t cmd, void *arg)
2946{
2947
2948	switch (cmd) {
2949	case MODULE_CMD_INIT:
2950		wapbl_init();
2951		return 0;
2952	case MODULE_CMD_FINI:
2953#ifdef notyet
2954		return wapbl_fini(true);
2955#endif
2956		return EOPNOTSUPP;
2957	default:
2958		return ENOTTY;
2959	}
2960}
2961#endif /* _KERNEL */
2962