1/* $NetBSD: vfs_wapbl.c,v 1.51.2.1 2012/05/07 03:01:13 riz Exp $ */ 2 3/*- 4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32/* 33 * This implements file system independent write ahead filesystem logging. 
34 */ 35 36#define WAPBL_INTERNAL 37 38#include <sys/cdefs.h> 39__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.51.2.1 2012/05/07 03:01:13 riz Exp $"); 40 41#include <sys/param.h> 42#include <sys/bitops.h> 43 44#ifdef _KERNEL 45#include <sys/param.h> 46#include <sys/namei.h> 47#include <sys/proc.h> 48#include <sys/sysctl.h> 49#include <sys/uio.h> 50#include <sys/vnode.h> 51#include <sys/file.h> 52#include <sys/module.h> 53#include <sys/resourcevar.h> 54#include <sys/conf.h> 55#include <sys/mount.h> 56#include <sys/kernel.h> 57#include <sys/kauth.h> 58#include <sys/mutex.h> 59#include <sys/atomic.h> 60#include <sys/wapbl.h> 61#include <sys/wapbl_replay.h> 62 63#include <miscfs/specfs/specdev.h> 64 65#define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP) 66#define wapbl_free(a, s) kmem_free((a), (s)) 67#define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) 68 69static struct sysctllog *wapbl_sysctl; 70static int wapbl_flush_disk_cache = 1; 71static int wapbl_verbose_commit = 0; 72 73#else /* !_KERNEL */ 74#include <assert.h> 75#include <errno.h> 76#include <stdio.h> 77#include <stdbool.h> 78#include <stdlib.h> 79#include <string.h> 80 81#include <sys/time.h> 82#include <sys/wapbl.h> 83#include <sys/wapbl_replay.h> 84 85#define KDASSERT(x) assert(x) 86#define KASSERT(x) assert(x) 87#define wapbl_alloc(s) malloc(s) 88#define wapbl_free(a, s) free(a) 89#define wapbl_calloc(n, s) calloc((n), (s)) 90 91#endif /* !_KERNEL */ 92 93/* 94 * INTERNAL DATA STRUCTURES 95 */ 96 97/* 98 * This structure holds per-mount log information. 
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size; 	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * see assertions in wapbl_advance() for other boundary conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_header_size.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: por que?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 * (NOTE(review): presumably refers to wl_mtx, and "wl_count"
	 * appears to mean wl_bufcount -- confirm against users.)
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
152 */ 153#if _KERNEL 154 wapbl_flush_fn_t wl_flush; /* r */ 155 wapbl_flush_fn_t wl_flush_abort;/* r */ 156#endif 157 158 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ 159 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ 160 size_t wl_bcount; /* m: Total bcount of wl_bufs */ 161 162 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ 163 164 kcondvar_t wl_reclaimable_cv; /* m (obviously) */ 165 size_t wl_reclaimable_bytes; /* m: Amount of space available for 166 reclamation by truncate */ 167 int wl_error_count; /* m: # of wl_entries with errors */ 168 size_t wl_reserved_bytes; /* never truncate log smaller than this */ 169 170#ifdef WAPBL_DEBUG_BUFBYTES 171 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ 172#endif 173 174 daddr_t *wl_deallocblks;/* lm: address of block */ 175 int *wl_dealloclens; /* lm: size of block */ 176 int wl_dealloccnt; /* lm: total count */ 177 int wl_dealloclim; /* l: max count */ 178 179 /* hashtable of inode numbers for allocated but unlinked inodes */ 180 /* synch ??? 
*/ 181 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash; 182 u_long wl_inohashmask; 183 int wl_inohashcnt; 184 185 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction 186 accounting */ 187 188 u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */ 189 daddr_t wl_buffer_dblk; /* l: buffer disk block address */ 190 size_t wl_buffer_used; /* l: buffer current use */ 191}; 192 193#ifdef WAPBL_DEBUG_PRINT 194int wapbl_debug_print = WAPBL_DEBUG_PRINT; 195#endif 196 197/****************************************************************/ 198#ifdef _KERNEL 199 200#ifdef WAPBL_DEBUG 201struct wapbl *wapbl_debug_wl; 202#endif 203 204static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); 205static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); 206static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); 207static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); 208#endif /* _KERNEL */ 209 210static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); 211 212static inline size_t wapbl_space_free(size_t avail, off_t head, 213 off_t tail); 214static inline size_t wapbl_space_used(size_t avail, off_t head, 215 off_t tail); 216 217#ifdef _KERNEL 218 219static struct pool wapbl_entry_pool; 220 221#define WAPBL_INODETRK_SIZE 83 222static int wapbl_ino_pool_refcount; 223static struct pool wapbl_ino_pool; 224struct wapbl_ino { 225 LIST_ENTRY(wapbl_ino) wi_hash; 226 ino_t wi_ino; 227 mode_t wi_mode; 228}; 229 230static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); 231static void wapbl_inodetrk_free(struct wapbl *wl); 232static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); 233 234static size_t wapbl_transaction_len(struct wapbl *wl); 235static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); 236 237#if 0 238int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); 239#endif 240 241static int wapbl_replay_isopen1(struct wapbl_replay *); 242 243/* 244 * This is useful for 
debugging. If set, the log will 245 * only be truncated when necessary. 246 */ 247int wapbl_lazy_truncate = 0; 248 249struct wapbl_ops wapbl_ops = { 250 .wo_wapbl_discard = wapbl_discard, 251 .wo_wapbl_replay_isopen = wapbl_replay_isopen1, 252 .wo_wapbl_replay_can_read = wapbl_replay_can_read, 253 .wo_wapbl_replay_read = wapbl_replay_read, 254 .wo_wapbl_add_buf = wapbl_add_buf, 255 .wo_wapbl_remove_buf = wapbl_remove_buf, 256 .wo_wapbl_resize_buf = wapbl_resize_buf, 257 .wo_wapbl_begin = wapbl_begin, 258 .wo_wapbl_end = wapbl_end, 259 .wo_wapbl_junlock_assert= wapbl_junlock_assert, 260 261 /* XXX: the following is only used to say "this is a wapbl buf" */ 262 .wo_wapbl_biodone = wapbl_biodone, 263}; 264 265static int 266wapbl_sysctl_init(void) 267{ 268 int rv; 269 const struct sysctlnode *rnode, *cnode; 270 271 wapbl_sysctl = NULL; 272 273 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode, 274 CTLFLAG_PERMANENT, 275 CTLTYPE_NODE, "vfs", NULL, 276 NULL, 0, NULL, 0, 277 CTL_VFS, CTL_EOL); 278 if (rv) 279 return rv; 280 281 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode, 282 CTLFLAG_PERMANENT, 283 CTLTYPE_NODE, "wapbl", 284 SYSCTL_DESCR("WAPBL journaling options"), 285 NULL, 0, NULL, 0, 286 CTL_CREATE, CTL_EOL); 287 if (rv) 288 return rv; 289 290 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 291 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 292 CTLTYPE_INT, "flush_disk_cache", 293 SYSCTL_DESCR("flush disk cache"), 294 NULL, 0, &wapbl_flush_disk_cache, 0, 295 CTL_CREATE, CTL_EOL); 296 if (rv) 297 return rv; 298 299 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 300 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 301 CTLTYPE_INT, "verbose_commit", 302 SYSCTL_DESCR("show time and size of wapbl log commits"), 303 NULL, 0, &wapbl_verbose_commit, 0, 304 CTL_CREATE, CTL_EOL); 305 return rv; 306} 307 308static void 309wapbl_init(void) 310{ 311 312 pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0, 313 "wapblentrypl", &pool_allocator_kmem, IPL_VM); 314 315 
wapbl_sysctl_init(); 316} 317 318#ifdef notyet 319static int 320wapbl_fini(bool interface) 321{ 322 323 if (aio_sysctl != NULL) 324 sysctl_teardown(&aio_sysctl); 325 326 pool_destroy(&wapbl_entry_pool); 327 328 return 0; 329} 330#endif 331 332static int 333wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) 334{ 335 int error, i; 336 337 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 338 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt)); 339 340 /* 341 * Its only valid to reuse the replay log if its 342 * the same as the new log we just opened. 343 */ 344 KDASSERT(!wapbl_replay_isopen(wr)); 345 KASSERT(wl->wl_devvp->v_type == VBLK); 346 KASSERT(wr->wr_devvp->v_type == VBLK); 347 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); 348 KASSERT(wl->wl_logpbn == wr->wr_logpbn); 349 KASSERT(wl->wl_circ_size == wr->wr_circ_size); 350 KASSERT(wl->wl_circ_off == wr->wr_circ_off); 351 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); 352 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); 353 354 wl->wl_wc_header->wc_generation = wr->wr_generation + 1; 355 356 for (i = 0; i < wr->wr_inodescnt; i++) 357 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, 358 wr->wr_inodes[i].wr_imode); 359 360 /* Make sure new transaction won't overwrite old inodes list */ 361 KDASSERT(wapbl_transaction_len(wl) <= 362 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, 363 wr->wr_inodestail)); 364 365 wl->wl_head = wl->wl_tail = wr->wr_inodeshead; 366 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = 367 wapbl_transaction_len(wl); 368 369 error = wapbl_write_inodes(wl, &wl->wl_head); 370 if (error) 371 return error; 372 373 KASSERT(wl->wl_head != wl->wl_tail); 374 KASSERT(wl->wl_head != 0); 375 376 return 0; 377} 378 379int 380wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, 381 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, 382 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) 383{ 384 struct wapbl *wl; 385 
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	/* Log and filesystem block shifts are currently set equal above,
	 * so this branch is unreachable today; kept for a future split. */
	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	/* Translate the log file's start to a physical block on the
	 * underlying device; devvp is the device vnode the log lives on. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem.
	 */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	/* Coalescing buffer for wapbl_buffered_write(); one MAXPHYS chunk */
	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	/* All allocations above succeeded unconditionally, so free all */
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 597 ("\tentry: bufcount = %zu, reclaimable = %zu, " 598 "error = %d, unsynced = %zu\n", 599 we->we_bufcount, we->we_reclaimable_bytes, 600 we->we_error, we->we_unsynced_bufbytes)); 601 } 602#else /* !WAPBL_DEBUG_BUFBYTES */ 603 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 604 ("wapbl_discard: thread %d.%d discarding transaction\n" 605 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 606 "deallocs=%d inodes=%d\n" 607 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 608 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 609 wl->wl_bcount, wl->wl_dealloccnt, 610 wl->wl_inohashcnt, wl->wl_error_count, 611 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 612 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 613 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 614 ("\tentry: bufcount = %zu, reclaimable = %zu, " 615 "error = %d\n", 616 we->we_bufcount, we->we_reclaimable_bytes, 617 we->we_error)); 618 } 619#endif /* !WAPBL_DEBUG_BUFBYTES */ 620 } 621#endif /* WAPBL_DEBUG_PRINT */ 622 623 for (i = 0; i <= wl->wl_inohashmask; i++) { 624 struct wapbl_ino_head *wih; 625 struct wapbl_ino *wi; 626 627 wih = &wl->wl_inohash[i]; 628 while ((wi = LIST_FIRST(wih)) != NULL) { 629 LIST_REMOVE(wi, wi_hash); 630 pool_put(&wapbl_ino_pool, wi); 631 KASSERT(wl->wl_inohashcnt > 0); 632 wl->wl_inohashcnt--; 633 } 634 } 635 636 /* 637 * clean buffer list 638 */ 639 mutex_enter(&bufcache_lock); 640 mutex_enter(&wl->wl_mtx); 641 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 642 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 643 /* 644 * The buffer will be unlocked and 645 * removed from the transaction in brelse 646 */ 647 mutex_exit(&wl->wl_mtx); 648 brelsel(bp, 0); 649 mutex_enter(&wl->wl_mtx); 650 } 651 } 652 mutex_exit(&wl->wl_mtx); 653 mutex_exit(&bufcache_lock); 654 655 /* 656 * Remove references to this wl from wl_entries, free any which 657 * no longer have buffers, others will be freed in wapbl_biodone 658 * when they no longer have any buffers. 
659 */ 660 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 661 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 662 /* XXX should we be accumulating wl_error_count 663 * and increasing reclaimable bytes ? */ 664 we->we_wapbl = NULL; 665 if (we->we_bufcount == 0) { 666#ifdef WAPBL_DEBUG_BUFBYTES 667 KASSERT(we->we_unsynced_bufbytes == 0); 668#endif 669 pool_put(&wapbl_entry_pool, we); 670 } 671 } 672 673 /* Discard list of deallocs */ 674 wl->wl_dealloccnt = 0; 675 /* XXX should we clear wl_reserved_bytes? */ 676 677 KASSERT(wl->wl_bufbytes == 0); 678 KASSERT(wl->wl_bcount == 0); 679 KASSERT(wl->wl_bufcount == 0); 680 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 681 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 682 KASSERT(wl->wl_inohashcnt == 0); 683 684 rw_exit(&wl->wl_rwlock); 685} 686 687int 688wapbl_stop(struct wapbl *wl, int force) 689{ 690 struct vnode *vp; 691 int error; 692 693 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 694 error = wapbl_flush(wl, 1); 695 if (error) { 696 if (force) 697 wapbl_discard(wl); 698 else 699 return error; 700 } 701 702 /* Unlinked inodes persist after a flush */ 703 if (wl->wl_inohashcnt) { 704 if (force) { 705 wapbl_discard(wl); 706 } else { 707 return EBUSY; 708 } 709 } 710 711 KASSERT(wl->wl_bufbytes == 0); 712 KASSERT(wl->wl_bcount == 0); 713 KASSERT(wl->wl_bufcount == 0); 714 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 715 KASSERT(wl->wl_dealloccnt == 0); 716 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 717 KASSERT(wl->wl_inohashcnt == 0); 718 719 vp = wl->wl_logvp; 720 721 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 722 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 723 wapbl_free(wl->wl_deallocblks, 724 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 725 wapbl_free(wl->wl_dealloclens, 726 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 727 wapbl_free(wl->wl_buffer, MAXPHYS); 728 wapbl_inodetrk_free(wl); 729 730 cv_destroy(&wl->wl_reclaimable_cv); 731 mutex_destroy(&wl->wl_mtx); 732 
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

/*
 * wapbl_doio: perform one synchronous I/O of "len" bytes at physical
 * block "pbn" on the journal device "devvp".  "flags" must be exactly
 * B_READ or B_WRITE (asserted below).  The transfer is charged to the
 * current lwp's rusage block counters.  Returns 0 or the error from
 * biowait().
 */
static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	/* Account the I/O and, for writes, track it on the device vnode */
	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	/* Synchronous: wait for completion before releasing the iobuf */
	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

/*
 * wapbl_write: synchronous write to the journal device; thin wrapper
 * around wapbl_doio().
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

/*
 * wapbl_read: synchronous read from the journal device; thin wrapper
 * around wapbl_doio().
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Flush buffered data if any.
 */
static int
wapbl_buffered_flush(struct wapbl *wl)
{
	int error;

	/* Nothing pending: disk block address is only valid when used > 0 */
	if (wl->wl_buffer_used == 0)
		return 0;

	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
	wl->wl_buffer_used = 0;

	return error;
}

/*
 * Write data to the log.
 * Try to coalesce writes and emit MAXPHYS aligned blocks.
 * Data may remain in wl->wl_buffer afterwards; callers rely on
 * wapbl_buffered_flush() to push out any tail.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	int error;
	size_t resid;

	/*
	 * If not adjacent to buffered data flush first.  Disk block
	 * address is always valid for non-empty buffer.
	 */
	if (wl->wl_buffer_used > 0 &&
	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
		error = wapbl_buffered_flush(wl);
		if (error)
			return error;
	}
	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (wl->wl_buffer_used == 0)
		wl->wl_buffer_dblk = pbn;
	/*
	 * Remaining space so this buffer ends on a MAXPHYS boundary.
	 *
	 * Cannot become less or equal zero as the buffer would have been
	 * flushed on the last call then.
	 */
	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
	    wl->wl_buffer_used;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);
	/* Fill to the MAXPHYS boundary and write that chunk out */
	if (len >= resid) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
		wl->wl_buffer_used += resid;
		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
		data = (uint8_t *)data + resid;
		len -= resid;
		wl->wl_buffer_dblk = pbn + btodb(resid);
		wl->wl_buffer_used = 0;
		if (error)
			return error;
	}
	KASSERT(len < MAXPHYS);
	/* Buffer the remainder for a later adjacent write or flush */
	if (len > 0) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
		wl->wl_buffer_used += len;
	}

	return 0;
}

/*
 * Write "len" bytes into the circular log area starting at byte offset
 * *offp, wrapping around at the end of the circular buffer (splitting
 * the write in two when it crosses the end).  On success *offp is
 * updated to the byte offset for the next write.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	/* Length must be a whole number of log device blocks */
	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	/* Write the tail piece up to the end of the circular area first */
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const
char *file, int line) 925{ 926 int doflush; 927 unsigned lockcount; 928 929 KDASSERT(wl); 930 931 /* 932 * XXX this needs to be made much more sophisticated. 933 * perhaps each wapbl_begin could reserve a specified 934 * number of buffers and bytes. 935 */ 936 mutex_enter(&wl->wl_mtx); 937 lockcount = wl->wl_lock_count; 938 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 939 wl->wl_bufbytes_max / 2) || 940 ((wl->wl_bufcount + (lockcount * 10)) > 941 wl->wl_bufcount_max / 2) || 942 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || 943 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2)); 944 mutex_exit(&wl->wl_mtx); 945 946 if (doflush) { 947 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 948 ("force flush lockcnt=%d bufbytes=%zu " 949 "(max=%zu) bufcount=%zu (max=%zu) " 950 "dealloccnt %d (lim=%d)\n", 951 lockcount, wl->wl_bufbytes, 952 wl->wl_bufbytes_max, wl->wl_bufcount, 953 wl->wl_bufcount_max, 954 wl->wl_dealloccnt, wl->wl_dealloclim)); 955 } 956 957 if (doflush) { 958 int error = wapbl_flush(wl, 0); 959 if (error) 960 return error; 961 } 962 963 rw_enter(&wl->wl_rwlock, RW_READER); 964 mutex_enter(&wl->wl_mtx); 965 wl->wl_lock_count++; 966 mutex_exit(&wl->wl_mtx); 967 968#if defined(WAPBL_DEBUG_PRINT) 969 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 970 ("wapbl_begin thread %d.%d with bufcount=%zu " 971 "bufbytes=%zu bcount=%zu at %s:%d\n", 972 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 973 wl->wl_bufbytes, wl->wl_bcount, file, line)); 974#endif 975 976 return 0; 977} 978 979void 980wapbl_end(struct wapbl *wl) 981{ 982 983#if defined(WAPBL_DEBUG_PRINT) 984 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 985 ("wapbl_end thread %d.%d with bufcount=%zu " 986 "bufbytes=%zu bcount=%zu\n", 987 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 988 wl->wl_bufbytes, wl->wl_bcount)); 989#endif 990 991#ifdef DIAGNOSTIC 992 size_t flushsize = wapbl_transaction_len(wl); 993 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 994 /* 995 * XXX this could be handled more gracefully, 
perhaps place 996 * only a partial transaction in the log and allow the 997 * remaining to flush without the protection of the journal. 998 */ 999 panic("wapbl_end: current transaction too big to flush\n"); 1000 } 1001#endif 1002 1003 mutex_enter(&wl->wl_mtx); 1004 KASSERT(wl->wl_lock_count > 0); 1005 wl->wl_lock_count--; 1006 mutex_exit(&wl->wl_mtx); 1007 1008 rw_exit(&wl->wl_rwlock); 1009} 1010 1011void 1012wapbl_add_buf(struct wapbl *wl, struct buf * bp) 1013{ 1014 1015 KASSERT(bp->b_cflags & BC_BUSY); 1016 KASSERT(bp->b_vp); 1017 1018 wapbl_jlock_assert(wl); 1019 1020#if 0 1021 /* 1022 * XXX this might be an issue for swapfiles. 1023 * see uvm_swap.c:1702 1024 * 1025 * XXX2 why require it then? leap of semantics? 1026 */ 1027 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 1028#endif 1029 1030 mutex_enter(&wl->wl_mtx); 1031 if (bp->b_flags & B_LOCKED) { 1032 LIST_REMOVE(bp, b_wapbllist); 1033 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, 1034 ("wapbl_add_buf thread %d.%d re-adding buf %p " 1035 "with %d bytes %d bcount\n", 1036 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 1037 bp->b_bcount)); 1038 } else { 1039 /* unlocked by dirty buffers shouldn't exist */ 1040 KASSERT(!(bp->b_oflags & BO_DELWRI)); 1041 wl->wl_bufbytes += bp->b_bufsize; 1042 wl->wl_bcount += bp->b_bcount; 1043 wl->wl_bufcount++; 1044 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 1045 ("wapbl_add_buf thread %d.%d adding buf %p " 1046 "with %d bytes %d bcount\n", 1047 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 1048 bp->b_bcount)); 1049 } 1050 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); 1051 mutex_exit(&wl->wl_mtx); 1052 1053 bp->b_flags |= B_LOCKED; 1054} 1055 1056static void 1057wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) 1058{ 1059 1060 KASSERT(mutex_owned(&wl->wl_mtx)); 1061 KASSERT(bp->b_cflags & BC_BUSY); 1062 wapbl_jlock_assert(wl); 1063 1064#if 0 1065 /* 1066 * XXX this might be an issue for swapfiles. 
1067 * see uvm_swap.c:1725 1068 * 1069 * XXXdeux: see above 1070 */ 1071 KASSERT((bp->b_flags & BC_NOCACHE) == 0); 1072#endif 1073 KASSERT(bp->b_flags & B_LOCKED); 1074 1075 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 1076 ("wapbl_remove_buf thread %d.%d removing buf %p with " 1077 "%d bytes %d bcount\n", 1078 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 1079 1080 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 1081 wl->wl_bufbytes -= bp->b_bufsize; 1082 KASSERT(wl->wl_bcount >= bp->b_bcount); 1083 wl->wl_bcount -= bp->b_bcount; 1084 KASSERT(wl->wl_bufcount > 0); 1085 wl->wl_bufcount--; 1086 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1087 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1088 LIST_REMOVE(bp, b_wapbllist); 1089 1090 bp->b_flags &= ~B_LOCKED; 1091} 1092 1093/* called from brelsel() in vfs_bio among other places */ 1094void 1095wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 1096{ 1097 1098 mutex_enter(&wl->wl_mtx); 1099 wapbl_remove_buf_locked(wl, bp); 1100 mutex_exit(&wl->wl_mtx); 1101} 1102 1103void 1104wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 1105{ 1106 1107 KASSERT(bp->b_cflags & BC_BUSY); 1108 1109 /* 1110 * XXX: why does this depend on B_LOCKED? otherwise the buf 1111 * is not for a transaction? if so, why is this called in the 1112 * first place? 1113 */ 1114 if (bp->b_flags & B_LOCKED) { 1115 mutex_enter(&wl->wl_mtx); 1116 wl->wl_bufbytes += bp->b_bufsize - oldsz; 1117 wl->wl_bcount += bp->b_bcount - oldcnt; 1118 mutex_exit(&wl->wl_mtx); 1119 } 1120} 1121 1122#endif /* _KERNEL */ 1123 1124/****************************************************************/ 1125/* Some utility inlines */ 1126 1127/* This is used to advance the pointer at old to new value at old+delta */ 1128static inline off_t 1129wapbl_advance(size_t size, size_t off, off_t old, size_t delta) 1130{ 1131 off_t new; 1132 1133 /* Define acceptable ranges for inputs. 
*/ 1134 KASSERT(delta <= (size_t)size); 1135 KASSERT((old == 0) || ((size_t)old >= off)); 1136 KASSERT(old < (off_t)(size + off)); 1137 1138 if ((old == 0) && (delta != 0)) 1139 new = off + delta; 1140 else if ((old + delta) < (size + off)) 1141 new = old + delta; 1142 else 1143 new = (old + delta) - size; 1144 1145 /* Note some interesting axioms */ 1146 KASSERT((delta != 0) || (new == old)); 1147 KASSERT((delta == 0) || (new != 0)); 1148 KASSERT((delta != (size)) || (new == old)); 1149 1150 /* Define acceptable ranges for output. */ 1151 KASSERT((new == 0) || ((size_t)new >= off)); 1152 KASSERT((size_t)new < (size + off)); 1153 return new; 1154} 1155 1156static inline size_t 1157wapbl_space_used(size_t avail, off_t head, off_t tail) 1158{ 1159 1160 if (tail == 0) { 1161 KASSERT(head == 0); 1162 return 0; 1163 } 1164 return ((head + (avail - 1) - tail) % avail) + 1; 1165} 1166 1167static inline size_t 1168wapbl_space_free(size_t avail, off_t head, off_t tail) 1169{ 1170 1171 return avail - wapbl_space_used(avail, head, tail); 1172} 1173 1174static inline void 1175wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, 1176 off_t *tailp) 1177{ 1178 off_t head = *headp; 1179 off_t tail = *tailp; 1180 1181 KASSERT(delta <= wapbl_space_free(size, head, tail)); 1182 head = wapbl_advance(size, off, head, delta); 1183 if ((tail == 0) && (head != 0)) 1184 tail = off; 1185 *headp = head; 1186 *tailp = tail; 1187} 1188 1189static inline void 1190wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, 1191 off_t *tailp) 1192{ 1193 off_t head = *headp; 1194 off_t tail = *tailp; 1195 1196 KASSERT(delta <= wapbl_space_used(size, head, tail)); 1197 tail = wapbl_advance(size, off, tail, delta); 1198 if (head == tail) { 1199 head = tail = 0; 1200 } 1201 *headp = head; 1202 *tailp = tail; 1203} 1204 1205#ifdef _KERNEL 1206 1207/****************************************************************/ 1208 1209/* 1210 * Remove transactions whose buffers are 
completely flushed to disk. 1211 * Will block until at least minfree space is available. 1212 * only intended to be called from inside wapbl_flush and therefore 1213 * does not protect against commit races with itself or with flush. 1214 */ 1215static int 1216wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) 1217{ 1218 size_t delta; 1219 size_t avail; 1220 off_t head; 1221 off_t tail; 1222 int error = 0; 1223 1224 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); 1225 KASSERT(rw_write_held(&wl->wl_rwlock)); 1226 1227 mutex_enter(&wl->wl_mtx); 1228 1229 /* 1230 * First check to see if we have to do a commit 1231 * at all. 1232 */ 1233 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); 1234 if (minfree < avail) { 1235 mutex_exit(&wl->wl_mtx); 1236 return 0; 1237 } 1238 minfree -= avail; 1239 while ((wl->wl_error_count == 0) && 1240 (wl->wl_reclaimable_bytes < minfree)) { 1241 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1242 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " 1243 "minfree=%zd\n", 1244 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, 1245 minfree)); 1246 1247 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); 1248 } 1249 if (wl->wl_reclaimable_bytes < minfree) { 1250 KASSERT(wl->wl_error_count); 1251 /* XXX maybe get actual error from buffer instead someday? */ 1252 error = EIO; 1253 } 1254 head = wl->wl_head; 1255 tail = wl->wl_tail; 1256 delta = wl->wl_reclaimable_bytes; 1257 1258 /* If all of of the entries are flushed, then be sure to keep 1259 * the reserved bytes reserved. Watch out for discarded transactions, 1260 * which could leave more bytes reserved than are reclaimable. 
1261 */ 1262 if (SIMPLEQ_EMPTY(&wl->wl_entries) && 1263 (delta >= wl->wl_reserved_bytes)) { 1264 delta -= wl->wl_reserved_bytes; 1265 } 1266 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, 1267 &tail); 1268 KDASSERT(wl->wl_reserved_bytes <= 1269 wapbl_space_used(wl->wl_circ_size, head, tail)); 1270 mutex_exit(&wl->wl_mtx); 1271 1272 if (error) 1273 return error; 1274 1275 if (waitonly) 1276 return 0; 1277 1278 /* 1279 * This is where head, tail and delta are unprotected 1280 * from races against itself or flush. This is ok since 1281 * we only call this routine from inside flush itself. 1282 * 1283 * XXX: how can it race against itself when accessed only 1284 * from behind the write-locked rwlock? 1285 */ 1286 error = wapbl_write_commit(wl, head, tail); 1287 if (error) 1288 return error; 1289 1290 wl->wl_head = head; 1291 wl->wl_tail = tail; 1292 1293 mutex_enter(&wl->wl_mtx); 1294 KASSERT(wl->wl_reclaimable_bytes >= delta); 1295 wl->wl_reclaimable_bytes -= delta; 1296 mutex_exit(&wl->wl_mtx); 1297 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1298 ("wapbl_truncate thread %d.%d truncating %zu bytes\n", 1299 curproc->p_pid, curlwp->l_lid, delta)); 1300 1301 return 0; 1302} 1303 1304/****************************************************************/ 1305 1306void 1307wapbl_biodone(struct buf *bp) 1308{ 1309 struct wapbl_entry *we = bp->b_private; 1310 struct wapbl *wl = we->we_wapbl; 1311#ifdef WAPBL_DEBUG_BUFBYTES 1312 const int bufsize = bp->b_bufsize; 1313#endif 1314 1315 /* 1316 * Handle possible flushing of buffers after log has been 1317 * decomissioned. 
1318 */ 1319 if (!wl) { 1320 KASSERT(we->we_bufcount > 0); 1321 we->we_bufcount--; 1322#ifdef WAPBL_DEBUG_BUFBYTES 1323 KASSERT(we->we_unsynced_bufbytes >= bufsize); 1324 we->we_unsynced_bufbytes -= bufsize; 1325#endif 1326 1327 if (we->we_bufcount == 0) { 1328#ifdef WAPBL_DEBUG_BUFBYTES 1329 KASSERT(we->we_unsynced_bufbytes == 0); 1330#endif 1331 pool_put(&wapbl_entry_pool, we); 1332 } 1333 1334 brelse(bp, 0); 1335 return; 1336 } 1337 1338#ifdef ohbother 1339 KDASSERT(bp->b_oflags & BO_DONE); 1340 KDASSERT(!(bp->b_oflags & BO_DELWRI)); 1341 KDASSERT(bp->b_flags & B_ASYNC); 1342 KDASSERT(bp->b_cflags & BC_BUSY); 1343 KDASSERT(!(bp->b_flags & B_LOCKED)); 1344 KDASSERT(!(bp->b_flags & B_READ)); 1345 KDASSERT(!(bp->b_cflags & BC_INVAL)); 1346 KDASSERT(!(bp->b_cflags & BC_NOCACHE)); 1347#endif 1348 1349 if (bp->b_error) { 1350#ifdef notyet /* Can't currently handle possible dirty buffer reuse */ 1351 /* 1352 * XXXpooka: interfaces not fully updated 1353 * Note: this was not enabled in the original patch 1354 * against netbsd4 either. I don't know if comment 1355 * above is true or not. 1356 */ 1357 1358 /* 1359 * If an error occurs, report the error and leave the 1360 * buffer as a delayed write on the LRU queue. 1361 * restarting the write would likely result in 1362 * an error spinloop, so let it be done harmlessly 1363 * by the syncer. 
1364 */ 1365 bp->b_flags &= ~(B_DONE); 1366 simple_unlock(&bp->b_interlock); 1367 1368 if (we->we_error == 0) { 1369 mutex_enter(&wl->wl_mtx); 1370 wl->wl_error_count++; 1371 mutex_exit(&wl->wl_mtx); 1372 cv_broadcast(&wl->wl_reclaimable_cv); 1373 } 1374 we->we_error = bp->b_error; 1375 bp->b_error = 0; 1376 brelse(bp); 1377 return; 1378#else 1379 /* For now, just mark the log permanently errored out */ 1380 1381 mutex_enter(&wl->wl_mtx); 1382 if (wl->wl_error_count == 0) { 1383 wl->wl_error_count++; 1384 cv_broadcast(&wl->wl_reclaimable_cv); 1385 } 1386 mutex_exit(&wl->wl_mtx); 1387#endif 1388 } 1389 1390 /* 1391 * Release the buffer here. wapbl_flush() may wait for the 1392 * log to become empty and we better unbusy the buffer before 1393 * wapbl_flush() returns. 1394 */ 1395 brelse(bp, 0); 1396 1397 mutex_enter(&wl->wl_mtx); 1398 1399 KASSERT(we->we_bufcount > 0); 1400 we->we_bufcount--; 1401#ifdef WAPBL_DEBUG_BUFBYTES 1402 KASSERT(we->we_unsynced_bufbytes >= bufsize); 1403 we->we_unsynced_bufbytes -= bufsize; 1404 KASSERT(wl->wl_unsynced_bufbytes >= bufsize); 1405 wl->wl_unsynced_bufbytes -= bufsize; 1406#endif 1407 1408 /* 1409 * If the current transaction can be reclaimed, start 1410 * at the beginning and reclaim any consecutive reclaimable 1411 * transactions. If we successfully reclaim anything, 1412 * then wakeup anyone waiting for the reclaim. 
1413 */ 1414 if (we->we_bufcount == 0) { 1415 size_t delta = 0; 1416 int errcnt = 0; 1417#ifdef WAPBL_DEBUG_BUFBYTES 1418 KDASSERT(we->we_unsynced_bufbytes == 0); 1419#endif 1420 /* 1421 * clear any posted error, since the buffer it came from 1422 * has successfully flushed by now 1423 */ 1424 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1425 (we->we_bufcount == 0)) { 1426 delta += we->we_reclaimable_bytes; 1427 if (we->we_error) 1428 errcnt++; 1429 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1430 pool_put(&wapbl_entry_pool, we); 1431 } 1432 1433 if (delta) { 1434 wl->wl_reclaimable_bytes += delta; 1435 KASSERT(wl->wl_error_count >= errcnt); 1436 wl->wl_error_count -= errcnt; 1437 cv_broadcast(&wl->wl_reclaimable_cv); 1438 } 1439 } 1440 1441 mutex_exit(&wl->wl_mtx); 1442} 1443 1444/* 1445 * Write transactions to disk + start I/O for contents 1446 */ 1447int 1448wapbl_flush(struct wapbl *wl, int waitfor) 1449{ 1450 struct buf *bp; 1451 struct wapbl_entry *we; 1452 off_t off; 1453 off_t head; 1454 off_t tail; 1455 size_t delta = 0; 1456 size_t flushsize; 1457 size_t reserved; 1458 int error = 0; 1459 1460 /* 1461 * Do a quick check to see if a full flush can be skipped 1462 * This assumes that the flush callback does not need to be called 1463 * unless there are other outstanding bufs. 
1464 */ 1465 if (!waitfor) { 1466 size_t nbufs; 1467 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1468 protect the KASSERTS */ 1469 nbufs = wl->wl_bufcount; 1470 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1471 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1472 mutex_exit(&wl->wl_mtx); 1473 if (nbufs == 0) 1474 return 0; 1475 } 1476 1477 /* 1478 * XXX we may consider using LK_UPGRADE here 1479 * if we want to call flush from inside a transaction 1480 */ 1481 rw_enter(&wl->wl_rwlock, RW_WRITER); 1482 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1483 wl->wl_dealloccnt); 1484 1485 /* 1486 * Now that we are fully locked and flushed, 1487 * do another check for nothing to do. 1488 */ 1489 if (wl->wl_bufcount == 0) { 1490 goto out; 1491 } 1492 1493#if 0 1494 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1495 ("wapbl_flush thread %d.%d flushing entries with " 1496 "bufcount=%zu bufbytes=%zu\n", 1497 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1498 wl->wl_bufbytes)); 1499#endif 1500 1501 /* Calculate amount of space needed to flush */ 1502 flushsize = wapbl_transaction_len(wl); 1503 if (wapbl_verbose_commit) { 1504 struct timespec ts; 1505 getnanotime(&ts); 1506 printf("%s: %lld.%09ld this transaction = %zu bytes\n", 1507 __func__, (long long)ts.tv_sec, 1508 (long)ts.tv_nsec, flushsize); 1509 } 1510 1511 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1512 /* 1513 * XXX this could be handled more gracefully, perhaps place 1514 * only a partial transaction in the log and allow the 1515 * remaining to flush without the protection of the journal. 
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	/* Make room in the log for this transaction. */
	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	/* Write blocks, revocations and inode records into the log. */
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		    " off=%"PRIdMAX" flush=%zu\n",
		    (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		    flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    " unsynced=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	    wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	/* The transaction is now committed on disk; update in-core state. */
	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * this flushes bufs in reverse order than they were queued
	 * it shouldn't matter, but if we care we could use TAILQ instead.
	 * XXX Note they will get put on the lru queue when they flush
	 * so we might actually want to change this to preserve order.
1622 */ 1623 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 1624 if (bbusy(bp, 0, 0, &wl->wl_mtx)) { 1625 continue; 1626 } 1627 bp->b_iodone = wapbl_biodone; 1628 bp->b_private = we; 1629 bremfree(bp); 1630 wapbl_remove_buf_locked(wl, bp); 1631 mutex_exit(&wl->wl_mtx); 1632 mutex_exit(&bufcache_lock); 1633 bawrite(bp); 1634 mutex_enter(&bufcache_lock); 1635 mutex_enter(&wl->wl_mtx); 1636 } 1637 mutex_exit(&wl->wl_mtx); 1638 mutex_exit(&bufcache_lock); 1639 1640#if 0 1641 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1642 ("wapbl_flush thread %d.%d done flushing entries...\n", 1643 curproc->p_pid, curlwp->l_lid)); 1644#endif 1645 1646 out: 1647 1648 /* 1649 * If the waitfor flag is set, don't return until everything is 1650 * fully flushed and the on disk log is empty. 1651 */ 1652 if (waitfor) { 1653 error = wapbl_truncate(wl, wl->wl_circ_size - 1654 wl->wl_reserved_bytes, wapbl_lazy_truncate); 1655 } 1656 1657 out2: 1658 if (error) { 1659 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, 1660 wl->wl_dealloclens, wl->wl_dealloccnt); 1661 } 1662 1663#ifdef WAPBL_DEBUG_PRINT 1664 if (error) { 1665 pid_t pid = -1; 1666 lwpid_t lid = -1; 1667 if (curproc) 1668 pid = curproc->p_pid; 1669 if (curlwp) 1670 lid = curlwp->l_lid; 1671 mutex_enter(&wl->wl_mtx); 1672#ifdef WAPBL_DEBUG_BUFBYTES 1673 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1674 ("wapbl_flush: thread %d.%d aborted flush: " 1675 "error = %d\n" 1676 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1677 "deallocs=%d inodes=%d\n" 1678 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1679 "unsynced=%zu\n", 1680 pid, lid, error, wl->wl_bufcount, 1681 wl->wl_bufbytes, wl->wl_bcount, 1682 wl->wl_dealloccnt, wl->wl_inohashcnt, 1683 wl->wl_error_count, wl->wl_reclaimable_bytes, 1684 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1685 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1686 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1687 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1688 "error = %d, unsynced = %zu\n", 1689 we->we_bufcount, 
we->we_reclaimable_bytes, 1690 we->we_error, we->we_unsynced_bufbytes)); 1691 } 1692#else 1693 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1694 ("wapbl_flush: thread %d.%d aborted flush: " 1695 "error = %d\n" 1696 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1697 "deallocs=%d inodes=%d\n" 1698 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1699 pid, lid, error, wl->wl_bufcount, 1700 wl->wl_bufbytes, wl->wl_bcount, 1701 wl->wl_dealloccnt, wl->wl_inohashcnt, 1702 wl->wl_error_count, wl->wl_reclaimable_bytes, 1703 wl->wl_reserved_bytes)); 1704 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1705 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1706 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1707 "error = %d\n", we->we_bufcount, 1708 we->we_reclaimable_bytes, we->we_error)); 1709 } 1710#endif 1711 mutex_exit(&wl->wl_mtx); 1712 } 1713#endif 1714 1715 rw_exit(&wl->wl_rwlock); 1716 return error; 1717} 1718 1719/****************************************************************/ 1720 1721void 1722wapbl_jlock_assert(struct wapbl *wl) 1723{ 1724 1725 KASSERT(rw_lock_held(&wl->wl_rwlock)); 1726} 1727 1728void 1729wapbl_junlock_assert(struct wapbl *wl) 1730{ 1731 1732 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1733} 1734 1735/****************************************************************/ 1736 1737/* locks missing */ 1738void 1739wapbl_print(struct wapbl *wl, 1740 int full, 1741 void (*pr)(const char *, ...)) 1742{ 1743 struct buf *bp; 1744 struct wapbl_entry *we; 1745 (*pr)("wapbl %p", wl); 1746 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1747 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1748 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1749 wl->wl_circ_size, wl->wl_circ_off, 1750 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1751 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1752 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1753#ifdef WAPBL_DEBUG_BUFBYTES 1754 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1755 "reserved = %zu 
errcnt = %d unsynced = %zu\n", 1756 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1757 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1758 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1759#else 1760 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1761 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1762 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1763 wl->wl_error_count); 1764#endif 1765 (*pr)("\tdealloccnt = %d, dealloclim = %d\n", 1766 wl->wl_dealloccnt, wl->wl_dealloclim); 1767 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", 1768 wl->wl_inohashcnt, wl->wl_inohashmask); 1769 (*pr)("entries:\n"); 1770 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1771#ifdef WAPBL_DEBUG_BUFBYTES 1772 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " 1773 "unsynced = %zu\n", 1774 we->we_bufcount, we->we_reclaimable_bytes, 1775 we->we_error, we->we_unsynced_bufbytes); 1776#else 1777 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", 1778 we->we_bufcount, we->we_reclaimable_bytes, we->we_error); 1779#endif 1780 } 1781 if (full) { 1782 int cnt = 0; 1783 (*pr)("bufs ="); 1784 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { 1785 if (!LIST_NEXT(bp, b_wapbllist)) { 1786 (*pr)(" %p", bp); 1787 } else if ((++cnt % 6) == 0) { 1788 (*pr)(" %p,\n\t", bp); 1789 } else { 1790 (*pr)(" %p,", bp); 1791 } 1792 } 1793 (*pr)("\n"); 1794 1795 (*pr)("dealloced blks = "); 1796 { 1797 int i; 1798 cnt = 0; 1799 for (i = 0; i < wl->wl_dealloccnt; i++) { 1800 (*pr)(" %"PRId64":%d,", 1801 wl->wl_deallocblks[i], 1802 wl->wl_dealloclens[i]); 1803 if ((++cnt % 4) == 0) { 1804 (*pr)("\n\t"); 1805 } 1806 } 1807 } 1808 (*pr)("\n"); 1809 1810 (*pr)("registered inodes = "); 1811 { 1812 int i; 1813 cnt = 0; 1814 for (i = 0; i <= wl->wl_inohashmask; i++) { 1815 struct wapbl_ino_head *wih; 1816 struct wapbl_ino *wi; 1817 1818 wih = &wl->wl_inohash[i]; 1819 LIST_FOREACH(wi, wih, wi_hash) { 1820 if (wi->wi_ino == 0) 1821 continue; 
1822 (*pr)(" %"PRId32"/0%06"PRIo32",", 1823 wi->wi_ino, wi->wi_mode); 1824 if ((++cnt % 4) == 0) { 1825 (*pr)("\n\t"); 1826 } 1827 } 1828 } 1829 (*pr)("\n"); 1830 } 1831 } 1832} 1833 1834#if defined(WAPBL_DEBUG) || defined(DDB) 1835void 1836wapbl_dump(struct wapbl *wl) 1837{ 1838#if defined(WAPBL_DEBUG) 1839 if (!wl) 1840 wl = wapbl_debug_wl; 1841#endif 1842 if (!wl) 1843 return; 1844 wapbl_print(wl, 1, printf); 1845} 1846#endif 1847 1848/****************************************************************/ 1849 1850void 1851wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) 1852{ 1853 1854 wapbl_jlock_assert(wl); 1855 1856 mutex_enter(&wl->wl_mtx); 1857 /* XXX should eventually instead tie this into resource estimation */ 1858 /* 1859 * XXX this panic needs locking/mutex analysis and the 1860 * ability to cope with the failure. 1861 */ 1862 /* XXX this XXX doesn't have enough XXX */ 1863 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) 1864 panic("wapbl_register_deallocation: out of resources"); 1865 1866 wl->wl_deallocblks[wl->wl_dealloccnt] = blk; 1867 wl->wl_dealloclens[wl->wl_dealloccnt] = len; 1868 wl->wl_dealloccnt++; 1869 WAPBL_PRINTF(WAPBL_PRINT_ALLOC, 1870 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); 1871 mutex_exit(&wl->wl_mtx); 1872} 1873 1874/****************************************************************/ 1875 1876static void 1877wapbl_inodetrk_init(struct wapbl *wl, u_int size) 1878{ 1879 1880 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); 1881 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { 1882 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, 1883 "wapblinopl", &pool_allocator_nointr, IPL_NONE); 1884 } 1885} 1886 1887static void 1888wapbl_inodetrk_free(struct wapbl *wl) 1889{ 1890 1891 /* XXX this KASSERT needs locking/mutex analysis */ 1892 KASSERT(wl->wl_inohashcnt == 0); 1893 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); 1894 if 
(atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { 1895 pool_destroy(&wapbl_ino_pool); 1896 } 1897} 1898 1899static struct wapbl_ino * 1900wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) 1901{ 1902 struct wapbl_ino_head *wih; 1903 struct wapbl_ino *wi; 1904 1905 KASSERT(mutex_owned(&wl->wl_mtx)); 1906 1907 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1908 LIST_FOREACH(wi, wih, wi_hash) { 1909 if (ino == wi->wi_ino) 1910 return wi; 1911 } 1912 return 0; 1913} 1914 1915void 1916wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1917{ 1918 struct wapbl_ino_head *wih; 1919 struct wapbl_ino *wi; 1920 1921 wi = pool_get(&wapbl_ino_pool, PR_WAITOK); 1922 1923 mutex_enter(&wl->wl_mtx); 1924 if (wapbl_inodetrk_get(wl, ino) == NULL) { 1925 wi->wi_ino = ino; 1926 wi->wi_mode = mode; 1927 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1928 LIST_INSERT_HEAD(wih, wi, wi_hash); 1929 wl->wl_inohashcnt++; 1930 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1931 ("wapbl_register_inode: ino=%"PRId64"\n", ino)); 1932 mutex_exit(&wl->wl_mtx); 1933 } else { 1934 mutex_exit(&wl->wl_mtx); 1935 pool_put(&wapbl_ino_pool, wi); 1936 } 1937} 1938 1939void 1940wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1941{ 1942 struct wapbl_ino *wi; 1943 1944 mutex_enter(&wl->wl_mtx); 1945 wi = wapbl_inodetrk_get(wl, ino); 1946 if (wi) { 1947 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1948 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); 1949 KASSERT(wl->wl_inohashcnt > 0); 1950 wl->wl_inohashcnt--; 1951 LIST_REMOVE(wi, wi_hash); 1952 mutex_exit(&wl->wl_mtx); 1953 1954 pool_put(&wapbl_ino_pool, wi); 1955 } else { 1956 mutex_exit(&wl->wl_mtx); 1957 } 1958} 1959 1960/****************************************************************/ 1961 1962static inline size_t 1963wapbl_transaction_inodes_len(struct wapbl *wl) 1964{ 1965 int blocklen = 1<<wl->wl_log_dev_bshift; 1966 int iph; 1967 1968 /* Calculate number of inodes described in a inodelist header */ 1969 iph = (blocklen - offsetof(struct 
wapbl_wc_inodelist, wc_inodes)) / 1970 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 1971 1972 KASSERT(iph > 0); 1973 1974 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen; 1975} 1976 1977 1978/* Calculate amount of space a transaction will take on disk */ 1979static size_t 1980wapbl_transaction_len(struct wapbl *wl) 1981{ 1982 int blocklen = 1<<wl->wl_log_dev_bshift; 1983 size_t len; 1984 int bph; 1985 1986 /* Calculate number of blocks described in a blocklist header */ 1987 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1988 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1989 1990 KASSERT(bph > 0); 1991 1992 len = wl->wl_bcount; 1993 len += howmany(wl->wl_bufcount, bph) * blocklen; 1994 len += howmany(wl->wl_dealloccnt, bph) * blocklen; 1995 len += wapbl_transaction_inodes_len(wl); 1996 1997 return len; 1998} 1999 2000/* 2001 * wapbl_cache_sync: issue DIOCCACHESYNC 2002 */ 2003static int 2004wapbl_cache_sync(struct wapbl *wl, const char *msg) 2005{ 2006 const bool verbose = wapbl_verbose_commit >= 2; 2007 struct bintime start_time; 2008 int force = 1; 2009 int error; 2010 2011 if (!wapbl_flush_disk_cache) { 2012 return 0; 2013 } 2014 if (verbose) { 2015 bintime(&start_time); 2016 } 2017 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 2018 FWRITE, FSCRED); 2019 if (error) { 2020 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 2021 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x " 2022 "returned %d\n", wl->wl_devvp->v_rdev, error)); 2023 } 2024 if (verbose) { 2025 struct bintime d; 2026 struct timespec ts; 2027 2028 bintime(&d); 2029 bintime_sub(&d, &start_time); 2030 bintime2timespec(&d, &ts); 2031 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n", 2032 msg, (uintmax_t)wl->wl_devvp->v_rdev, 2033 (uintmax_t)ts.tv_sec, ts.tv_nsec); 2034 } 2035 return error; 2036} 2037 2038/* 2039 * Perform commit operation 2040 * 2041 * Note that generation number incrementation needs to 2042 * be protected against racing with other 
invocations
 * of wapbl_write_commit. This is ok since this routine
 * is only invoked from wapbl_flush
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	daddr_t pbn;

	/* Push any buffered journal writes to the device first. */
	error = wapbl_buffered_flush(wl);
	if (error)
		return error;
	/*
	 * flush disk cache to ensure that blocks we've written are actually
	 * written to the stable storage before the commit header.
	 *
	 * XXX Calc checksum here, instead we do this for now
	 */
	wapbl_cache_sync(wl, "1");

	/* Fill in the commit record: new head/tail and a timestamp. */
	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
	    (intmax_t)head, (intmax_t)tail));

	/*
	 * write the commit header.
	 *
	 * XXX if generation will rollover, then first zero
	 * over second commit header before trying to write both headers.
	 */

	/* Alternate between the two header slots by generation parity. */
	pbn = wl->wl_logpbn + (wc->wc_generation % 2);
#ifdef _KERNEL
	pbn = btodb(pbn << wc->wc_log_dev_bshift);
#endif
	error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
	if (error)
		return error;
	error = wapbl_buffered_flush(wl);
	if (error)
		return error;

	/*
	 * flush disk cache to ensure that the commit header is actually
	 * written before meta data blocks.
	 */
	wapbl_cache_sync(wl, "2");

	/*
	 * If the generation number was zero, write it out a second time.
 * This handles initialization and generation number rollover
	 */
	if (wc->wc_generation++ == 0) {
		/* Recursive call writes the second header slot. */
		error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic should be able to be removed if we do the
		 * zero'ing mentioned above, and we are certain to roll
		 * back generation number on failure.
		 */
		if (error)
			panic("wapbl_write_commit: error writing duplicate "
			    "log header: %d\n", error);
	}
	return 0;
}

/* Returns new offset value */
/*
 * Write all buffers on wl->wl_bufs into the circular log as
 * blocklist records: a header block describing up to "bph" buffers
 * followed by the buffer contents, padded to the log block size.
 * *offp is advanced past everything written.
 * Caller holds wl_rwlock for writing.
 */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	/* Number of block descriptors that fit in one header block. */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	bp = LIST_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;	/* remember start of this batch */

		KASSERT(bp->b_flags & B_LOCKED);

		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		/* First pass: fill the header with up to bph descriptors. */
		while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode lock
			 * to call VOP_BMAP, then we can probably do it in
			 * bwrite when the vnode lock should already be held
			 * by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
			    (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		/* Pad the record out to a whole log block. */
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		/* Write the header, then second pass: the buffer data. */
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < bph)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (padding) {
			/* Zero-fill the pad so the log block is complete. */
			void *zero;

			zero = wapbl_alloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}

/*
 * Write the pending deallocations (block revocations) into the log
 * as WAPBL_WC_REVOCATIONS records so replay will not rewrite blocks
 * that were freed after being journaled.  *offp is advanced.
 */
static int
wapbl_write_revocations(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	off_t off = *offp;
	int error;

	if (wl->wl_dealloccnt == 0)
		return 0;

	/* Number of revocation entries per header block. */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	i = 0;
	while (i < wl->wl_dealloccnt) {
		wc->wc_type = WAPBL_WC_REVOCATIONS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
			    wl->wl_deallocblks[i];
			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
			    wl->wl_dealloclens[i];
			wc->wc_blkcount++;
			i++;
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	}
	*offp = off;
	return 0;
}

/*
 * Write all registered inodes into the log as WAPBL_WC_INODES
 * records.  The first record carries wc_clear = 1 so replay starts
 * from a clean inode list; at least one (possibly empty) record is
 * always written.  *offp is advanced.
 */
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	/* Number of inode entries per header block. */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			/* Skip over empty hash chains. */
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}

#endif /* _KERNEL */

/****************************************************************/

/*
 * Replay-time record of one journaled disk block: maps the block's
 * device address to its location in the log.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;
	daddr_t wb_blk;
	off_t wb_off; /* Offset of this block in
 the log */
};
#define WAPBL_BLKPOOL_MIN 83

/*
 * Allocate the replay block hash table with at least "size" buckets
 * (rounded up to a power of two in userland; hashinit does the same
 * in the kernel).
 */
static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
	if (size < WAPBL_BLKPOOL_MIN)
		size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == 0);
#ifdef _KERNEL
	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Manually implement hashinit */
	{
		unsigned long i, hashsize;
		/* Round up to the next power of two. */
		for (hashsize = 1; hashsize < size; hashsize <<= 1)
			continue;
		wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&wr->wr_blkhash[i]);
		wr->wr_blkhashmask = hashsize - 1;
	}
#endif /* ! _KERNEL */
}

/*
 * Release the replay block hash table.  All entries must already
 * have been removed (see wapbl_blkhash_clear()).
 */
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* !
 _KERNEL */
}

/*
 * Find the hash entry for device block "blk", or NULL if the block
 * is not recorded in the log.
 */
static struct wapbl_blk *
wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
	LIST_FOREACH(wb, wbh, wb_hash) {
		if (blk == wb->wb_blk)
			return wb;
	}
	return 0;
}

/*
 * Record that block "blk" lives at log offset "off".  If the block
 * is already present, only the offset is updated (the later copy in
 * the log wins during replay).
 */
static void
wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wb->wb_blk == blk);
		wb->wb_off = off;
	} else {
		wb = wapbl_alloc(sizeof(*wb));
		wb->wb_blk = blk;
		wb->wb_off = off;
		wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
		LIST_INSERT_HEAD(wbh, wb, wb_hash);
		wr->wr_blkhashcnt++;
	}
}

/*
 * Drop block "blk" from the hash table (used when the block is
 * revoked); a no-op when the block is not present.
 */
static void
wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wr->wr_blkhashcnt > 0);
		wr->wr_blkhashcnt--;
		LIST_REMOVE(wb, wb_hash);
		wapbl_free(wb, sizeof(*wb));
	}
}

/*
 * Empty the hash table entirely, freeing every entry.
 */
static void
wapbl_blkhash_clear(struct wapbl_replay *wr)
{
	unsigned long i;
	for (i = 0; i <= wr->wr_blkhashmask; i++) {
		struct wapbl_blk *wb;

		while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
			KASSERT(wr->wr_blkhashcnt > 0);
			wr->wr_blkhashcnt--;
			LIST_REMOVE(wb, wb_hash);
			wapbl_free(wb, sizeof(*wb));
		}
	}
	KASSERT(wr->wr_blkhashcnt == 0);
}

/****************************************************************/

/*
 * Read "len" bytes from the circular log area into "data", wrapping
 * at the end of the log.  "len" must be a multiple of the log block
 * size.  *offp is advanced (and wrapped) past the data read.
 */
static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	/* Bytes remaining before the end of the circular area. */
	slen = wr->wr_circ_off
	    + wr->wr_circ_size - off;
	if (slen < len) {
		/* The read wraps: first read up to the end of the log. */
		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
	error = wapbl_read(data, len, wr->wr_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}

/*
 * Advance *offp by "len" bytes within the circular log area,
 * wrapping at the end, without reading anything.
 */
static void
wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		len -= slen;
		off = wr->wr_circ_off;
	}
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
}

/****************************************************************/

/*
 * Open the journal found at block "off" of vnode "vp" for replay:
 * read and validate the commit headers, pick the newer generation,
 * build the block hash from the log contents, and return the replay
 * state in *wrp.  "blksize" is the device block size used until the
 * log header supplies the real shift values.
 */
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	/* Translate the file-relative log start to a device block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_alloc(MAXBSIZE);

	/* Read both commit header slots in one go. */
	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	/* Use whichever header carries the newer generation. */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	/* Size the hash by the number of fs blocks the log may hold. */
	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error =
	    wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

 errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}

/*
 * Close an open replay context: release the scratch buffer and the
 * block hash.  Safe to call on an already-stopped context.
 */
void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}

/*
 * Free a stopped replay context, including the saved inode list.
 */
void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}

#ifdef _KERNEL
/* Out-of-line wrapper for the wapbl_replay_isopen() macro. */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif

/*
 * Process a WAPBL_WC_BLOCKS record in wr->wr_scratch: record the log
 * offset of every physical block it describes, advancing *offp over
 * the block data as we go.
 */
static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
			    *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}

/*
 * Process a WAPBL_WC_REVOCATIONS record: forget any previously
 * recorded copies of the revoked blocks so they are not replayed.
 */
static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
	}
}

/*
 * Process a WAPBL_WC_INODES record: accumulate the listed inodes
 * into wr->wr_inodes, restarting the list when wc_clear is set.
 * "oldoff"/"newoff" are the record's log offsets before and after
 * reading it.
 */
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/* Grow the saved list and append this record's inodes. */
	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}

/*
 * Walk the log from "tail" to "head", dispatching each record to
 * the appropriate wapbl_replay_process_* handler and checking that
 * each record's wc_len matches the amount actually consumed.
 * On any error the block hash is cleared.
 */
static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		/* Cross-check record length against what was consumed. */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

 errout:
	wapbl_blkhash_clear(wr);
	return error;
}

#if 0
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	/*
	 * NOTE(review): this whole function is compiled out (#if 0) and
	 * references "wch", which is not declared here — it would need
	 * updating before being re-enabled.
	 */
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_alloc(MAXBSIZE);
	void *scratch2 = wapbl_alloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			{
				struct wapbl_wc_blocklist *wc =
				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
				int i;
				for (i = 0; i < wc->wc_blkcount; i++) {
					int foundcnt = 0;
					int dirtycnt = 0;
					int j, n;
					/*
					 * Check each physical block into the
					 * hashtable independently
					 */
					n = wc->wc_blocks[i].wc_dlen >>
					    wch->wc_fs_dev_bshift;
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
						if (wb && (wb->wb_off == off)) {
							foundcnt++;
							error =
							    wapbl_circ_read(wr,
							    scratch1, fsblklen,
							    &off);
							if (error)
								goto out;
							error =
							    wapbl_read(scratch2,
							    fsblklen, fsdevvp,
							    wb->wb_blk);
							if (error)
								goto out;
							if (memcmp(scratch1,
							    scratch2,
							    fsblklen)) {
								printf(
								    "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
								    wb->wb_blk, (intmax_t)off);
								dirtycnt++;
								mismatchcnt++;
							}
						} else {
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#if 0
					/*
					 * If all of the blocks in an entry
					 * are clean, then remove all of its
					 * blocks from the hashtable since they
					 * never will need replay.
2822 */ 2823 if ((foundcnt != 0) && 2824 (dirtycnt == 0)) { 2825 off = saveoff; 2826 wapbl_circ_advance(wr, 2827 logblklen, &off); 2828 for (j = 0; j < n; j++) { 2829 struct wapbl_blk *wb = 2830 wapbl_blkhash_get(wr, 2831 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2832 if (wb && 2833 (wb->wb_off == off)) { 2834 wapbl_blkhash_rem(wr, wb->wb_blk); 2835 } 2836 wapbl_circ_advance(wr, 2837 fsblklen, &off); 2838 } 2839 } 2840#endif 2841 } 2842 } 2843 break; 2844 case WAPBL_WC_REVOCATIONS: 2845 case WAPBL_WC_INODES: 2846 break; 2847 default: 2848 KASSERT(0); 2849 } 2850#ifdef DEBUG 2851 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2852 KASSERT(off == saveoff); 2853#endif 2854 } 2855 out: 2856 wapbl_free(scratch1, MAXBSIZE); 2857 wapbl_free(scratch2, MAXBSIZE); 2858 if (!error && mismatchcnt) 2859 error = EFTYPE; 2860 return error; 2861} 2862#endif 2863 2864int 2865wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) 2866{ 2867 struct wapbl_blk *wb; 2868 size_t i; 2869 off_t off; 2870 void *scratch; 2871 int error = 0; 2872 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2873 2874 KDASSERT(wapbl_replay_isopen(wr)); 2875 2876 scratch = wapbl_alloc(MAXBSIZE); 2877 2878 for (i = 0; i <= wr->wr_blkhashmask; ++i) { 2879 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) { 2880 off = wb->wb_off; 2881 error = wapbl_circ_read(wr, scratch, fsblklen, &off); 2882 if (error) 2883 break; 2884 error = wapbl_write(scratch, fsblklen, fsdevvp, 2885 wb->wb_blk); 2886 if (error) 2887 break; 2888 } 2889 } 2890 2891 wapbl_free(scratch, MAXBSIZE); 2892 return error; 2893} 2894 2895int 2896wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) 2897{ 2898 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2899 2900 KDASSERT(wapbl_replay_isopen(wr)); 2901 KASSERT((len % fsblklen) == 0); 2902 2903 while (len != 0) { 2904 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2905 if (wb) 2906 return 1; 2907 len -= fsblklen; 2908 } 2909 return 0; 2910} 2911 2912int 
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	/*
	 * For each fs block in the range, overlay the journaled copy
	 * (if any) onto the caller's buffer; blocks without a log copy
	 * are left untouched in "data".
	 */
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;
		len -= fsblklen;
		blk++;
	}
	return 0;
}

#ifdef _KERNEL
/*
 * This is not really a module now, but maybe on it's way to
 * being one some day.
 */
MODULE(MODULE_CLASS_VFS, wapbl, NULL);

/* Module control entry point: init on load; unload not yet supported. */
static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		wapbl_init();
		return 0;
	case MODULE_CMD_FINI:
#ifdef notyet
		return wapbl_fini(true);
#endif
		return EOPNOTSUPP;
	default:
		return ENOTTY;
	}
}
#endif /* _KERNEL */