1/* 2 * Copyright (c) 2002-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28// 29// This file implements a simple write-ahead journaling layer. 30// In theory any file system can make use of it by calling these 31// functions when the fs wants to modify meta-data blocks. See 32// vfs_journal.h for a more detailed description of the api and 33// data structures. 34// 35// Dominic Giampaolo (dbg@apple.com) 36// 37 38#ifdef KERNEL 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/kernel.h> 43#include <sys/file_internal.h> 44#include <sys/stat.h> 45#include <sys/buf_internal.h> 46#include <sys/proc_internal.h> 47#include <sys/mount_internal.h> 48#include <sys/namei.h> 49#include <sys/vnode_internal.h> 50#include <sys/ioctl.h> 51#include <sys/tty.h> 52#include <sys/ubc.h> 53#include <sys/malloc.h> 54#include <kern/task.h> 55#include <kern/thread.h> 56#include <kern/kalloc.h> 57#include <sys/disk.h> 58#include <sys/kdebug.h> 59#include <miscfs/specfs/specdev.h> 60#include <libkern/OSAtomic.h> /* OSAddAtomic */ 61 62kern_return_t thread_terminate(thread_t); 63 64/* 65 * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT 66 * logging of trim-related calls within the journal. (They're 67 * disabled by default because there can be a lot of these events, 68 * and we don't want to overwhelm the kernel debug buffer. If you 69 * want to watch these events in particular, just set the sysctl.) 70 */ 71static int jnl_kdebug = 0; 72SYSCTL_DECL(_vfs_generic); 73SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal"); 74SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug"); 75SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM"); 76 77#define DBG_JOURNAL_FLUSH FSDBG_CODE(DBG_JOURNAL, 1) 78#define DBG_JOURNAL_TRIM_ADD FSDBG_CODE(DBG_JOURNAL, 2) 79#define DBG_JOURNAL_TRIM_REMOVE FSDBG_CODE(DBG_JOURNAL, 3) 80#define DBG_JOURNAL_TRIM_REMOVE_PENDING FSDBG_CODE(DBG_JOURNAL, 4) 81#define DBG_JOURNAL_TRIM_REALLOC FSDBG_CODE(DBG_JOURNAL, 5) 82#define DBG_JOURNAL_TRIM_FLUSH FSDBG_CODE(DBG_JOURNAL, 6) 83#define DBG_JOURNAL_TRIM_UNMAP FSDBG_CODE(DBG_JOURNAL, 7) 84 85/* 86 * Cap the journal max size to 2GB. On HFS, it will attempt to occupy 87 * a full allocation block if the current size is smaller than the allocation 88 * block on which it resides. Once we hit the exabyte filesystem range, then 89 * it will use 2GB allocation blocks. As a result, make the cap 2GB. 90 */ 91#define MAX_JOURNAL_SIZE 0x80000000U 92 93#include <sys/sdt.h> /* DTRACE_IO1 */ 94#else 95 96#include <stdio.h> 97#include <stdlib.h> 98#include <string.h> 99#include <limits.h> 100#include <errno.h> 101#include <fcntl.h> 102#include <unistd.h> 103#include <stdarg.h> 104#include <sys/types.h> 105#include "compat.h" 106 107#endif /* KERNEL */ 108 109#include "vfs_journal.h" 110 111#include <sys/kdebug.h> 112 113#if 0 114#undef KERNEL_DEBUG 115#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT 116#endif 117 118 119#ifndef CONFIG_HFS_TRIM 120#define CONFIG_HFS_TRIM 0 121#endif 122 123 124#if JOURNALING 125 126// 127// By default, we grow the list of extents to trim by 4K at a time. 128// We'll opt to flush a transaction if it contains at least 129// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number 130// of modified blocks is small). 131// 132enum { 133 JOURNAL_DEFAULT_TRIM_BYTES = 4096, 134 JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t), 135 JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16 136}; 137 138unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS; 139SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush"); 140 141/* XXX next prototype should be from libsa/stdlib.h> but conflicts libkern */ 142__private_extern__ void qsort( 143 void * array, 144 size_t nmembers, 145 size_t member_size, 146 int (*)(const void *, const void *)); 147 148 149 150// number of bytes to checksum in a block_list_header 151// NOTE: this should be enough to clear out the header 152// fields as well as the first entry of binfo[] 153#define BLHDR_CHECKSUM_SIZE 32 154 155static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name); 156static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name); 157static void unlock_condition(journal *jnl, boolean_t *condition); 158static void finish_end_thread(transaction *tr); 159static void write_header_thread(journal *jnl); 160static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg); 161static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait); 162static void abort_transaction(journal *jnl, transaction *tr); 163static void dump_journal(journal *jnl); 164 165static __inline__ void lock_oldstart(journal *jnl); 166static __inline__ void unlock_oldstart(journal *jnl); 167static __inline__ void lock_flush(journal *jnl); 168static __inline__ void unlock_flush(journal *jnl); 169 170 171// 172// 3105942 - Coalesce writes to the same block on journal replay 173// 174 175typedef struct bucket { 176 off_t block_num; 177 uint32_t jnl_offset; 178 uint32_t block_size; 179 int32_t cksum; 180} bucket; 181 182#define STARTING_BUCKETS 256 183 184static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr); 185static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size); 186static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full); 187static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr); 188static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting); 189 190#define CHECK_JOURNAL(jnl) \ 191 do { \ 192 if (jnl == NULL) { \ 193 panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \ 194 } \ 195 if (jnl->jdev == NULL) { \ 196 panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \ 197 } \ 198 if (jnl->fsdev == NULL) { \ 199 panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \ 200 } \ 201 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \ 202 panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \ 203 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \ 204 } \ 205 if ( jnl->jhdr->start <= 0 \ 206 || jnl->jhdr->start > jnl->jhdr->size) { \ 207 panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ 208 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \ 209 } \ 210 if ( jnl->jhdr->end <= 0 \ 211 || jnl->jhdr->end > jnl->jhdr->size) { \ 212 panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ 213 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \ 214 } \ 215 } while(0) 216 217#define CHECK_TRANSACTION(tr) \ 218 do { \ 219 if (tr == NULL) { \ 220 panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \ 221 } \ 222 if (tr->jnl == NULL) { \ 223 panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \ 224 } \ 225 if (tr->blhdr != (block_list_header *)tr->tbuffer) { \ 226 panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \ 227 } \ 228 if (tr->total_bytes < 0) { \ 229 panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \ 230 } \ 231 if (tr->journal_start < 0) { \ 232 panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \ 233 } \ 234 if (tr->journal_end < 0) { \ 235 panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \ 236 } \ 237 if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \ 238 panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \ 239 } \ 240 } while(0) 241 242 243 244// 245// this isn't a great checksum routine but it will do for now. 246// we use it to checksum the journal header and the block list 247// headers that are at the start of each transaction. 248// 249static unsigned int 250calc_checksum(char *ptr, int len) 251{ 252 int i; 253 unsigned int cksum=0; 254 255 // this is a lame checksum but for now it'll do 256 for(i = 0; i < len; i++, ptr++) { 257 cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr); 258 } 259 260 return (~cksum); 261} 262 263// 264// Journal Locking 265// 266lck_grp_attr_t * jnl_group_attr; 267lck_attr_t * jnl_lock_attr; 268lck_grp_t * jnl_mutex_group; 269 270void 271journal_init(void) 272{ 273 jnl_lock_attr = lck_attr_alloc_init(); 274 jnl_group_attr = lck_grp_attr_alloc_init(); 275 jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr); 276} 277 278__inline__ void 279journal_lock(journal *jnl) 280{ 281 lck_mtx_lock(&jnl->jlock); 282 if (jnl->owner) { 283 panic ("jnl: owner is %p, expected NULL\n", jnl->owner); 284 } 285 jnl->owner = current_thread(); 286} 287 288__inline__ void 289journal_unlock(journal *jnl) 290{ 291 jnl->owner = NULL; 292 lck_mtx_unlock(&jnl->jlock); 293} 294 295static __inline__ void 296lock_flush(journal *jnl) 297{ 298 lck_mtx_lock(&jnl->flock); 299} 300 301static __inline__ void 302unlock_flush(journal *jnl) 303{ 304 lck_mtx_unlock(&jnl->flock); 305} 306 307static __inline__ void 308lock_oldstart(journal *jnl) 309{ 310 lck_mtx_lock(&jnl->old_start_lock); 311} 312 313static __inline__ void 314unlock_oldstart(journal *jnl) 315{ 316 lck_mtx_unlock(&jnl->old_start_lock); 317} 318 319 320 321#define JNL_WRITE 0x0001 322#define JNL_READ 0x0002 323#define JNL_HEADER 0x8000 324 325// 326// This function sets up a fake buf and passes it directly to the 327// journal device strategy routine (so that it won't get cached in 328// the block cache. 329// 330// It also handles range checking the i/o so that we don't write 331// outside the journal boundaries and it will wrap the i/o back 332// to the beginning if necessary (skipping over the journal header) 333// 334static size_t 335do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) 336{ 337 int err, curlen=len; 338 size_t io_sz = 0; 339 buf_t bp; 340 off_t max_iosize; 341 struct bufattr *bap; 342 343 if (*offset < 0 || *offset > jnl->jhdr->size) { 344 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size); 345 } 346 347 if (direction & JNL_WRITE) 348 max_iosize = jnl->max_write_size; 349 else if (direction & JNL_READ) 350 max_iosize = jnl->max_read_size; 351 else 352 max_iosize = 128 * 1024; 353 354again: 355 bp = alloc_io_buf(jnl->jdev, 1); 356 357 if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { 358 if (*offset == jnl->jhdr->size) { 359 *offset = jnl->jhdr->jhdr_size; 360 } else { 361 curlen = (off_t)jnl->jhdr->size - *offset; 362 } 363 } 364 365 if (curlen > max_iosize) { 366 curlen = max_iosize; 367 } 368 369 if (curlen <= 0) { 370 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len); 371 } 372 373 if (*offset == 0 && (direction & JNL_HEADER) == 0) { 374 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data); 375 } 376 377 /* 378 * As alluded to in the block comment at the top of the function, we use a "fake" iobuf 379 * here and issue directly to the disk device that the journal protects since we don't 380 * want this to enter the block cache. As a result, we lose the ability to mark it 381 * as a metadata buf_t for the layers below us that may care. If we were to 382 * simply attach the B_META flag into the b_flags this may confuse things further 383 * since this is an iobuf, not a metadata buffer. 384 * 385 * To address this, we use the extended bufattr struct embedded in the bp. 386 * Explicitly mark the buf here as a metadata buffer in its bufattr flags. 387 */ 388 bap = &bp->b_attr; 389 bap->ba_flags |= BA_META; 390 391 if (direction & JNL_READ) 392 buf_setflags(bp, B_READ); 393 else { 394 /* 395 * don't have to set any flags 396 */ 397 vnode_startwrite(jnl->jdev); 398 } 399 buf_setsize(bp, curlen); 400 buf_setcount(bp, curlen); 401 buf_setdataptr(bp, (uintptr_t)data); 402 buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); 403 buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); 404 405 if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) { 406 buf_markfua(bp); 407 } 408 409 DTRACE_IO1(journal__start, buf_t, bp); 410 err = VNOP_STRATEGY(bp); 411 if (!err) { 412 err = (int)buf_biowait(bp); 413 } 414 DTRACE_IO1(journal__done, buf_t, bp); 415 free_io_buf(bp); 416 417 if (err) { 418 printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err); 419 return 0; 420 } 421 422 *offset += curlen; 423 io_sz += curlen; 424 425 if (io_sz != len) { 426 // handle wrap-around 427 data = (char *)data + curlen; 428 curlen = len - io_sz; 429 if (*offset >= jnl->jhdr->size) { 430 *offset = jnl->jhdr->jhdr_size; 431 } 432 goto again; 433 } 434 435 return io_sz; 436} 437 438static size_t 439read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) 440{ 441 return do_journal_io(jnl, offset, data, len, JNL_READ); 442} 443 444static size_t 445write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) 446{ 447 return do_journal_io(jnl, offset, data, len, JNL_WRITE); 448} 449 450 451static size_t 452read_journal_header(journal *jnl, void *data, size_t len) 453{ 454 off_t hdr_offset = 0; 455 456 return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER); 457} 458 459static int 460write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num) 461{ 462 static int num_err_prints = 0; 463 int ret=0; 464 off_t jhdr_offset = 0; 465 struct vfs_context context; 466 467 context.vc_thread = current_thread(); 468 context.vc_ucred = NOCRED; 469 // 470 // Flush the track cache if we're not doing force-unit-access 471 // writes. 472 // 473 if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { 474 ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); 475 } 476 if (ret != 0) { 477 // 478 // Only print this error if it's a different error than the 479 // previous one, or if it's the first time for this device 480 // or if the total number of printfs is less than 25. We 481 // allow for up to 25 printfs to insure that some make it 482 // into the on-disk syslog. Otherwise if we only printed 483 // one, it's possible it would never make it to the syslog 484 // for the root volume and that makes debugging hard. 485 // 486 if ( ret != jnl->last_flush_err 487 || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0 488 || num_err_prints++ < 25) { 489 490 printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret); 491 492 jnl->flags |= JOURNAL_FLUSHCACHE_ERR; 493 jnl->last_flush_err = ret; 494 } 495 } 496 497 jnl->jhdr->sequence_num = sequence_num; 498 jnl->jhdr->checksum = 0; 499 jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); 500 501 if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) { 502 printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name); 503 jnl->flags |= JOURNAL_INVALID; 504 return -1; 505 } 506 507 // If we're not doing force-unit-access writes, then we 508 // have to flush after writing the journal header so that 509 // a future transaction doesn't sneak out to disk before 510 // the header does and thus overwrite data that the old 511 // journal header refers to. Saw this exact case happen 512 // on an IDE bus analyzer with Larry Barras so while it 513 // may seem obscure, it's not. 514 // 515 if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { 516 VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); 517 } 518 519 return 0; 520} 521 522 523 524// 525// this is a work function used to free up transactions that 526// completed. they can't be free'd from buffer_flushed_callback 527// because it is called from deep with the disk driver stack 528// and thus can't do something that would potentially cause 529// paging. it gets called by each of the journal api entry 530// points so stuff shouldn't hang around for too long. 531// 532static void 533free_old_stuff(journal *jnl) 534{ 535 transaction *tr, *next; 536 block_list_header *blhdr=NULL, *next_blhdr=NULL; 537 538 if (jnl->tr_freeme == NULL) 539 return; 540 541 lock_oldstart(jnl); 542 tr = jnl->tr_freeme; 543 jnl->tr_freeme = NULL; 544 unlock_oldstart(jnl); 545 546 for(; tr; tr=next) { 547 for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) { 548 next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum); 549 blhdr->binfo[0].bnum = 0xdeadc0de; 550 551 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); 552 553 KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0); 554 } 555 next = tr->next; 556 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); 557 } 558} 559 560 561 562// 563// This is our callback that lets us know when a buffer has been 564// flushed to disk. It's called from deep within the driver stack 565// and thus is quite limited in what it can do. Notably, it can 566// not initiate any new i/o's or allocate/free memory. 567// 568static void 569buffer_flushed_callback(struct buf *bp, void *arg) 570{ 571 transaction *tr; 572 journal *jnl; 573 transaction *ctr, *prev=NULL, *next; 574 size_t i; 575 int bufsize, amt_flushed, total_bytes; 576 577 578 //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", 579 // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); 580 581 // snarf out the bits we want 582 bufsize = buf_size(bp); 583 tr = (transaction *)arg; 584 585 // then we've already seen it 586 if (tr == NULL) { 587 return; 588 } 589 590 CHECK_TRANSACTION(tr); 591 592 jnl = tr->jnl; 593 594 CHECK_JOURNAL(jnl); 595 596 amt_flushed = tr->num_killed; 597 total_bytes = tr->total_bytes; 598 599 // update the number of blocks that have been flushed. 600 // this buf may represent more than one block so take 601 // that into account. 602 // 603 // OSAddAtomic() returns the value of tr->num_flushed before the add 604 // 605 amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed); 606 607 608 // if this transaction isn't done yet, just return as 609 // there is nothing to do. 610 // 611 // NOTE: we are careful to not reference anything through 612 // the tr pointer after doing the OSAddAtomic(). if 613 // this if statement fails then we are the last one 614 // and then it's ok to dereference "tr". 615 // 616 if ((amt_flushed + bufsize) < total_bytes) { 617 return; 618 } 619 620 // this will single thread checking the transaction 621 lock_oldstart(jnl); 622 623 if (tr->total_bytes == (int)0xfbadc0de) { 624 // then someone beat us to it... 625 unlock_oldstart(jnl); 626 return; 627 } 628 629 // mark this so that we're the owner of dealing with the 630 // cleanup for this transaction 631 tr->total_bytes = 0xfbadc0de; 632 633 if (jnl->flags & JOURNAL_INVALID) 634 goto transaction_done; 635 636 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", 637 // tr, tr->journal_start, tr->journal_end, jnl); 638 639 // find this entry in the old_start[] index and mark it completed 640 for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { 641 642 if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) { 643 jnl->old_start[i] &= ~(0x8000000000000000ULL); 644 break; 645 } 646 } 647 648 if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { 649 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n", 650 tr->journal_start, tr, jnl); 651 } 652 653 654 // if we are here then we need to update the journal header 655 // to reflect that this transaction is complete 656 if (tr->journal_start == jnl->active_start) { 657 jnl->active_start = tr->journal_end; 658 tr->journal_start = tr->journal_end = (off_t)0; 659 } 660 661 // go through the completed_trs list and try to coalesce 662 // entries, restarting back at the beginning if we have to. 663 for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) { 664 if (ctr->journal_start == jnl->active_start) { 665 jnl->active_start = ctr->journal_end; 666 if (prev) { 667 prev->next = ctr->next; 668 } 669 if (ctr == jnl->completed_trs) { 670 jnl->completed_trs = ctr->next; 671 } 672 673 next = jnl->completed_trs; // this starts us over again 674 ctr->next = jnl->tr_freeme; 675 jnl->tr_freeme = ctr; 676 ctr = NULL; 677 } else if (tr->journal_end == ctr->journal_start) { 678 ctr->journal_start = tr->journal_start; 679 next = jnl->completed_trs; // this starts us over again 680 ctr = NULL; 681 tr->journal_start = tr->journal_end = (off_t)0; 682 } else if (tr->journal_start == ctr->journal_end) { 683 ctr->journal_end = tr->journal_end; 684 next = ctr->next; 685 tr->journal_start = tr->journal_end = (off_t)0; 686 } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) { 687 // coalesce the next entry with this one and link the next 688 // entry in at the head of the tr_freeme list 689 next = ctr->next; // temporarily use the "next" variable 690 ctr->journal_end = next->journal_end; 691 ctr->next = next->next; 692 next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list 693 jnl->tr_freeme = next; 694 695 next = jnl->completed_trs; // this starts us over again 696 ctr = NULL; 697 } else { 698 next = ctr->next; 699 } 700 } 701 702 // if this is true then we didn't merge with anyone 703 // so link ourselves in at the head of the completed 704 // transaction list. 705 if (tr->journal_start != 0) { 706 // put this entry into the correct sorted place 707 // in the list instead of just at the head. 708 // 709 710 prev = NULL; 711 for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { 712 // just keep looping 713 } 714 715 if (ctr == NULL && prev == NULL) { 716 jnl->completed_trs = tr; 717 tr->next = NULL; 718 } else if (ctr == jnl->completed_trs) { 719 tr->next = jnl->completed_trs; 720 jnl->completed_trs = tr; 721 } else { 722 tr->next = prev->next; 723 prev->next = tr; 724 } 725 } else { 726 // if we're here this tr got merged with someone else so 727 // put it on the list to be free'd 728 tr->next = jnl->tr_freeme; 729 jnl->tr_freeme = tr; 730 } 731transaction_done: 732 unlock_oldstart(jnl); 733 734 unlock_condition(jnl, &jnl->asyncIO); 735} 736 737 738#include <libkern/OSByteOrder.h> 739 740#define SWAP16(x) OSSwapInt16(x) 741#define SWAP32(x) OSSwapInt32(x) 742#define SWAP64(x) OSSwapInt64(x) 743 744 745static void 746swap_journal_header(journal *jnl) 747{ 748 jnl->jhdr->magic = SWAP32(jnl->jhdr->magic); 749 jnl->jhdr->endian = SWAP32(jnl->jhdr->endian); 750 jnl->jhdr->start = SWAP64(jnl->jhdr->start); 751 jnl->jhdr->end = SWAP64(jnl->jhdr->end); 752 jnl->jhdr->size = SWAP64(jnl->jhdr->size); 753 jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size); 754 jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum); 755 jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size); 756 jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num); 757} 758 759static void 760swap_block_list_header(journal *jnl, block_list_header *blhdr) 761{ 762 int i; 763 764 blhdr->max_blocks = SWAP16(blhdr->max_blocks); 765 blhdr->num_blocks = SWAP16(blhdr->num_blocks); 766 blhdr->bytes_used = SWAP32(blhdr->bytes_used); 767 blhdr->checksum = SWAP32(blhdr->checksum); 768 blhdr->flags = SWAP32(blhdr->flags); 769 770 if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { 771 printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size); 772 return; 773 } 774 775 for(i = 0; i < blhdr->num_blocks; i++) { 776 blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum); 777 blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize); 778 blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum); 779 } 780} 781 782 783static int 784update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) 785{ 786 int ret; 787 struct buf *oblock_bp=NULL; 788 789 // first read the block we want. 790 ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); 791 if (ret != 0) { 792 printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret); 793 794 if (oblock_bp) { 795 buf_brelse(oblock_bp); 796 oblock_bp = NULL; 797 } 798 799 // let's try to be aggressive here and just re-write the block 800 oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META); 801 if (oblock_bp == NULL) { 802 printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block); 803 return -1; 804 } 805 } 806 807 // make sure it's the correct size. 808 if (buf_size(oblock_bp) != bsize) { 809 buf_brelse(oblock_bp); 810 return -1; 811 } 812 813 // copy the journal data over top of it 814 memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize); 815 816 if ((ret = VNOP_BWRITE(oblock_bp)) != 0) { 817 printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret); 818 return ret; 819 } 820 821 // and now invalidate it so that if someone else wants to read 822 // it in a different size they'll be able to do it. 823 ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); 824 if (oblock_bp) { 825 buf_markinvalid(oblock_bp); 826 buf_brelse(oblock_bp); 827 } 828 829 return 0; 830} 831 832static int 833grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) 834{ 835 struct bucket *newBuf; 836 int current_size = num_buckets, i; 837 838 // return if newsize is less than the current size 839 if (new_size < num_buckets) { 840 return current_size; 841 } 842 843 if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { 844 printf("jnl: grow_table: no memory to expand coalesce buffer!\n"); 845 return -1; 846 } 847 848 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); 849 850 // copy existing elements 851 bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); 852 853 // initialize the new ones 854 for(i = num_buckets; i < new_size; i++) { 855 newBuf[i].block_num = (off_t)-1; 856 } 857 858 // free the old container 859 FREE(*buf_ptr, M_TEMP); 860 861 // reset the buf_ptr 862 *buf_ptr = newBuf; 863 864 return new_size; 865} 866 867static int 868lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) 869{ 870 int lo, hi, index, matches, i; 871 872 if (num_full == 0) { 873 return 0; // table is empty, so insert at index=0 874 } 875 876 lo = 0; 877 hi = num_full - 1; 878 index = -1; 879 880 // perform binary search for block_num 881 do { 882 int mid = (hi - lo)/2 + lo; 883 off_t this_num = (*buf_ptr)[mid].block_num; 884 885 if (block_num == this_num) { 886 index = mid; 887 break; 888 } 889 890 if (block_num < this_num) { 891 hi = mid; 892 continue; 893 } 894 895 if (block_num > this_num) { 896 lo = mid + 1; 897 continue; 898 } 899 } while (lo < hi); 900 901 // check if lo and hi converged on the match 902 if (block_num == (*buf_ptr)[hi].block_num) { 903 index = hi; 904 } 905 906 // if no existing entry found, find index for new one 907 if (index == -1) { 908 index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1; 909 } else { 910 // make sure that we return the right-most index in the case of multiple matches 911 matches = 0; 912 i = index + 1; 913 while (i < num_full && block_num == (*buf_ptr)[i].block_num) { 914 matches++; 915 i++; 916 } 917 918 index += matches; 919 } 920 921 return index; 922} 923 924static int 925insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) 926{ 927 if (!overwriting) { 928 // grow the table if we're out of space 929 if (*num_full_ptr >= *num_buckets_ptr) { 930 int new_size = *num_buckets_ptr * 2; 931 int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); 932 933 if (grow_size < new_size) { 934 printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name); 935 return -1; 936 } 937 938 *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size 939 } 940 941 // if we're not inserting at the end, we need to bcopy 942 if (blk_index != *num_full_ptr) { 943 bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); 944 } 945 946 (*num_full_ptr)++; // increment only if we're not overwriting 947 } 948 949 // sanity check the values we're about to add 950 if ((off_t)offset >= jnl->jhdr->size) { 951 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); 952 } 953 if (size <= 0) { 954 panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); 955 } 956 957 (*buf_ptr)[blk_index].block_num = num; 958 (*buf_ptr)[blk_index].block_size = size; 959 (*buf_ptr)[blk_index].jnl_offset = offset; 960 (*buf_ptr)[blk_index].cksum = cksum; 961 962 return blk_index; 963} 964 965static int 966do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) 967{ 968 int num_to_remove, index, i, overwrite, err; 969 size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset; 970 off_t overlap, block_start, block_end; 971 972 block_start = block_num*jhdr_size; 973 block_end = block_start + size; 974 overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size); 975 976 // first, eliminate any overlap with the previous entry 977 if (blk_index != 0 && !overwrite) { 978 off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size; 979 off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size; 980 overlap = prev_block_end - block_start; 981 if (overlap > 0) { 982 if (overlap % jhdr_size != 0) { 983 panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size); 984 } 985 986 // if the previous entry completely overlaps this one, we need to break it into two pieces. 987 if (prev_block_end > block_end) { 988 off_t new_num = block_end / jhdr_size; 989 size_t new_size = prev_block_end - block_end; 990 991 new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); 992 993 err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0); 994 if (err < 0) { 995 panic("jnl: do_overlap: error inserting during pre-overlap\n"); 996 } 997 } 998 999 // Regardless, we need to truncate the previous entry to the beginning of the overlap 1000 (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start; 1001 (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it 1002 } 1003 } 1004 1005 // then, bail out fast if there's no overlap with the entries that follow 1006 if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) { 1007 return 0; // no overlap, no overwrite 1008 } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) { 1009 1010 (*buf_ptr)[blk_index].cksum = cksum; // update this 1011 return 1; // simple overwrite 1012 } 1013 1014 // Otherwise, find all cases of total and partial overlap. We use the special 1015 // block_num of -2 to designate entries that are completely overlapped and must 1016 // be eliminated. The block_num, size, and jnl_offset of partially overlapped 1017 // entries must be adjusted to keep the array consistent. 1018 index = blk_index; 1019 num_to_remove = 0; 1020 while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) { 1021 if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) { 1022 (*buf_ptr)[index].block_num = -2; // mark this for deletion 1023 num_to_remove++; 1024 } else { 1025 overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; 1026 if (overlap > 0) { 1027 if (overlap % jhdr_size != 0) { 1028 panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size); 1029 } 1030 1031 // if we partially overlap this entry, adjust its block number, jnl offset, and size 1032 (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up 1033 (*buf_ptr)[index].cksum = 0; 1034 1035 new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around 1036 if ((off_t)new_offset >= jnl->jhdr->size) { 1037 new_offset = jhdr_size + (new_offset - jnl->jhdr->size); 1038 } 1039 (*buf_ptr)[index].jnl_offset = new_offset; 1040 1041 (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value 1042 if ((*buf_ptr)[index].block_size <= 0) { 1043 panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size); 1044 // return -1; // if above panic is removed, return -1 for error 1045 } 1046 } 1047 1048 } 1049 1050 index++; 1051 } 1052 1053 // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out) 1054 index--; // start with the last index used within the above loop 1055 while (index >= blk_index) { 1056 if ((*buf_ptr)[index].block_num == -2) { 1057 if (index == *num_full_ptr-1) { 1058 (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free 1059 } else { 1060 bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) ); 1061 } 1062 (*num_full_ptr)--; 1063 } 1064 index--; 1065 } 1066 1067 // eliminate any stale entries at the end of the table 1068 for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) { 1069 (*buf_ptr)[i].block_num = -1; 1070 } 1071 1072 return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite) 1073} 1074 1075// PR-3105942: Coalesce writes to the same block in journal replay 1076// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks 1077// to be replayed and the corresponding location in the journal which contains 1078// the most recent data for those blocks. The array is "played" once the all the 1079// blocks in the journal have been coalesced. The code for the case of conflicting/ 1080// overlapping writes to a single block is the most dense. Because coalescing can 1081// disrupt the existing time-ordering of blocks in the journal playback, care 1082// is taken to catch any overlaps and keep the array consistent. 1083static int 1084add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) 1085{ 1086 int blk_index, overwriting; 1087 1088 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be 1089 // inserted (or the index of the elem to overwrite). 1090 blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); 1091 1092 // check if the index is within bounds (if we're adding this block to the end of 1093 // the table, blk_index will be equal to num_full) 1094 if (blk_index < 0 || blk_index > *num_full_ptr) { 1095 //printf("jnl: add_block: trouble adding block to co_buf\n"); 1096 return -1; 1097 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); 1098 1099 // Determine whether we're overwriting an existing entry by checking for overlap 1100 overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); 1101 if (overwriting < 0) { 1102 return -1; // if we got an error, pass it along 1103 } 1104 1105 // returns the index, or -1 on error 1106 blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); 1107 1108 return blk_index; 1109} 1110 1111static int 1112replay_journal(journal *jnl) 1113{ 1114 int i, bad_blocks=0; 1115 unsigned int orig_checksum, checksum, check_block_checksums = 0; 1116 size_t ret; 1117 size_t max_bsize = 0; /* protected by block_ptr */ 1118 block_list_header *blhdr; 1119 off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start; 1120 char *buff, *block_ptr=NULL; 1121 struct bucket *co_buf; 1122 int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; 1123 uint32_t last_sequence_num = 0; 1124 int replay_retry_count = 0; 1125 1126 // wrap the start ptr if it points to the very end of the journal 1127 if (jnl->jhdr->start == jnl->jhdr->size) { 1128 jnl->jhdr->start = jnl->jhdr->jhdr_size; 1129 } 1130 if (jnl->jhdr->end == jnl->jhdr->size) { 1131 jnl->jhdr->end = jnl->jhdr->jhdr_size; 1132 } 1133 1134 if (jnl->jhdr->start == jnl->jhdr->end) { 1135 return 0; 1136 } 1137 1138 orig_jnl_start = jnl->jhdr->start; 1139 1140 // allocate memory for the header_block. we'll read each blhdr into this 1141 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) { 1142 printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n", 1143 jnl->jdev_name, jnl->jhdr->blhdr_size); 1144 return -1; 1145 } 1146 1147 // allocate memory for the coalesce buffer 1148 if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { 1149 printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name); 1150 return -1; 1151 } 1152 1153restart_replay: 1154 1155 // initialize entries 1156 for(i = 0; i < num_buckets; i++) { 1157 co_buf[i].block_num = -1; 1158 } 1159 num_full = 0; // empty at first 1160 1161 1162 printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", 1163 jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); 1164 1165 while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) { 1166 offset = blhdr_offset = jnl->jhdr->start; 1167 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size); 1168 if (ret != (size_t)jnl->jhdr->blhdr_size) { 1169 printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset); 1170 bad_blocks = 1; 1171 goto bad_txn_handling; 1172 } 1173 1174 blhdr = (block_list_header *)buff; 1175 1176 orig_checksum = blhdr->checksum; 1177 blhdr->checksum = 0; 1178 if (jnl->flags & JOURNAL_NEED_SWAP) { 1179 // calculate the checksum based on the unswapped data 1180 // because it is done byte-at-a-time. 1181 orig_checksum = (unsigned int)SWAP32(orig_checksum); 1182 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); 1183 swap_block_list_header(jnl, blhdr); 1184 } else { 1185 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); 1186 } 1187 1188 1189 // 1190 // XXXdbg - if these checks fail, we should replay as much 1191 // we can in the hopes that it will still leave the 1192 // drive in a better state than if we didn't replay 1193 // anything 1194 // 1195 if (checksum != orig_checksum) { 1196 if (check_past_jnl_end && in_uncharted_territory) { 1197 1198 if (blhdr_offset != jnl->jhdr->end) { 1199 printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); 1200 } 1201 1202 check_past_jnl_end = 0; 1203 jnl->jhdr->end = blhdr_offset; 1204 continue; 1205 } 1206 1207 printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", 1208 jnl->jdev_name, blhdr_offset, orig_checksum, checksum); 1209 1210 if (blhdr_offset == orig_jnl_start) { 1211 // if there's nothing in the journal at all, just bail out altogether. 1212 goto bad_replay; 1213 } 1214 1215 bad_blocks = 1; 1216 goto bad_txn_handling; 1217 } 1218 1219 if ( (last_sequence_num != 0) 1220 && (blhdr->binfo[0].u.bi.b.sequence_num != 0) 1221 && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num) 1222 && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) { 1223 1224 txn_start_offset = jnl->jhdr->end = blhdr_offset; 1225 1226 if (check_past_jnl_end) { 1227 check_past_jnl_end = 0; 1228 printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n", 1229 jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); 1230 continue; 1231 } 1232 1233 printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n", 1234 jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); 1235 bad_blocks = 1; 1236 goto bad_txn_handling; 1237 } 1238 last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num; 1239 1240 if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) { 1241 if (last_sequence_num == 0) { 1242 check_past_jnl_end = 0; 1243 printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n", 1244 jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); 1245 if (jnl->jhdr->start != jnl->jhdr->end) { 1246 jnl->jhdr->start = jnl->jhdr->end; 1247 } 1248 continue; 1249 } 1250 printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); 1251 } 1252 1253 if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size) 1254 || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { 1255 printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n", 1256 jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks); 1257 bad_blocks = 1; 1258 goto bad_txn_handling; 1259 } 1260 1261 max_bsize = 0; 1262 for (i = 1; i < blhdr->num_blocks; i++) { 1263 if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) { 1264 printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum); 1265 bad_blocks = 1; 1266 goto bad_txn_handling; 1267 } 1268 1269 if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) { 1270 max_bsize = blhdr->binfo[i].u.bi.bsize; 1271 } 1272 } 1273 1274 if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) { 1275 check_block_checksums = 1; 1276 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { 1277 goto bad_replay; 1278 } 1279 } else { 1280 block_ptr = NULL; 1281 } 1282 1283 if (blhdr->flags & BLHDR_FIRST_HEADER) { 1284 txn_start_offset = blhdr_offset; 1285 } 1286 1287 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n", 1288 // blhdr->num_blocks-1, jnl->jhdr->start); 1289 bad_blocks = 0; 1290 for (i = 1; i < blhdr->num_blocks; i++) { 1291 int size, ret_val; 1292 off_t number; 1293 1294 size = blhdr->binfo[i].u.bi.bsize; 1295 number = blhdr->binfo[i].bnum; 1296 1297 // don't add "killed" blocks 1298 if (number == (off_t)-1) { 1299 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i); 1300 } else { 1301 1302 if (check_block_checksums) { 1303 int32_t disk_cksum; 1304 off_t block_offset; 1305 1306 block_offset = offset; 1307 1308 // read the block so we can check the checksum 1309 ret = read_journal_data(jnl, &block_offset, block_ptr, size); 1310 if (ret != (size_t)size) { 1311 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); 1312 bad_blocks = 1; 1313 goto bad_txn_handling; 1314 } 1315 1316 disk_cksum = calc_checksum(block_ptr, size); 1317 1318 // there is no need to swap the checksum from disk because 1319 // it got swapped when the blhdr was read in. 1320 if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) { 1321 printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n", 1322 jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum); 1323 printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n", 1324 *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)], 1325 *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]); 1326 1327 bad_blocks = 1; 1328 goto bad_txn_handling; 1329 } 1330 } 1331 1332 1333 // add this bucket to co_buf, coalescing where possible 1334 // printf("jnl: replay_journal: adding block 0x%llx\n", number); 1335 ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full); 1336 1337 if (ret_val == -1) { 1338 printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name); 1339 goto bad_replay; 1340 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number); 1341 } 1342 1343 // increment offset 1344 offset += size; 1345 1346 // check if the last block added puts us off the end of the jnl. 1347 // if so, we need to wrap to the beginning and take any remainder 1348 // into account 1349 // 1350 if (offset >= jnl->jhdr->size) { 1351 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); 1352 } 1353 } 1354 1355 if (block_ptr) { 1356 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); 1357 block_ptr = NULL; 1358 } 1359 1360bad_txn_handling: 1361 if (bad_blocks) { 1362 /* Journal replay got error before it found any valid 1363 * transations, abort replay */ 1364 if (txn_start_offset == 0) { 1365 printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); 1366 goto bad_replay; 1367 } 1368 1369 /* Repeated error during journal replay, abort replay */ 1370 if (replay_retry_count == 3) { 1371 printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name); 1372 goto bad_replay; 1373 } 1374 replay_retry_count++; 1375 1376 /* There was an error replaying the journal (possibly 1377 * EIO/ENXIO from the device). So retry replaying all 1378 * the good transactions that we found before getting 1379 * the error. 1380 */ 1381 jnl->jhdr->start = orig_jnl_start; 1382 jnl->jhdr->end = txn_start_offset; 1383 check_past_jnl_end = 0; 1384 last_sequence_num = 0; 1385 printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); 1386 goto restart_replay; 1387 } 1388 1389 jnl->jhdr->start += blhdr->bytes_used; 1390 if (jnl->jhdr->start >= jnl->jhdr->size) { 1391 // wrap around and skip the journal header block 1392 jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size; 1393 } 1394 1395 if (jnl->jhdr->start == jnl->jhdr->end) { 1396 in_uncharted_territory = 1; 1397 } 1398 } 1399 1400 if (jnl->jhdr->start != jnl->jhdr->end) { 1401 printf("jnl: %s: start %lld != end %lld. resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); 1402 jnl->jhdr->end = jnl->jhdr->start; 1403 } 1404 1405 //printf("jnl: replay_journal: replaying %d blocks\n", num_full); 1406 1407 /* 1408 * make sure it's at least one page in size, so 1409 * start max_bsize at PAGE_SIZE 1410 */ 1411 for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) { 1412 1413 if (co_buf[i].block_num == (off_t)-1) 1414 continue; 1415 1416 if (co_buf[i].block_size > max_bsize) 1417 max_bsize = co_buf[i].block_size; 1418 } 1419 /* 1420 * round max_bsize up to the nearest PAGE_SIZE multiple 1421 */ 1422 if (max_bsize & (PAGE_SIZE - 1)) { 1423 max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); 1424 } 1425 1426 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { 1427 goto bad_replay; 1428 } 1429 1430 // Replay the coalesced entries in the co-buf 1431 for(i = 0; i < num_full; i++) { 1432 size_t size = co_buf[i].block_size; 1433 off_t jnl_offset = (off_t) co_buf[i].jnl_offset; 1434 off_t number = co_buf[i].block_num; 1435 1436 1437 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num, 1438 // co_buf[i].block_size, co_buf[i].jnl_offset); 1439 1440 if (number == (off_t)-1) { 1441 // printf("jnl: replay_journal: skipping killed fs block\n"); 1442 } else { 1443 1444 // do journal read, and set the phys. block 1445 ret = read_journal_data(jnl, &jnl_offset, block_ptr, size); 1446 if (ret != size) { 1447 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); 1448 goto bad_replay; 1449 } 1450 1451 if (update_fs_block(jnl, block_ptr, number, size) != 0) { 1452 goto bad_replay; 1453 } 1454 } 1455 } 1456 1457 1458 // done replaying; update jnl header 1459 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { 1460 goto bad_replay; 1461 } 1462 1463 printf("jnl: %s: journal replay done.\n", jnl->jdev_name); 1464 1465 // free block_ptr 1466 if (block_ptr) { 1467 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); 1468 block_ptr = NULL; 1469 } 1470 1471 // free the coalesce buffer 1472 FREE(co_buf, M_TEMP); 1473 co_buf = NULL; 1474 1475 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); 1476 return 0; 1477 1478bad_replay: 1479 if (block_ptr) { 1480 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); 1481 } 1482 if (co_buf) { 1483 FREE(co_buf, M_TEMP); 1484 } 1485 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); 1486 1487 return -1; 1488} 1489 1490 1491#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024) 1492#define MAX_TRANSACTION_BUFFER_SIZE (3072*1024) 1493 1494// XXXdbg - so I can change it in the debugger 1495int def_tbuffer_size = 0; 1496 1497 1498// 1499// This function sets the size of the tbuffer and the 1500// size of the blhdr. It assumes that jnl->jhdr->size 1501// and jnl->jhdr->jhdr_size are already valid. 1502// 1503static void 1504size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) 1505{ 1506 // 1507 // one-time initialization based on how much memory 1508 // there is in the machine. 1509 // 1510 if (def_tbuffer_size == 0) { 1511 if (max_mem < (256*1024*1024)) { 1512 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE; 1513 } else if (max_mem < (512*1024*1024)) { 1514 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2; 1515 } else if (max_mem < (1024*1024*1024)) { 1516 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3; 1517 } else { 1518 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024)); 1519 } 1520 } 1521 1522 // size up the transaction buffer... can't be larger than the number 1523 // of blocks that can fit in a block_list_header block. 1524 if (tbuffer_size == 0) { 1525 jnl->tbuffer_size = def_tbuffer_size; 1526 } else { 1527 // make sure that the specified tbuffer_size isn't too small 1528 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) { 1529 tbuffer_size = jnl->jhdr->blhdr_size * 2; 1530 } 1531 // and make sure it's an even multiple of the block size 1532 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) { 1533 tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size); 1534 } 1535 1536 jnl->tbuffer_size = tbuffer_size; 1537 } 1538 1539 if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { 1540 jnl->tbuffer_size = (jnl->jhdr->size / 2); 1541 } 1542 1543 if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { 1544 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE; 1545 } 1546 1547 jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); 1548 if (jnl->jhdr->blhdr_size < phys_blksz) { 1549 jnl->jhdr->blhdr_size = phys_blksz; 1550 } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) { 1551 // have to round up so we're an even multiple of the physical block size 1552 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1); 1553 } 1554} 1555 1556static void 1557get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context) 1558{ 1559 off_t readblockcnt; 1560 off_t writeblockcnt; 1561 off_t readmaxcnt=0, tmp_readmaxcnt; 1562 off_t writemaxcnt=0, tmp_writemaxcnt; 1563 off_t readsegcnt, writesegcnt; 1564 int32_t features; 1565 1566 if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) { 1567 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { 1568 const char *name = vnode_getname_printable(devvp); 1569 jnl->flags |= JOURNAL_DO_FUA_WRITES; 1570 printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features); 1571 vnode_putname_printable(name); 1572 } 1573 if (features & DK_FEATURE_UNMAP) { 1574 jnl->flags |= JOURNAL_USE_UNMAP; 1575 } 1576 } 1577 1578 // 1579 // First check the max read size via several different mechanisms... 1580 // 1581 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context); 1582 1583 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) { 1584 tmp_readmaxcnt = readblockcnt * phys_blksz; 1585 if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { 1586 readmaxcnt = tmp_readmaxcnt; 1587 } 1588 } 1589 1590 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) { 1591 readsegcnt = 0; 1592 } 1593 1594 if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { 1595 readmaxcnt = readsegcnt * PAGE_SIZE; 1596 } 1597 1598 if (readmaxcnt == 0) { 1599 readmaxcnt = 128 * 1024; 1600 } else if (readmaxcnt > UINT32_MAX) { 1601 readmaxcnt = UINT32_MAX; 1602 } 1603 1604 1605 // 1606 // Now check the max writes size via several different mechanisms... 1607 // 1608 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context); 1609 1610 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) { 1611 tmp_writemaxcnt = writeblockcnt * phys_blksz; 1612 if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { 1613 writemaxcnt = tmp_writemaxcnt; 1614 } 1615 } 1616 1617 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) { 1618 writesegcnt = 0; 1619 } 1620 1621 if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { 1622 writemaxcnt = writesegcnt * PAGE_SIZE; 1623 } 1624 1625 if (writemaxcnt == 0) { 1626 writemaxcnt = 128 * 1024; 1627 } else if (writemaxcnt > UINT32_MAX) { 1628 writemaxcnt = UINT32_MAX; 1629 } 1630 1631 jnl->max_read_size = readmaxcnt; 1632 jnl->max_write_size = writemaxcnt; 1633 // printf("jnl: %s: max read/write: %lld k / %lld k\n", 1634 // jnl->jdev_name ? jnl->jdev_name : "unknown", 1635 // jnl->max_read_size/1024, jnl->max_write_size/1024); 1636} 1637 1638 1639journal * 1640journal_create(struct vnode *jvp, 1641 off_t offset, 1642 off_t journal_size, 1643 struct vnode *fsvp, 1644 size_t min_fs_blksz, 1645 int32_t flags, 1646 int32_t tbuffer_size, 1647 void (*flush)(void *arg), 1648 void *arg, 1649 struct mount *fsmount) 1650{ 1651 journal *jnl; 1652 uint32_t phys_blksz, new_txn_base; 1653 u_int32_t min_size; 1654 struct vfs_context context; 1655 const char *jdev_name; 1656 /* 1657 * Cap the journal max size to 2GB. On HFS, it will attempt to occupy 1658 * a full allocation block if the current size is smaller than the allocation 1659 * block on which it resides. Once we hit the exabyte filesystem range, then 1660 * it will use 2GB allocation blocks. As a result, make the cap 2GB. 1661 */ 1662 context.vc_thread = current_thread(); 1663 context.vc_ucred = FSCRED; 1664 1665 jdev_name = vnode_getname_printable(jvp); 1666 1667 /* Get the real physical block size. */ 1668 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { 1669 goto cleanup_jdev_name; 1670 } 1671 1672 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { 1673 printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size); 1674 goto cleanup_jdev_name; 1675 } 1676 1677 min_size = phys_blksz * (phys_blksz / sizeof(block_info)); 1678 /* Reject journals that are too small given the sector size of the device */ 1679 if (journal_size < min_size) { 1680 printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n", 1681 jdev_name, journal_size, phys_blksz); 1682 goto cleanup_jdev_name; 1683 } 1684 1685 if (phys_blksz > min_fs_blksz) { 1686 printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n", 1687 jdev_name, phys_blksz, min_fs_blksz); 1688 goto cleanup_jdev_name; 1689 } 1690 1691 if ((journal_size % phys_blksz) != 0) { 1692 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n", 1693 jdev_name, journal_size, phys_blksz); 1694 goto cleanup_jdev_name; 1695 } 1696 1697 1698 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); 1699 memset(jnl, 0, sizeof(*jnl)); 1700 1701 jnl->jdev = jvp; 1702 jnl->jdev_offset = offset; 1703 jnl->fsdev = fsvp; 1704 jnl->flush = flush; 1705 jnl->flush_arg = arg; 1706 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); 1707 jnl->jdev_name = jdev_name; 1708 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); 1709 1710 // Keep a point to the mount around for use in IO throttling. 1711 jnl->fsmount = fsmount; 1712 // XXX: This lock discipline looks correct based on dounmount(), but it 1713 // doesn't seem to be documented anywhere. 1714 mount_ref(fsmount, 0); 1715 1716 get_io_info(jvp, phys_blksz, jnl, &context); 1717 1718 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { 1719 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); 1720 goto bad_kmem_alloc; 1721 } 1722 jnl->header_buf_size = phys_blksz; 1723 1724 jnl->jhdr = (journal_header *)jnl->header_buf; 1725 memset(jnl->jhdr, 0, sizeof(journal_header)); 1726 1727 // we have to set this up here so that do_journal_io() will work 1728 jnl->jhdr->jhdr_size = phys_blksz; 1729 1730 // 1731 // We try and read the journal header to see if there is already one 1732 // out there. If there is, it's possible that it has transactions 1733 // in it that we might replay if we happen to pick a sequence number 1734 // that is a little less than the old one, there is a crash and the 1735 // last txn written ends right at the start of a txn from the previous 1736 // incarnation of this file system. If all that happens we would 1737 // replay the transactions from the old file system and that would 1738 // destroy your disk. Although it is extremely unlikely for all those 1739 // conditions to happen, the probability is non-zero and the result is 1740 // severe - you lose your file system. Therefore if we find a valid 1741 // journal header and the sequence number is non-zero we write junk 1742 // over the entire journal so that there is no way we will encounter 1743 // any old transactions. This is slow but should be a rare event 1744 // since most tools erase the journal. 1745 // 1746 if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz 1747 && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC 1748 && jnl->jhdr->sequence_num != 0) { 1749 1750 new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; 1751 printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base); 1752 1753#if 0 1754 int i; 1755 off_t pos=0; 1756 1757 for(i = 1; i < journal_size / phys_blksz; i++) { 1758 pos = i*phys_blksz; 1759 1760 // we don't really care what data we write just so long 1761 // as it's not a valid transaction header. since we have 1762 // the header_buf sitting around we'll use that. 1763 write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz); 1764 } 1765 printf("jnl: create: done clearing journal (i=%d)\n", i); 1766#endif 1767 } else { 1768 new_txn_base = random() & 0x00ffffff; 1769 } 1770 1771 memset(jnl->header_buf, 0, phys_blksz); 1772 1773 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; 1774 jnl->jhdr->endian = ENDIAN_MAGIC; 1775 jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself 1776 jnl->jhdr->end = phys_blksz; 1777 jnl->jhdr->size = journal_size; 1778 jnl->jhdr->jhdr_size = phys_blksz; 1779 size_up_tbuffer(jnl, tbuffer_size, phys_blksz); 1780 1781 jnl->active_start = jnl->jhdr->start; 1782 1783 // XXXdbg - for testing you can force the journal to wrap around 1784 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); 1785 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); 1786 1787 jnl->jhdr->sequence_num = new_txn_base; 1788 1789 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); 1790 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); 1791 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); 1792 1793 1794 jnl->flushing = FALSE; 1795 jnl->asyncIO = FALSE; 1796 jnl->flush_aborted = FALSE; 1797 jnl->writing_header = FALSE; 1798 jnl->async_trim = NULL; 1799 jnl->sequence_num = jnl->jhdr->sequence_num; 1800 1801 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { 1802 printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name); 1803 goto bad_write; 1804 } 1805 1806 goto journal_create_complete; 1807 1808 1809bad_write: 1810 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); 1811bad_kmem_alloc: 1812 jnl->jhdr = NULL; 1813 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); 1814 mount_drop(fsmount, 0); 1815cleanup_jdev_name: 1816 vnode_putname_printable(jdev_name); 1817 jnl = NULL; 1818journal_create_complete: 1819 return jnl; 1820} 1821 1822 1823journal * 1824journal_open(struct vnode *jvp, 1825 off_t offset, 1826 off_t journal_size, 1827 struct vnode *fsvp, 1828 size_t min_fs_blksz, 1829 int32_t flags, 1830 int32_t tbuffer_size, 1831 void (*flush)(void *arg), 1832 void *arg, 1833 struct mount *fsmount) 1834{ 1835 journal *jnl; 1836 uint32_t orig_blksz=0; 1837 uint32_t phys_blksz; 1838 u_int32_t min_size = 0; 1839 int orig_checksum, checksum; 1840 struct vfs_context context; 1841 const char *jdev_name = vnode_getname_printable(jvp); 1842 1843 context.vc_thread = current_thread(); 1844 context.vc_ucred = FSCRED; 1845 1846 /* Get the real physical block size. */ 1847 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { 1848 goto cleanup_jdev_name; 1849 } 1850 1851 if (phys_blksz > min_fs_blksz) { 1852 printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n", 1853 jdev_name, phys_blksz, min_fs_blksz); 1854 goto cleanup_jdev_name; 1855 } 1856 1857 if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { 1858 printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size); 1859 goto cleanup_jdev_name; 1860 } 1861 1862 min_size = phys_blksz * (phys_blksz / sizeof(block_info)); 1863 /* Reject journals that are too small given the sector size of the device */ 1864 if (journal_size < min_size) { 1865 printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n", 1866 jdev_name, journal_size, phys_blksz); 1867 goto cleanup_jdev_name; 1868 } 1869 1870 if ((journal_size % phys_blksz) != 0) { 1871 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n", 1872 jdev_name, journal_size, phys_blksz); 1873 goto cleanup_jdev_name; 1874 } 1875 1876 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); 1877 memset(jnl, 0, sizeof(*jnl)); 1878 1879 jnl->jdev = jvp; 1880 jnl->jdev_offset = offset; 1881 jnl->fsdev = fsvp; 1882 jnl->flush = flush; 1883 jnl->flush_arg = arg; 1884 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); 1885 jnl->jdev_name = jdev_name; 1886 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); 1887 1888 /* We need a reference to the mount to later pass to the throttling code for 1889 * IO accounting. 1890 */ 1891 jnl->fsmount = fsmount; 1892 mount_ref(fsmount, 0); 1893 1894 get_io_info(jvp, phys_blksz, jnl, &context); 1895 1896 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { 1897 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); 1898 goto bad_kmem_alloc; 1899 } 1900 jnl->header_buf_size = phys_blksz; 1901 1902 jnl->jhdr = (journal_header *)jnl->header_buf; 1903 memset(jnl->jhdr, 0, sizeof(journal_header)); 1904 1905 // we have to set this up here so that do_journal_io() will work 1906 jnl->jhdr->jhdr_size = phys_blksz; 1907 1908 if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) { 1909 printf("jnl: %s: open: could not read %u bytes for the journal header.\n", 1910 jdev_name, phys_blksz); 1911 goto bad_journal; 1912 } 1913 1914 orig_checksum = jnl->jhdr->checksum; 1915 jnl->jhdr->checksum = 0; 1916 1917 if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { 1918 // do this before the swap since it's done byte-at-a-time 1919 orig_checksum = SWAP32(orig_checksum); 1920 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); 1921 swap_journal_header(jnl); 1922 jnl->flags |= JOURNAL_NEED_SWAP; 1923 } else { 1924 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); 1925 } 1926 1927 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { 1928 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n", 1929 jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); 1930 goto bad_journal; 1931 } 1932 1933 // only check if we're the current journal header magic value 1934 if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) { 1935 1936 if (orig_checksum != checksum) { 1937 printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n", 1938 jdev_name, orig_checksum, checksum); 1939 1940 //goto bad_journal; 1941 } 1942 } 1943 1944 // XXXdbg - convert old style magic numbers to the new one 1945 if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) { 1946 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; 1947 } 1948 1949 if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { 1950 /* 1951 * The volume has probably been resized (such that we had to adjust the 1952 * logical sector size), or copied to media with a different logical 1953 * sector size. 1954 * 1955 * Temporarily change the device's logical block size to match the 1956 * journal's header size. This will allow us to replay the journal 1957 * safely. If the replay succeeds, we will update the journal's header 1958 * size (later in this function). 1959 */ 1960 orig_blksz = phys_blksz; 1961 phys_blksz = jnl->jhdr->jhdr_size; 1962 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context); 1963 printf("jnl: %s: open: temporarily switched block size from %u to %u\n", 1964 jdev_name, orig_blksz, phys_blksz); 1965 } 1966 1967 if ( jnl->jhdr->start <= 0 1968 || jnl->jhdr->start > jnl->jhdr->size 1969 || jnl->jhdr->start > 1024*1024*1024) { 1970 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n", 1971 jdev_name, jnl->jhdr->start, jnl->jhdr->size); 1972 goto bad_journal; 1973 } 1974 1975 if ( jnl->jhdr->end <= 0 1976 || jnl->jhdr->end > jnl->jhdr->size 1977 || jnl->jhdr->end > 1024*1024*1024) { 1978 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n", 1979 jdev_name, jnl->jhdr->end, jnl->jhdr->size); 1980 goto bad_journal; 1981 } 1982 1983 if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) { 1984 printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size); 1985 goto bad_journal; 1986 } 1987 1988// XXXdbg - can't do these checks because hfs writes all kinds of 1989// non-uniform sized blocks even on devices that have a block size 1990// that is larger than 512 bytes (i.e. optical media w/2k blocks). 1991// therefore these checks will fail and so we just have to punt and 1992// do more relaxed checking... 1993// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) { 1994 if ((jnl->jhdr->start % 512) != 0) { 1995 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n", 1996 jdev_name, jnl->jhdr->start); 1997 goto bad_journal; 1998 } 1999 2000//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) { 2001 if ((jnl->jhdr->end % 512) != 0) { 2002 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n", 2003 jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size); 2004 goto bad_journal; 2005 } 2006 2007 // take care of replaying the journal if necessary 2008 if (flags & JOURNAL_RESET) { 2009 printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n", 2010 jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end); 2011 jnl->jhdr->start = jnl->jhdr->end; 2012 } else if (replay_journal(jnl) != 0) { 2013 printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); 2014 goto bad_journal; 2015 } 2016 2017 /* 2018 * When we get here, we know that the journal is empty (jnl->jhdr->start == 2019 * jnl->jhdr->end). If the device's logical block size was different from 2020 * the journal's header size, then we can now restore the device's logical 2021 * block size and update the journal's header size to match. 2022 * 2023 * Note that we also adjust the journal's start and end so that they will 2024 * be aligned on the new block size. We pick a new sequence number to 2025 * avoid any problems if a replay found previous transactions using the old 2026 * journal header size. (See the comments in journal_create(), above.) 2027 */ 2028 2029 if (orig_blksz != 0) { 2030 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); 2031 phys_blksz = orig_blksz; 2032 2033 orig_blksz = 0; 2034 2035 jnl->jhdr->jhdr_size = phys_blksz; 2036 jnl->jhdr->start = phys_blksz; 2037 jnl->jhdr->end = phys_blksz; 2038 jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num + 2039 (journal_size / phys_blksz) + 2040 (random() % 16384)) & 0x00ffffff; 2041 2042 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) { 2043 printf("jnl: %s: open: failed to update journal header size\n", jdev_name); 2044 goto bad_journal; 2045 } 2046 } 2047 2048 // make sure this is in sync! 2049 jnl->active_start = jnl->jhdr->start; 2050 jnl->sequence_num = jnl->jhdr->sequence_num; 2051 2052 // set this now, after we've replayed the journal 2053 size_up_tbuffer(jnl, tbuffer_size, phys_blksz); 2054 2055 // TODO: Does this need to change if the device's logical block size changed? 2056 if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) { 2057 printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, 2058 jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); 2059 goto bad_journal; 2060 } 2061 2062 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); 2063 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); 2064 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); 2065 2066 goto journal_open_complete; 2067 2068bad_journal: 2069 if (orig_blksz != 0) { 2070 phys_blksz = orig_blksz; 2071 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); 2072 printf("jnl: %s: open: restored block size after error\n", jdev_name); 2073 } 2074 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); 2075bad_kmem_alloc: 2076 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); 2077 mount_drop(fsmount, 0); 2078cleanup_jdev_name: 2079 vnode_putname_printable(jdev_name); 2080 jnl = NULL; 2081journal_open_complete: 2082 return jnl; 2083} 2084 2085 2086int 2087journal_is_clean(struct vnode *jvp, 2088 off_t offset, 2089 off_t journal_size, 2090 struct vnode *fsvp, 2091 size_t min_fs_block_size) 2092{ 2093 journal jnl; 2094 uint32_t phys_blksz; 2095 int ret; 2096 int orig_checksum, checksum; 2097 struct vfs_context context; 2098 const char *jdev_name = vnode_getname_printable(jvp); 2099 2100 context.vc_thread = current_thread(); 2101 context.vc_ucred = FSCRED; 2102 2103 /* Get the real physical block size. */ 2104 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { 2105 printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name); 2106 ret = EINVAL; 2107 goto cleanup_jdev_name; 2108 } 2109 2110 if (phys_blksz > (uint32_t)min_fs_block_size) { 2111 printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n", 2112 jdev_name, phys_blksz, min_fs_block_size); 2113 ret = EINVAL; 2114 goto cleanup_jdev_name; 2115 } 2116 2117 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { 2118 printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size); 2119 ret = EINVAL; 2120 goto cleanup_jdev_name; 2121 } 2122 2123 if ((journal_size % phys_blksz) != 0) { 2124 printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n", 2125 jdev_name, journal_size, phys_blksz); 2126 ret = EINVAL; 2127 goto cleanup_jdev_name; 2128 } 2129 2130 memset(&jnl, 0, sizeof(jnl)); 2131 2132 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) { 2133 printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz); 2134 ret = ENOMEM; 2135 goto cleanup_jdev_name; 2136 } 2137 jnl.header_buf_size = phys_blksz; 2138 2139 get_io_info(jvp, phys_blksz, &jnl, &context); 2140 2141 jnl.jhdr = (journal_header *)jnl.header_buf; 2142 memset(jnl.jhdr, 0, sizeof(journal_header)); 2143 2144 jnl.jdev = jvp; 2145 jnl.jdev_offset = offset; 2146 jnl.fsdev = fsvp; 2147 2148 // we have to set this up here so that do_journal_io() will work 2149 jnl.jhdr->jhdr_size = phys_blksz; 2150 2151 if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) { 2152 printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n", 2153 jdev_name, phys_blksz); 2154 ret = EINVAL; 2155 goto get_out; 2156 } 2157 2158 orig_checksum = jnl.jhdr->checksum; 2159 jnl.jhdr->checksum = 0; 2160 2161 if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { 2162 // do this before the swap since it's done byte-at-a-time 2163 orig_checksum = SWAP32(orig_checksum); 2164 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); 2165 swap_journal_header(&jnl); 2166 jnl.flags |= JOURNAL_NEED_SWAP; 2167 } else { 2168 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); 2169 } 2170 2171 if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { 2172 printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n", 2173 jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC); 2174 ret = EINVAL; 2175 goto get_out; 2176 } 2177 2178 if (orig_checksum != checksum) { 2179 printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum); 2180 ret = EINVAL; 2181 goto get_out; 2182 } 2183 2184 // 2185 // if the start and end are equal then the journal is clean. 2186 // otherwise it's not clean and therefore an error. 2187 // 2188 if (jnl.jhdr->start == jnl.jhdr->end) { 2189 ret = 0; 2190 } else { 2191 ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one 2192 } 2193 2194get_out: 2195 kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz); 2196cleanup_jdev_name: 2197 vnode_putname_printable(jdev_name); 2198 return ret; 2199} 2200 2201 2202void 2203journal_close(journal *jnl) 2204{ 2205 volatile off_t *start, *end; 2206 int counter=0; 2207 2208 CHECK_JOURNAL(jnl); 2209 2210 // set this before doing anything that would block so that 2211 // we start tearing things down properly. 2212 // 2213 jnl->flags |= JOURNAL_CLOSE_PENDING; 2214 2215 if (jnl->owner != current_thread()) { 2216 journal_lock(jnl); 2217 } 2218 2219 wait_condition(jnl, &jnl->flushing, "journal_close"); 2220 2221 // 2222 // only write stuff to disk if the journal is still valid 2223 // 2224 if ((jnl->flags & JOURNAL_INVALID) == 0) { 2225 2226 if (jnl->active_tr) { 2227 /* 2228 * "journal_end_transaction" will fire the flush asynchronously 2229 */ 2230 journal_end_transaction(jnl); 2231 } 2232 2233 // flush any buffered transactions 2234 if (jnl->cur_tr) { 2235 transaction *tr = jnl->cur_tr; 2236 2237 jnl->cur_tr = NULL; 2238 /* 2239 * "end_transaction" will wait for any in-progress flush to complete 2240 * before flushing "cur_tr" synchronously("must_wait" == TRUE) 2241 */ 2242 end_transaction(tr, 1, NULL, NULL, FALSE, TRUE); 2243 } 2244 /* 2245 * if there was an "active_tr", make sure we wait for 2246 * it to flush if there was no "cur_tr" to process 2247 */ 2248 wait_condition(jnl, &jnl->flushing, "journal_close"); 2249 2250 //start = &jnl->jhdr->start; 2251 start = &jnl->active_start; 2252 end = &jnl->jhdr->end; 2253 2254 while (*start != *end && counter++ < 5000) { 2255 //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end); 2256 if (jnl->flush) { 2257 jnl->flush(jnl->flush_arg); 2258 } 2259 tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2); 2260 } 2261 2262 if (*start != *end) { 2263 printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n", 2264 jnl->jdev_name, *start, *end); 2265 } 2266 2267 // make sure this is in sync when we close the journal 2268 jnl->jhdr->start = jnl->active_start; 2269 2270 // if this fails there's not much we can do at this point... 2271 write_journal_header(jnl, 1, jnl->sequence_num); 2272 } else { 2273 // if we're here the journal isn't valid any more. 2274 // so make sure we don't leave any locked blocks lying around 2275 printf("jnl: %s: close: journal %p, is invalid. aborting outstanding transactions\n", jnl->jdev_name, jnl); 2276 2277 if (jnl->active_tr || jnl->cur_tr) { 2278 transaction *tr; 2279 2280 if (jnl->active_tr) { 2281 tr = jnl->active_tr; 2282 jnl->active_tr = NULL; 2283 } else { 2284 tr = jnl->cur_tr; 2285 jnl->cur_tr = NULL; 2286 } 2287 abort_transaction(jnl, tr); 2288 2289 if (jnl->active_tr || jnl->cur_tr) { 2290 panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl); 2291 } 2292 } 2293 } 2294 wait_condition(jnl, &jnl->asyncIO, "journal_close"); 2295 2296 free_old_stuff(jnl); 2297 2298 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); 2299 jnl->jhdr = (void *)0xbeefbabe; 2300 2301 // Release reference on the mount 2302 if (jnl->fsmount) 2303 mount_drop(jnl->fsmount, 0); 2304 2305 vnode_putname_printable(jnl->jdev_name); 2306 2307 journal_unlock(jnl); 2308 lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group); 2309 lck_mtx_destroy(&jnl->jlock, jnl_mutex_group); 2310 lck_mtx_destroy(&jnl->flock, jnl_mutex_group); 2311 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); 2312} 2313 2314static void 2315dump_journal(journal *jnl) 2316{ 2317 transaction *ctr; 2318 2319 printf("journal for dev %s:", jnl->jdev_name); 2320 printf(" jdev_offset %.8llx\n", jnl->jdev_offset); 2321 printf(" magic: 0x%.8x\n", jnl->jhdr->magic); 2322 printf(" start: 0x%.8llx\n", jnl->jhdr->start); 2323 printf(" end: 0x%.8llx\n", jnl->jhdr->end); 2324 printf(" size: 0x%.8llx\n", jnl->jhdr->size); 2325 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); 2326 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); 2327 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); 2328 2329 printf(" completed transactions:\n"); 2330 for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) { 2331 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end); 2332 } 2333} 2334 2335 2336 2337static off_t 2338free_space(journal *jnl) 2339{ 2340 off_t free_space_offset; 2341 2342 if (jnl->jhdr->start < jnl->jhdr->end) { 2343 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; 2344 } else if (jnl->jhdr->start > jnl->jhdr->end) { 2345 free_space_offset = jnl->jhdr->start - jnl->jhdr->end; 2346 } else { 2347 // journal is completely empty 2348 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size; 2349 } 2350 2351 return free_space_offset; 2352} 2353 2354 2355// 2356// The journal must be locked on entry to this function. 2357// The "desired_size" is in bytes. 2358// 2359static int 2360check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num) 2361{ 2362 size_t i; 2363 int counter=0; 2364 2365 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", 2366 // desired_size, free_space(jnl)); 2367 2368 if (delayed_header_write) 2369 *delayed_header_write = FALSE; 2370 2371 while (1) { 2372 int old_start_empty; 2373 2374 // make sure there's space in the journal to hold this transaction 2375 if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) { 2376 break; 2377 } 2378 if (counter++ == 5000) { 2379 dump_journal(jnl); 2380 panic("jnl: check_free_space: buffer flushing isn't working " 2381 "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl, 2382 jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); 2383 } 2384 if (counter > 7500) { 2385 printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name); 2386 return ENOSPC; 2387 } 2388 2389 // 2390 // here's where we lazily bump up jnl->jhdr->start. we'll consume 2391 // entries until there is enough space for the next transaction. 2392 // 2393 old_start_empty = 1; 2394 lock_oldstart(jnl); 2395 2396 for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { 2397 int lcl_counter; 2398 2399 lcl_counter = 0; 2400 while (jnl->old_start[i] & 0x8000000000000000LL) { 2401 if (lcl_counter++ > 10000) { 2402 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", 2403 jnl->old_start[i], jnl); 2404 } 2405 2406 unlock_oldstart(jnl); 2407 if (jnl->flush) { 2408 jnl->flush(jnl->flush_arg); 2409 } 2410 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1); 2411 lock_oldstart(jnl); 2412 } 2413 2414 if (jnl->old_start[i] == 0) { 2415 continue; 2416 } 2417 2418 old_start_empty = 0; 2419 jnl->jhdr->start = jnl->old_start[i]; 2420 jnl->old_start[i] = 0; 2421 2422 if (free_space(jnl) > desired_size) { 2423 2424 if (delayed_header_write) 2425 *delayed_header_write = TRUE; 2426 else { 2427 unlock_oldstart(jnl); 2428 write_journal_header(jnl, 1, sequence_num); 2429 lock_oldstart(jnl); 2430 } 2431 break; 2432 } 2433 } 2434 unlock_oldstart(jnl); 2435 2436 // if we bumped the start, loop and try again 2437 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { 2438 continue; 2439 } else if (old_start_empty) { 2440 // 2441 // if there is nothing in old_start anymore then we can 2442 // bump the jhdr->start to be the same as active_start 2443 // since it is possible there was only one very large 2444 // transaction in the old_start array. if we didn't do 2445 // this then jhdr->start would never get updated and we 2446 // would wind up looping until we hit the panic at the 2447 // start of the loop. 2448 // 2449 jnl->jhdr->start = jnl->active_start; 2450 2451 if (delayed_header_write) 2452 *delayed_header_write = TRUE; 2453 else 2454 write_journal_header(jnl, 1, sequence_num); 2455 continue; 2456 } 2457 2458 2459 // if the file system gave us a flush function, call it to so that 2460 // it can flush some blocks which hopefully will cause some transactions 2461 // to complete and thus free up space in the journal. 2462 if (jnl->flush) { 2463 jnl->flush(jnl->flush_arg); 2464 } 2465 2466 // wait for a while to avoid being cpu-bound (this will 2467 // put us to sleep for 10 milliseconds) 2468 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1); 2469 } 2470 2471 return 0; 2472} 2473 2474/* 2475 * Allocate a new active transaction. 2476 */ 2477static errno_t 2478journal_allocate_transaction(journal *jnl) 2479{ 2480 transaction *tr; 2481 boolean_t was_vm_privileged; 2482 2483 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { 2484 /* 2485 * the disk driver can allocate memory on this path... 2486 * if we block waiting for memory, and there is enough pressure to 2487 * cause us to try and create a new swap file, we may end up deadlocking 2488 * due to waiting for the journal on the swap file creation path... 2489 * by making ourselves vm_privileged, we give ourselves the best chance 2490 * of not blocking 2491 */ 2492 was_vm_privileged = set_vm_privilege(TRUE); 2493 } 2494 MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK); 2495 memset(tr, 0, sizeof(transaction)); 2496 2497 tr->tbuffer_size = jnl->tbuffer_size; 2498 2499 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { 2500 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); 2501 jnl->active_tr = NULL; 2502 return ENOMEM; 2503 } 2504 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) 2505 set_vm_privilege(FALSE); 2506 2507 // journal replay code checksum check depends on this. 2508 memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); 2509 // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) 2510 memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); 2511 2512 tr->blhdr = (block_list_header *)tr->tbuffer; 2513 tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; 2514 tr->blhdr->num_blocks = 1; // accounts for this header block 2515 tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; 2516 tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; 2517 2518 tr->sequence_num = ++jnl->sequence_num; 2519 tr->num_blhdrs = 1; 2520 tr->total_bytes = jnl->jhdr->blhdr_size; 2521 tr->jnl = jnl; 2522 2523 jnl->active_tr = tr; 2524 2525 return 0; 2526} 2527 2528int 2529journal_start_transaction(journal *jnl) 2530{ 2531 int ret; 2532 2533 CHECK_JOURNAL(jnl); 2534 2535 free_old_stuff(jnl); 2536 2537 if (jnl->flags & JOURNAL_INVALID) { 2538 return EINVAL; 2539 } 2540 if (jnl->owner == current_thread()) { 2541 if (jnl->active_tr == NULL) { 2542 panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n", 2543 jnl, jnl->owner, current_thread()); 2544 } 2545 jnl->nested_count++; 2546 return 0; 2547 } 2548 2549 journal_lock(jnl); 2550 2551 if (jnl->nested_count != 0 || jnl->active_tr != NULL) { 2552 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n", 2553 jnl->owner, jnl->nested_count, jnl->active_tr, jnl); 2554 } 2555 2556 jnl->nested_count = 1; 2557 2558#if JOE 2559 // make sure there's room in the journal 2560 if (free_space(jnl) < jnl->tbuffer_size) { 2561 2562 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); 2563 2564 // this is the call that really waits for space to free up 2565 // as well as updating jnl->jhdr->start 2566 if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) { 2567 printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name); 2568 ret = ENOSPC; 2569 goto bad_start; 2570 } 2571 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0); 2572 } 2573#endif 2574 2575 // if there's a buffered transaction, use it. 2576 if (jnl->cur_tr) { 2577 jnl->active_tr = jnl->cur_tr; 2578 jnl->cur_tr = NULL; 2579 2580 return 0; 2581 } 2582 2583 ret = journal_allocate_transaction(jnl); 2584 if (ret) { 2585 goto bad_start; 2586 } 2587 2588 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); 2589 2590 return 0; 2591 2592bad_start: 2593 jnl->nested_count = 0; 2594 journal_unlock(jnl); 2595 2596 return ret; 2597} 2598 2599 2600int 2601journal_modify_block_start(journal *jnl, struct buf *bp) 2602{ 2603 transaction *tr; 2604 2605 CHECK_JOURNAL(jnl); 2606 2607 2608 free_old_stuff(jnl); 2609 2610 if (jnl->flags & JOURNAL_INVALID) { 2611 return EINVAL; 2612 } 2613 2614 // XXXdbg - for debugging I want this to be true. later it may 2615 // not be necessary. 2616 if ((buf_flags(bp) & B_META) == 0) { 2617 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl); 2618 } 2619 2620 tr = jnl->active_tr; 2621 CHECK_TRANSACTION(tr); 2622 2623 if (jnl->owner != current_thread()) { 2624 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n", 2625 jnl, jnl->owner, current_thread()); 2626 } 2627 2628 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", 2629 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); 2630 2631 // can't allow blocks that aren't an even multiple of the 2632 // underlying block size. 2633 if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { 2634 uint32_t phys_blksz, bad=0; 2635 2636 if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { 2637 bad = 1; 2638 } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) { 2639 if (phys_blksz < 512) { 2640 panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n", 2641 phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size); 2642 } 2643 2644 if ((buf_size(bp) % phys_blksz) != 0) { 2645 bad = 1; 2646 } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) { 2647 jnl->jhdr->jhdr_size = phys_blksz; 2648 } else { 2649 // the phys_blksz is now larger... need to realloc the jhdr 2650 char *new_header_buf; 2651 2652 printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n", 2653 jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz); 2654 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz)) { 2655 printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n", 2656 jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz); 2657 bad = 1; 2658 } else { 2659 memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size); 2660 memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size)); 2661 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); 2662 jnl->header_buf = new_header_buf; 2663 jnl->header_buf_size = phys_blksz; 2664 2665 jnl->jhdr = (journal_header *)jnl->header_buf; 2666 jnl->jhdr->jhdr_size = phys_blksz; 2667 } 2668 } 2669 } else { 2670 bad = 1; 2671 } 2672 2673 if (bad) { 2674 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", 2675 buf_size(bp), jnl->jhdr->jhdr_size); 2676 return -1; 2677 } 2678 } 2679 2680 // make sure that this transaction isn't bigger than the whole journal 2681 if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { 2682 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n", 2683 tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); 2684 return -1; 2685 } 2686 2687 // if the block is dirty and not already locked we have to write 2688 // it out before we muck with it because it has data that belongs 2689 // (presumably) to another transaction. 2690 // 2691 if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) { 2692 2693 if (buf_flags(bp) & B_ASYNC) { 2694 panic("modify_block_start: bp @ %p has async flag set!\n", bp); 2695 } 2696 if (bp->b_shadow_ref) 2697 panic("modify_block_start: dirty bp @ %p has shadows!\n", bp); 2698 2699 // this will cause it to not be buf_brelse()'d 2700 buf_setflags(bp, B_NORELSE); 2701 VNOP_BWRITE(bp); 2702 } 2703 buf_setflags(bp, B_LOCKED); 2704 2705 return 0; 2706} 2707 2708int 2709journal_modify_block_abort(journal *jnl, struct buf *bp) 2710{ 2711 transaction *tr; 2712 block_list_header *blhdr; 2713 int i; 2714 2715 CHECK_JOURNAL(jnl); 2716 2717 free_old_stuff(jnl); 2718 2719 tr = jnl->active_tr; 2720 2721 // 2722 // if there's no active transaction then we just want to 2723 // call buf_brelse() and return since this is just a block 2724 // that happened to be modified as part of another tr. 2725 // 2726 if (tr == NULL) { 2727 buf_brelse(bp); 2728 return 0; 2729 } 2730 2731 if (jnl->flags & JOURNAL_INVALID) { 2732 /* Still need to buf_brelse(). Callers assume we consume the bp. */ 2733 buf_brelse(bp); 2734 return EINVAL; 2735 } 2736 2737 CHECK_TRANSACTION(tr); 2738 2739 if (jnl->owner != current_thread()) { 2740 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n", 2741 jnl, jnl->owner, current_thread()); 2742 } 2743 2744 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); 2745 2746 // first check if it's already part of this transaction 2747 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { 2748 for (i = 1; i < blhdr->num_blocks; i++) { 2749 if (bp == blhdr->binfo[i].u.bp) { 2750 break; 2751 } 2752 } 2753 2754 if (i < blhdr->num_blocks) { 2755 break; 2756 } 2757 } 2758 2759 // 2760 // if blhdr is null, then this block has only had modify_block_start 2761 // called on it as part of the current transaction. that means that 2762 // it is ok to clear the LOCKED bit since it hasn't actually been 2763 // modified. if blhdr is non-null then modify_block_end was called 2764 // on it and so we need to keep it locked in memory. 2765 // 2766 if (blhdr == NULL) { 2767 buf_clearflags(bp, B_LOCKED); 2768 } 2769 2770 buf_brelse(bp); 2771 return 0; 2772} 2773 2774 2775int 2776journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg) 2777{ 2778 int i = 1; 2779 int tbuffer_offset=0; 2780 block_list_header *blhdr, *prev=NULL; 2781 transaction *tr; 2782 2783 CHECK_JOURNAL(jnl); 2784 2785 free_old_stuff(jnl); 2786 2787 if (jnl->flags & JOURNAL_INVALID) { 2788 /* Still need to buf_brelse(). Callers assume we consume the bp. */ 2789 buf_brelse(bp); 2790 return EINVAL; 2791 } 2792 2793 tr = jnl->active_tr; 2794 CHECK_TRANSACTION(tr); 2795 2796 if (jnl->owner != current_thread()) { 2797 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", 2798 jnl, jnl->owner, current_thread()); 2799 } 2800 2801 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", 2802 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); 2803 2804 if ((buf_flags(bp) & B_LOCKED) == 0) { 2805 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl); 2806 } 2807 2808 // first check if it's already part of this transaction 2809 for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { 2810 tbuffer_offset = jnl->jhdr->blhdr_size; 2811 2812 for (i = 1; i < blhdr->num_blocks; i++) { 2813 if (bp == blhdr->binfo[i].u.bp) { 2814 break; 2815 } 2816 if (blhdr->binfo[i].bnum != (off_t)-1) { 2817 tbuffer_offset += buf_size(blhdr->binfo[i].u.bp); 2818 } else { 2819 tbuffer_offset += blhdr->binfo[i].u.bi.bsize; 2820 } 2821 } 2822 2823 if (i < blhdr->num_blocks) { 2824 break; 2825 } 2826 } 2827 2828 if (blhdr == NULL 2829 && prev 2830 && (prev->num_blocks+1) <= prev->max_blocks 2831 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) { 2832 blhdr = prev; 2833 2834 } else if (blhdr == NULL) { 2835 block_list_header *nblhdr; 2836 if (prev == NULL) { 2837 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp); 2838 } 2839 2840 // we got to the end of the list, didn't find the block and there's 2841 // no room in the block_list_header pointed to by prev 2842 2843 // we allocate another tbuffer and link it in at the end of the list 2844 // through prev->binfo[0].bnum. that's a skanky way to do things but 2845 // avoids having yet another linked list of small data structures to manage. 2846 2847 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { 2848 panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n", 2849 tr, tr->total_bytes); 2850 } 2851 2852 // journal replay code checksum check depends on this. 2853 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE); 2854 // Fill up the rest of the block with unimportant bytes 2855 memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); 2856 2857 // initialize the new guy 2858 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; 2859 nblhdr->num_blocks = 1; // accounts for this header block 2860 nblhdr->bytes_used = jnl->jhdr->blhdr_size; 2861 nblhdr->flags = BLHDR_CHECK_CHECKSUMS; 2862 2863 tr->num_blhdrs++; 2864 tr->total_bytes += jnl->jhdr->blhdr_size; 2865 2866 // then link him in at the end 2867 prev->binfo[0].bnum = (off_t)((long)nblhdr); 2868 2869 // and finally switch to using the new guy 2870 blhdr = nblhdr; 2871 tbuffer_offset = jnl->jhdr->blhdr_size; 2872 i = 1; 2873 } 2874 2875 2876 if ((i+1) > blhdr->max_blocks) { 2877 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks); 2878 } 2879 2880 // if this is true then this is a new block we haven't seen 2881 if (i >= blhdr->num_blocks) { 2882 int bsize; 2883 vnode_t vp; 2884 2885 vp = buf_vnode(bp); 2886 vnode_ref(vp); 2887 bsize = buf_size(bp); 2888 2889 blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp)); 2890 blhdr->binfo[i].u.bp = bp; 2891 2892 KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0); 2893 2894 if (func) { 2895 void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL; 2896 2897 buf_setfilter(bp, func, arg, &old_func, &old_arg); 2898 if (old_func != NULL && old_func != func) { 2899 panic("jnl: modify_block_end: old func %p / arg %p (func %p)", old_func, old_arg, func); 2900 } 2901 } 2902 2903 blhdr->bytes_used += bsize; 2904 tr->total_bytes += bsize; 2905 2906 blhdr->num_blocks++; 2907 } 2908 buf_bdwrite(bp); 2909 2910 return 0; 2911} 2912 2913int 2914journal_kill_block(journal *jnl, struct buf *bp) 2915{ 2916 int i; 2917 int bflags; 2918 block_list_header *blhdr; 2919 transaction *tr; 2920 2921 CHECK_JOURNAL(jnl); 2922 2923 free_old_stuff(jnl); 2924 2925 if (jnl->flags & JOURNAL_INVALID) { 2926 return EINVAL; 2927 } 2928 2929 tr = jnl->active_tr; 2930 CHECK_TRANSACTION(tr); 2931 2932 if (jnl->owner != current_thread()) { 2933 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", 2934 jnl, jnl->owner, current_thread()); 2935 } 2936 2937 bflags = buf_flags(bp); 2938 2939 if ( !(bflags & B_LOCKED)) 2940 panic("jnl: modify_block_end: called with bp not B_LOCKED"); 2941 2942 /* 2943 * bp must be BL_BUSY and B_LOCKED 2944 * first check if it's already part of this transaction 2945 */ 2946 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { 2947 2948 for (i = 1; i < blhdr->num_blocks; i++) { 2949 if (bp == blhdr->binfo[i].u.bp) { 2950 vnode_t vp; 2951 2952 buf_clearflags(bp, B_LOCKED); 2953 2954 // this undoes the vnode_ref() in journal_modify_block_end() 2955 vp = buf_vnode(bp); 2956 vnode_rele_ext(vp, 0, 1); 2957 2958 // if the block has the DELWRI and FILTER bits sets, then 2959 // things are seriously weird. if it was part of another 2960 // transaction then journal_modify_block_start() should 2961 // have force it to be written. 2962 // 2963 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) { 2964 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp); 2965 //} else { 2966 tr->num_killed += buf_size(bp); 2967 //} 2968 blhdr->binfo[i].bnum = (off_t)-1; 2969 blhdr->binfo[i].u.bp = NULL; 2970 blhdr->binfo[i].u.bi.bsize = buf_size(bp); 2971 2972 buf_markinvalid(bp); 2973 buf_brelse(bp); 2974 2975 break; 2976 } 2977 } 2978 2979 if (i < blhdr->num_blocks) { 2980 break; 2981 } 2982 } 2983 2984 return 0; 2985} 2986 2987/* 2988;________________________________________________________________________________ 2989; 2990; Routine: journal_trim_set_callback 2991; 2992; Function: Provide the journal with a routine to be called back when a 2993; TRIM has (or would have) been issued to the device. That 2994; is, the transaction has been flushed to the device, and the 2995; blocks freed by the transaction are now safe for reuse. 2996; 2997; CAUTION: If the journal becomes invalid (eg., due to an I/O 2998; error when trying to write to the journal), this callback 2999; will stop getting called, even if extents got freed before 3000; the journal became invalid! 3001; 3002; Input Arguments: 3003; jnl - The journal structure for the filesystem. 3004; callback - The function to call when the TRIM is complete. 3005; arg - An argument to be passed to callback. 3006;________________________________________________________________________________ 3007*/ 3008__private_extern__ void 3009journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg) 3010{ 3011 jnl->trim_callback = callback; 3012 jnl->trim_callback_arg = arg; 3013} 3014 3015 3016/* 3017;________________________________________________________________________________ 3018; 3019; Routine: journal_trim_realloc 3020; 3021; Function: Increase the amount of memory allocated for the list of extents 3022; to be unmapped (trimmed). This routine will be called when 3023; adding an extent to the list, and the list already occupies 3024; all of the space allocated to it. This routine returns ENOMEM 3025; if unable to allocate more space, or 0 if the extent list was 3026; grown successfully. 3027; 3028; Input Arguments: 3029; trim - The trim list to be resized. 3030; 3031; Output: 3032; (result) - ENOMEM or 0. 3033; 3034; Side effects: 3035; The allocated_count and extents fields of tr->trim are updated 3036; if the function returned 0. 3037;________________________________________________________________________________ 3038*/ 3039static int 3040trim_realloc(journal *jnl, struct jnl_trim_list *trim) 3041{ 3042 void *new_extents; 3043 uint32_t new_allocated_count; 3044 boolean_t was_vm_privileged; 3045 3046 if (jnl_kdebug) 3047 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0); 3048 3049 new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS; 3050 3051 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { 3052 /* 3053 * if we block waiting for memory, and there is enough pressure to 3054 * cause us to try and create a new swap file, we may end up deadlocking 3055 * due to waiting for the journal on the swap file creation path... 3056 * by making ourselves vm_privileged, we give ourselves the best chance 3057 * of not blocking 3058 */ 3059 was_vm_privileged = set_vm_privilege(TRUE); 3060 } 3061 new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t)); 3062 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) 3063 set_vm_privilege(FALSE); 3064 3065 if (new_extents == NULL) { 3066 printf("jnl: trim_realloc: unable to grow extent list!\n"); 3067 /* 3068 * Since we could be called when allocating space previously marked 3069 * to be trimmed, we need to empty out the list to be safe. 3070 */ 3071 trim->extent_count = 0; 3072 if (jnl_kdebug) 3073 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0); 3074 return ENOMEM; 3075 } 3076 3077 /* Copy the old extent list to the newly allocated list. */ 3078 if (trim->extents != NULL) { 3079 memmove(new_extents, 3080 trim->extents, 3081 trim->allocated_count * sizeof(dk_extent_t)); 3082 kfree(trim->extents, 3083 trim->allocated_count * sizeof(dk_extent_t)); 3084 } 3085 3086 trim->allocated_count = new_allocated_count; 3087 trim->extents = new_extents; 3088 3089 if (jnl_kdebug) 3090 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0); 3091 3092 return 0; 3093} 3094 3095/* 3096 ;________________________________________________________________________________ 3097 ; 3098 ; Routine: trim_search_extent 3099 ; 3100 ; Function: Search the given extent list to see if any of its extents 3101 ; overlap the given extent. 3102 ; 3103 ; Input Arguments: 3104 ; trim - The trim list to be searched. 3105 ; offset - The first byte of the range to be searched for. 3106 ; length - The number of bytes of the extent being searched for. 3107 ; overlap_start - start of the overlapping extent 3108 ; overlap_len - length of the overlapping extent 3109 ; 3110 ; Output: 3111 ; (result) - TRUE if one or more extents overlap, FALSE otherwise. 3112 ;________________________________________________________________________________ 3113 */ 3114static int 3115trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, 3116 uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len) 3117{ 3118 uint64_t end = offset + length; 3119 uint32_t lower = 0; /* Lowest index to search */ 3120 uint32_t upper = trim->extent_count; /* Highest index to search + 1 */ 3121 uint32_t middle; 3122 3123 /* A binary search over the extent list. */ 3124 while (lower < upper) { 3125 middle = (lower + upper) / 2; 3126 3127 if (trim->extents[middle].offset >= end) 3128 upper = middle; 3129 else if (trim->extents[middle].offset + trim->extents[middle].length <= offset) 3130 lower = middle + 1; 3131 else { 3132 if (overlap_start) { 3133 *overlap_start = trim->extents[middle].offset; 3134 } 3135 if (overlap_len) { 3136 *overlap_len = trim->extents[middle].length; 3137 } 3138 return TRUE; 3139 } 3140 } 3141 3142 return FALSE; 3143} 3144 3145 3146/* 3147;________________________________________________________________________________ 3148; 3149; Routine: journal_trim_add_extent 3150; 3151; Function: Keep track of extents that have been freed as part of this 3152; transaction. If the underlying device supports TRIM (UNMAP), 3153; then those extents will be trimmed/unmapped once the 3154; transaction has been written to the journal. (For example, 3155; SSDs can support trim/unmap and avoid having to recopy those 3156; blocks when doing wear leveling, and may reuse the same 3157; phsyical blocks for different logical blocks.) 3158; 3159; HFS also uses this, in combination with journal_trim_set_callback, 3160; to add recently freed extents to its free extent cache, but 3161; only after the transaction that freed them is committed to 3162; disk. (This reduces the chance of overwriting live data in 3163; a way that causes data loss if a transaction never gets 3164; written to the journal.) 3165; 3166; Input Arguments: 3167; jnl - The journal for the volume containing the byte range. 3168; offset - The first byte of the range to be trimmed. 3169; length - The number of bytes of the extent being trimmed. 3170;________________________________________________________________________________ 3171*/ 3172__private_extern__ int 3173journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) 3174{ 3175 uint64_t end; 3176 transaction *tr; 3177 dk_extent_t *extent; 3178 uint32_t insert_index; 3179 uint32_t replace_count; 3180 3181 CHECK_JOURNAL(jnl); 3182 3183 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */ 3184 if (jnl->flags & JOURNAL_INVALID) { 3185 return EINVAL; 3186 } 3187 3188 tr = jnl->active_tr; 3189 CHECK_TRANSACTION(tr); 3190 3191 if (jnl_kdebug) 3192 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0); 3193 3194 if (jnl->owner != current_thread()) { 3195 panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", 3196 jnl, jnl->owner, current_thread()); 3197 } 3198 3199 free_old_stuff(jnl); 3200 3201 end = offset + length; 3202 3203 /* 3204 * Find the range of existing extents that can be combined with the 3205 * input extent. We start by counting the number of extents that end 3206 * strictly before the input extent, then count the number of extents 3207 * that overlap or are contiguous with the input extent. 3208 */ 3209 extent = tr->trim.extents; 3210 insert_index = 0; 3211 while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) { 3212 ++insert_index; 3213 ++extent; 3214 } 3215 replace_count = 0; 3216 while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) { 3217 ++replace_count; 3218 ++extent; 3219 } 3220 3221 /* 3222 * If none of the existing extents can be combined with the input extent, 3223 * then just insert it in the list (before item number insert_index). 3224 */ 3225 if (replace_count == 0) { 3226 /* If the list was already full, we need to grow it. */ 3227 if (tr->trim.extent_count == tr->trim.allocated_count) { 3228 if (trim_realloc(jnl, &tr->trim) != 0) { 3229 printf("jnl: trim_add_extent: out of memory!"); 3230 if (jnl_kdebug) 3231 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0); 3232 return ENOMEM; 3233 } 3234 } 3235 3236 /* Shift any existing extents with larger offsets. */ 3237 if (insert_index < tr->trim.extent_count) { 3238 memmove(&tr->trim.extents[insert_index+1], 3239 &tr->trim.extents[insert_index], 3240 (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t)); 3241 } 3242 tr->trim.extent_count++; 3243 3244 /* Store the new extent in the list. */ 3245 tr->trim.extents[insert_index].offset = offset; 3246 tr->trim.extents[insert_index].length = length; 3247 3248 /* We're done. */ 3249 if (jnl_kdebug) 3250 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); 3251 return 0; 3252 } 3253 3254 /* 3255 * Update extent number insert_index to be the union of the input extent 3256 * and all of the replaced extents. 3257 */ 3258 if (tr->trim.extents[insert_index].offset < offset) 3259 offset = tr->trim.extents[insert_index].offset; 3260 extent = &tr->trim.extents[insert_index + replace_count - 1]; 3261 if (extent->offset + extent->length > end) 3262 end = extent->offset + extent->length; 3263 tr->trim.extents[insert_index].offset = offset; 3264 tr->trim.extents[insert_index].length = end - offset; 3265 3266 /* 3267 * If we were replacing more than one existing extent, then shift any 3268 * extents with larger offsets, and update the count of extents. 3269 * 3270 * We're going to leave extent #insert_index alone since it was just updated, above. 3271 * We need to move extents from index (insert_index + replace_count) through the end of 3272 * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1). 3273 */ 3274 if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) { 3275 memmove(&tr->trim.extents[insert_index + 1], 3276 &tr->trim.extents[insert_index + replace_count], 3277 (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t)); 3278 } 3279 tr->trim.extent_count -= replace_count - 1; 3280 3281 if (jnl_kdebug) 3282 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); 3283 return 0; 3284} 3285 3286/* 3287 * journal_trim_extent_overlap 3288 * 3289 * Return 1 if there are any pending TRIMs that overlap with the given offset and length 3290 * Return 0 otherwise. 3291 */ 3292 3293int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) { 3294 transaction *tr = NULL; 3295 int overlap = 0; 3296 3297 uint64_t overlap_start; 3298 uint64_t overlap_len; 3299 tr = jnl->active_tr; 3300 CHECK_TRANSACTION(tr); 3301 3302 /* 3303 * There are two lists that need to be examined for potential overlaps: 3304 * 3305 * The first is the current transaction. Since this function requires that 3306 * a transaction be active when this is called, this is the "active_tr" 3307 * pointer in the journal struct. This has a trimlist pointer which needs 3308 * to be searched. 3309 */ 3310 overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len); 3311 if (overlap == 0) { 3312 /* 3313 * The second is the async trim list, which is only done if the current 3314 * transaction group (active transaction) did not overlap with our target 3315 * extent. This async trim list is the set of all previously 3316 * committed transaction groups whose I/Os are now in-flight. We need to hold the 3317 * trim lock in order to search this list. If we grab the list before the 3318 * TRIM has completed, then we will compare it. If it is grabbed AFTER the 3319 * TRIM has completed, then the pointer will be zeroed out and we won't have 3320 * to check anything. 3321 */ 3322 lck_rw_lock_shared (&jnl->trim_lock); 3323 if (jnl->async_trim != NULL) { 3324 overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len); 3325 } 3326 lck_rw_unlock_shared (&jnl->trim_lock); 3327 } 3328 3329 if (overlap) { 3330 /* compute the end (min) of the overlapping range */ 3331 if ( (overlap_start + overlap_len) < (offset + length)) { 3332 *end = (overlap_start + overlap_len); 3333 } 3334 else { 3335 *end = (offset + length); 3336 } 3337 } 3338 3339 3340 return overlap; 3341} 3342 3343/* 3344 * journal_request_immediate_flush 3345 * 3346 * FS requests that the journal flush immediately upon the 3347 * active transaction's completion. 3348 * 3349 * Returns 0 if operation succeeds 3350 * Returns EPERM if we failed to leave hint 3351 */ 3352int 3353journal_request_immediate_flush (journal *jnl) { 3354 3355 transaction *tr = NULL; 3356 /* 3357 * Is a transaction still in process? You must do 3358 * this while there are txns open 3359 */ 3360 tr = jnl->active_tr; 3361 if (tr != NULL) { 3362 CHECK_TRANSACTION(tr); 3363 tr->flush_on_completion = TRUE; 3364 } 3365 else { 3366 return EPERM; 3367 } 3368 return 0; 3369} 3370 3371 3372 3373/* 3374;________________________________________________________________________________ 3375; 3376; Routine: trim_remove_extent 3377; 3378; Function: Indicate that a range of bytes, some of which may have previously 3379; been passed to journal_trim_add_extent, is now allocated. 3380; Any overlapping ranges currently in the journal's trim list will 3381; be removed. If the underlying device supports TRIM (UNMAP), then 3382; these extents will not be trimmed/unmapped when the transaction 3383; is written to the journal. 3384; 3385; HFS also uses this to prevent newly allocated space from being 3386; added to its free extent cache (if some portion of the newly 3387; allocated space was recently freed). 3388; 3389; Input Arguments: 3390; trim - The trim list to update. 3391; offset - The first byte of the range to be trimmed. 3392; length - The number of bytes of the extent being trimmed. 3393;________________________________________________________________________________ 3394*/ 3395static int 3396trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length) 3397{ 3398 u_int64_t end; 3399 dk_extent_t *extent; 3400 u_int32_t keep_before; 3401 u_int32_t keep_after; 3402 3403 end = offset + length; 3404 3405 /* 3406 * Find any existing extents that start before or end after the input 3407 * extent. These extents will be modified if they overlap the input 3408 * extent. Other extents between them will be deleted. 3409 */ 3410 extent = trim->extents; 3411 keep_before = 0; 3412 while (keep_before < trim->extent_count && extent->offset < offset) { 3413 ++keep_before; 3414 ++extent; 3415 } 3416 keep_after = keep_before; 3417 if (keep_after > 0) { 3418 /* See if previous extent extends beyond both ends of input extent. */ 3419 --keep_after; 3420 --extent; 3421 } 3422 while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) { 3423 ++keep_after; 3424 ++extent; 3425 } 3426 3427 /* 3428 * When we get here, the first keep_before extents (0 .. keep_before-1) 3429 * start before the input extent, and extents (keep_after .. extent_count-1) 3430 * end after the input extent. We'll need to keep, all of those extents, 3431 * but possibly modify #(keep_before-1) and #keep_after to remove the portion 3432 * that overlaps with the input extent. 3433 */ 3434 3435 /* 3436 * Does the input extent start after and end before the same existing 3437 * extent? If so, we have to "punch a hole" in that extent and convert 3438 * it to two separate extents. 3439 */ 3440 if (keep_before > keep_after) { 3441 /* If the list was already full, we need to grow it. */ 3442 if (trim->extent_count == trim->allocated_count) { 3443 if (trim_realloc(jnl, trim) != 0) { 3444 printf("jnl: trim_remove_extent: out of memory!"); 3445 return ENOMEM; 3446 } 3447 } 3448 3449 /* 3450 * Make room for a new extent by shifting extents #keep_after and later 3451 * down by one extent. When we're done, extents #keep_before and 3452 * #keep_after will be identical, and we can fall through to removing 3453 * the portion that overlaps the input extent. 3454 */ 3455 memmove(&trim->extents[keep_before], 3456 &trim->extents[keep_after], 3457 (trim->extent_count - keep_after) * sizeof(dk_extent_t)); 3458 ++trim->extent_count; 3459 ++keep_after; 3460 3461 /* 3462 * Fall through. We now have the case where the length of extent 3463 * #(keep_before - 1) needs to be updated, and the start of extent 3464 * #(keep_after) needs to be updated. 3465 */ 3466 } 3467 3468 /* 3469 * May need to truncate the end of extent #(keep_before - 1) if it overlaps 3470 * the input extent. 3471 */ 3472 if (keep_before > 0) { 3473 extent = &trim->extents[keep_before - 1]; 3474 if (extent->offset + extent->length > offset) { 3475 extent->length = offset - extent->offset; 3476 } 3477 } 3478 3479 /* 3480 * May need to update the start of extent #(keep_after) if it overlaps the 3481 * input extent. 3482 */ 3483 if (keep_after < trim->extent_count) { 3484 extent = &trim->extents[keep_after]; 3485 if (extent->offset < end) { 3486 extent->length = extent->offset + extent->length - end; 3487 extent->offset = end; 3488 } 3489 } 3490 3491 /* 3492 * If there were whole extents that overlapped the input extent, get rid 3493 * of them by shifting any following extents, and updating the count. 3494 */ 3495 if (keep_after > keep_before && keep_after < trim->extent_count) { 3496 memmove(&trim->extents[keep_before], 3497 &trim->extents[keep_after], 3498 (trim->extent_count - keep_after) * sizeof(dk_extent_t)); 3499 } 3500 trim->extent_count -= keep_after - keep_before; 3501 3502 return 0; 3503} 3504 3505/* 3506 ;________________________________________________________________________________ 3507 ; 3508 ; Routine: journal_trim_remove_extent 3509 ; 3510 ; Function: Make note of a range of bytes, some of which may have previously 3511 ; been passed to journal_trim_add_extent, is now in use on the 3512 ; volume. The given bytes will be not be trimmed as part of 3513 ; this transaction, or a pending trim of a transaction being 3514 ; asynchronously flushed. 3515 ; 3516 ; Input Arguments: 3517 ; jnl - The journal for the volume containing the byte range. 3518 ; offset - The first byte of the range to be trimmed. 3519 ; length - The number of bytes of the extent being trimmed. 3520 ;________________________________________________________________________________ 3521 */ 3522__private_extern__ int 3523journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) 3524{ 3525 int error = 0; 3526 transaction *tr; 3527 3528 CHECK_JOURNAL(jnl); 3529 3530 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */ 3531 if (jnl->flags & JOURNAL_INVALID) { 3532 return EINVAL; 3533 } 3534 3535 tr = jnl->active_tr; 3536 CHECK_TRANSACTION(tr); 3537 3538 if (jnl_kdebug) 3539 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0); 3540 3541 if (jnl->owner != current_thread()) { 3542 panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", 3543 jnl, jnl->owner, current_thread()); 3544 } 3545 3546 free_old_stuff(jnl); 3547 3548 error = trim_remove_extent(jnl, &tr->trim, offset, length); 3549 if (error == 0) { 3550 int found = FALSE; 3551 3552 /* 3553 * See if a pending trim has any extents that overlap with the 3554 * one we were given. 3555 */ 3556 lck_rw_lock_shared(&jnl->trim_lock); 3557 if (jnl->async_trim != NULL) 3558 found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL); 3559 lck_rw_unlock_shared(&jnl->trim_lock); 3560 3561 if (found) { 3562 /* 3563 * There was an overlap, so avoid trimming the extent we 3564 * just allocated. (Otherwise, it might get trimmed after 3565 * we've written to it, which will cause that data to be 3566 * corrupted.) 3567 */ 3568 uint32_t async_extent_count = 0; 3569 3570 if (jnl_kdebug) 3571 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0); 3572 lck_rw_lock_exclusive(&jnl->trim_lock); 3573 if (jnl->async_trim != NULL) { 3574 error = trim_remove_extent(jnl, jnl->async_trim, offset, length); 3575 async_extent_count = jnl->async_trim->extent_count; 3576 } 3577 lck_rw_unlock_exclusive(&jnl->trim_lock); 3578 if (jnl_kdebug) 3579 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0); 3580 } 3581 } 3582 3583 if (jnl_kdebug) 3584 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0); 3585 return error; 3586} 3587 3588 3589static int 3590journal_trim_flush(journal *jnl, transaction *tr) 3591{ 3592 int errno = 0; 3593 boolean_t was_vm_privileged; 3594 3595 if (jnl_kdebug) 3596 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0); 3597 3598 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { 3599 /* 3600 * the disk driver can allocate memory on this path... 3601 * if we block waiting for memory, and there is enough pressure to 3602 * cause us to try and create a new swap file, we may end up deadlocking 3603 * due to waiting for the journal on the swap file creation path... 3604 * by making ourselves vm_privileged, we give ourselves the best chance 3605 * of not blocking 3606 */ 3607 was_vm_privileged = set_vm_privilege(TRUE); 3608 } 3609 lck_rw_lock_shared(&jnl->trim_lock); 3610 if (tr->trim.extent_count > 0) { 3611 dk_unmap_t unmap; 3612 3613 bzero(&unmap, sizeof(unmap)); 3614 if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) { 3615 unmap.extents = tr->trim.extents; 3616 unmap.extentsCount = tr->trim.extent_count; 3617 if (jnl_kdebug) 3618 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0); 3619 errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel()); 3620 if (jnl_kdebug) 3621 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0); 3622 } 3623 3624 /* 3625 * Call back into the file system to tell them that we have 3626 * trimmed some extents and that they can now be reused. 3627 * 3628 * CAUTION: If the journal becomes invalid (eg., due to an I/O 3629 * error when trying to write to the journal), this callback 3630 * will stop getting called, even if extents got freed before 3631 * the journal became invalid! 3632 */ 3633 if (jnl->trim_callback) 3634 jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents); 3635 } 3636 lck_rw_unlock_shared(&jnl->trim_lock); 3637 3638 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) 3639 set_vm_privilege(FALSE); 3640 /* 3641 * If the transaction we're flushing was the async transaction, then 3642 * tell the current transaction that there is no pending trim 3643 * any more. 3644 * 3645 * NOTE: Since we released the lock, another thread could have 3646 * removed one or more extents from our list. That's not a 3647 * problem since any writes to the re-allocated blocks 3648 * would get sent to the device after the DKIOCUNMAP. 3649 */ 3650 lck_rw_lock_exclusive(&jnl->trim_lock); 3651 if (jnl->async_trim == &tr->trim) 3652 jnl->async_trim = NULL; 3653 lck_rw_unlock_exclusive(&jnl->trim_lock); 3654 3655 /* 3656 * By the time we get here, no other thread can discover the address 3657 * of "tr", so it is safe for us to manipulate tr->trim without 3658 * holding any locks. 3659 */ 3660 if (tr->trim.extents) { 3661 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); 3662 tr->trim.allocated_count = 0; 3663 tr->trim.extent_count = 0; 3664 tr->trim.extents = NULL; 3665 } 3666 3667 if (jnl_kdebug) 3668 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, errno, 0, 0, 0, 0); 3669 3670 return errno; 3671} 3672 3673static int 3674journal_binfo_cmp(const void *a, const void *b) 3675{ 3676 const block_info *bi_a = (const struct block_info *)a; 3677 const block_info *bi_b = (const struct block_info *)b; 3678 daddr64_t res; 3679 3680 if (bi_a->bnum == (off_t)-1) { 3681 return 1; 3682 } 3683 if (bi_b->bnum == (off_t)-1) { 3684 return -1; 3685 } 3686 3687 // don't have to worry about negative block 3688 // numbers so this is ok to do. 3689 // 3690 res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp)); 3691 3692 return (int)res; 3693} 3694 3695 3696/* 3697 * End a transaction. If the transaction is small enough, and we're not forcing 3698 * a write to disk, the "active" transaction becomes the "current" transaction, 3699 * and will be reused for the next transaction that is started (group commit). 3700 * 3701 * If the transaction gets written to disk (because force_it is true, or no 3702 * group commit, or the transaction is sufficiently full), the blocks get 3703 * written into the journal first, then the are written asynchronously. When 3704 * those async writes complete, the transaction can be freed and removed from 3705 * the journal. 3706 * 3707 * An optional callback can be supplied. If given, it is called after the 3708 * the blocks have been written to the journal, but before the async writes 3709 * of those blocks to their normal on-disk locations. This is used by 3710 * journal_relocate so that the location of the journal can be changed and 3711 * flushed to disk before the blocks get written to their normal locations. 3712 * Note that the callback is only called if the transaction gets written to 3713 * the journal during this end_transaction call; you probably want to set the 3714 * force_it flag. 3715 * 3716 * Inputs: 3717 * tr Transaction to add to the journal 3718 * force_it If true, force this transaction to the on-disk journal immediately. 3719 * callback See description above. Pass NULL for no callback. 3720 * callback_arg Argument passed to callback routine. 3721 * 3722 * Result 3723 * 0 No errors 3724 * -1 An error occurred. The journal is marked invalid. 3725 */ 3726static int 3727end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait) 3728{ 3729 block_list_header *blhdr=NULL, *next=NULL; 3730 int i, ret_val = 0; 3731 errno_t errno; 3732 journal *jnl = tr->jnl; 3733 struct buf *bp; 3734 size_t tbuffer_offset; 3735 boolean_t drop_lock_early; 3736 3737 if (jnl->cur_tr) { 3738 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n", 3739 jnl, jnl->cur_tr, tr); 3740 } 3741 3742 // if there weren't any modified blocks in the transaction 3743 // just save off the transaction pointer and return. 3744 if (tr->total_bytes == jnl->jhdr->blhdr_size) { 3745 jnl->cur_tr = tr; 3746 goto done; 3747 } 3748 3749 // if our transaction buffer isn't very full, just hang 3750 // on to it and don't actually flush anything. this is 3751 // what is known as "group commit". we will flush the 3752 // transaction buffer if it's full or if we have more than 3753 // one of them so we don't start hogging too much memory. 3754 // 3755 // We also check the device supports UNMAP/TRIM, and if so, 3756 // the number of extents waiting to be trimmed. If it is 3757 // small enough, then keep accumulating more (so we can 3758 // reduce the overhead of trimming). If there was a prior 3759 // trim error, then we stop issuing trims for this 3760 // volume, so we can also coalesce transactions. 3761 // 3762 if ( force_it == 0 3763 && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0 3764 && tr->num_blhdrs < 3 3765 && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8)) 3766 && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) { 3767 3768 jnl->cur_tr = tr; 3769 goto done; 3770 } 3771 3772 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0); 3773 3774 lock_condition(jnl, &jnl->flushing, "end_transaction"); 3775 3776 /* 3777 * if the previous 'finish_end_transaction' was being run 3778 * asynchronously, it could have encountered a condition 3779 * that caused it to mark the journal invalid... if that 3780 * occurred while we were waiting for it to finish, we 3781 * need to notice and abort the current transaction 3782 */ 3783 if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) { 3784 unlock_condition(jnl, &jnl->flushing); 3785 3786 abort_transaction(jnl, tr); 3787 ret_val = -1; 3788 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); 3789 goto done; 3790 } 3791 3792 /* 3793 * Store a pointer to this transaction's trim list so that 3794 * future transactions can find it. 3795 * 3796 * Note: if there are no extents in the trim list, then don't 3797 * bother saving the pointer since nothing can add new extents 3798 * to the list (and other threads/transactions only care if 3799 * there is a trim pending). 3800 */ 3801 lck_rw_lock_exclusive(&jnl->trim_lock); 3802 if (jnl->async_trim != NULL) 3803 panic("jnl: end_transaction: async_trim already non-NULL!"); 3804 if (tr->trim.extent_count > 0) 3805 jnl->async_trim = &tr->trim; 3806 lck_rw_unlock_exclusive(&jnl->trim_lock); 3807 3808 /* 3809 * snapshot the transaction sequence number while we are still behind 3810 * the journal lock since it will be bumped upon the start of the 3811 * next transaction group which may overlap the current journal flush... 3812 * we pass the snapshot into write_journal_header during the journal 3813 * flush so that it can write the correct version in the header... 3814 * because we hold the 'flushing' condition variable for the duration 3815 * of the journal flush, 'saved_sequence_num' remains stable 3816 */ 3817 jnl->saved_sequence_num = jnl->sequence_num; 3818 3819 /* 3820 * if we're here we're going to flush the transaction buffer to disk. 3821 * 'check_free_space' will not return untl there is enough free 3822 * space for this transaction in the journal and jnl->old_start[0] 3823 * is avaiable for use 3824 */ 3825 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); 3826 3827 check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num); 3828 3829 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0); 3830 3831 // range check the end index 3832 if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { 3833 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n", 3834 jnl->jhdr->end, jnl->jhdr->size); 3835 } 3836 if (tr->delayed_header_write == TRUE) { 3837 thread_t thread = THREAD_NULL; 3838 3839 lock_condition(jnl, &jnl->writing_header, "end_transaction"); 3840 /* 3841 * fire up a thread to write the journal header 3842 * asynchronously... when it finishes, it will call 3843 * unlock_condition... we can overlap the preparation of 3844 * the log and buffers during this time 3845 */ 3846 kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread); 3847 } else 3848 jnl->write_header_failed = FALSE; 3849 3850 3851 // this transaction starts where the current journal ends 3852 tr->journal_start = jnl->jhdr->end; 3853 3854 lock_oldstart(jnl); 3855 /* 3856 * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy. 3857 * slide everyone else down and put our latest guy in the last 3858 * entry in the old_start array 3859 */ 3860 memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0])); 3861 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL; 3862 3863 unlock_oldstart(jnl); 3864 3865 3866 for (blhdr = tr->blhdr; blhdr; blhdr = next) { 3867 char *blkptr; 3868 buf_t sbp; 3869 int32_t bsize; 3870 3871 tbuffer_offset = jnl->jhdr->blhdr_size; 3872 3873 for (i = 1; i < blhdr->num_blocks; i++) { 3874 3875 if (blhdr->binfo[i].bnum != (off_t)-1) { 3876 void (*func)(buf_t, void *); 3877 void *arg; 3878 3879 bp = blhdr->binfo[i].u.bp; 3880 3881 if (bp == NULL) { 3882 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n", 3883 blhdr->binfo[i].bnum, jnl, tr); 3884 } 3885 /* 3886 * acquire the bp here so that we can safely 3887 * mess around with its data. buf_acquire() 3888 * will return EAGAIN if the buffer was busy, 3889 * so loop trying again. 3890 */ 3891 do { 3892 errno = buf_acquire(bp, BAC_REMOVE, 0, 0); 3893 } while (errno == EAGAIN); 3894 3895 if (errno) 3896 panic("could not acquire bp %p (err %d)\n", bp, errno); 3897 3898 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { 3899 if (jnl->flags & JOURNAL_CLOSE_PENDING) { 3900 buf_clearflags(bp, B_LOCKED); 3901 buf_brelse(bp); 3902 3903 /* 3904 * this is an odd case that appears to happen occasionally 3905 * make sure we mark this block as no longer valid 3906 * so that we don't process it in "finish_end_transaction" since 3907 * the bp that is recorded in our array no longer belongs 3908 * to us (normally we substitute a shadow bp to be processed 3909 * issuing a 'buf_bawrite' on a stale buf_t pointer leads 3910 * to all kinds of problems. 3911 */ 3912 blhdr->binfo[i].bnum = (off_t)-1; 3913 continue; 3914 } else { 3915 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp)); 3916 } 3917 } 3918 bsize = buf_size(bp); 3919 3920 buf_setfilter(bp, NULL, NULL, &func, &arg); 3921 3922 blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; 3923 3924 sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0); 3925 3926 if (sbp == NULL) 3927 panic("jnl: buf_create_shadow returned NULL"); 3928 3929 /* 3930 * copy the data into the transaction buffer... 3931 */ 3932 memcpy(blkptr, (char *)buf_dataptr(bp), bsize); 3933 3934 buf_clearflags(bp, B_LOCKED); 3935 buf_markclean(bp); 3936 buf_drop(bp); 3937 3938 /* 3939 * adopt the shadow buffer for this block 3940 */ 3941 if (func) { 3942 /* 3943 * transfer FS hook function to the 3944 * shadow buffer... it will get called 3945 * in finish_end_transaction 3946 */ 3947 buf_setfilter(sbp, func, arg, NULL, NULL); 3948 } 3949 blhdr->binfo[i].u.bp = sbp; 3950 3951 } else { 3952 // bnum == -1, only true if a block was "killed" 3953 bsize = blhdr->binfo[i].u.bi.bsize; 3954 } 3955 tbuffer_offset += bsize; 3956 } 3957 next = (block_list_header *)((long)blhdr->binfo[0].bnum); 3958 } 3959 /* 3960 * if callback != NULL, we don't want to drop the journal 3961 * lock, or complete end_transaction asynchronously, since 3962 * the caller is expecting the callback to run in the calling 3963 * context 3964 * 3965 * if drop_lock == FALSE, we can't complete end_transaction 3966 * asynchronously 3967 */ 3968 if (callback) 3969 drop_lock_early = FALSE; 3970 else 3971 drop_lock_early = drop_lock; 3972 3973 if (drop_lock_early == FALSE) 3974 must_wait = TRUE; 3975 3976 if (drop_lock_early == TRUE) { 3977 journal_unlock(jnl); 3978 drop_lock = FALSE; 3979 } 3980 if (must_wait == TRUE) 3981 ret_val = finish_end_transaction(tr, callback, callback_arg); 3982 else { 3983 thread_t thread = THREAD_NULL; 3984 3985 /* 3986 * fire up a thread to complete processing this transaction 3987 * asynchronously... when it finishes, it will call 3988 * unlock_condition 3989 */ 3990 kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread); 3991 } 3992 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); 3993done: 3994 if (drop_lock == TRUE) { 3995 journal_unlock(jnl); 3996 } 3997 return (ret_val); 3998} 3999 4000 4001static void 4002finish_end_thread(transaction *tr) 4003{ 4004 proc_set_task_policy(current_task(), current_thread(), 4005 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE); 4006 4007 finish_end_transaction(tr, NULL, NULL); 4008 4009 thread_deallocate(current_thread()); 4010 thread_terminate(current_thread()); 4011} 4012 4013static void 4014write_header_thread(journal *jnl) 4015{ 4016 proc_set_task_policy(current_task(), current_thread(), 4017 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE); 4018 4019 if (write_journal_header(jnl, 1, jnl->saved_sequence_num)) 4020 jnl->write_header_failed = TRUE; 4021 else 4022 jnl->write_header_failed = FALSE; 4023 unlock_condition(jnl, &jnl->writing_header); 4024 4025 thread_deallocate(current_thread()); 4026 thread_terminate(current_thread()); 4027} 4028 4029static int 4030finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) 4031{ 4032 int i, amt; 4033 int ret = 0; 4034 off_t end; 4035 journal *jnl = tr->jnl; 4036 buf_t bp, *bparray; 4037 vnode_t vp; 4038 block_list_header *blhdr=NULL, *next=NULL; 4039 size_t tbuffer_offset; 4040 int bufs_written = 0; 4041 int ret_val = 0; 4042 4043 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0); 4044 4045 end = jnl->jhdr->end; 4046 4047 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { 4048 boolean_t was_vm_privileged; 4049 4050 amt = blhdr->bytes_used; 4051 4052 blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num; 4053 4054 blhdr->checksum = 0; 4055 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); 4056 4057 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { 4058 /* 4059 * if we block waiting for memory, and there is enough pressure to 4060 * cause us to try and create a new swap file, we may end up deadlocking 4061 * due to waiting for the journal on the swap file creation path... 4062 * by making ourselves vm_privileged, we give ourselves the best chance 4063 * of not blocking 4064 */ 4065 was_vm_privileged = set_vm_privilege(TRUE); 4066 } 4067 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) { 4068 panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *)); 4069 } 4070 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) 4071 set_vm_privilege(FALSE); 4072 4073 tbuffer_offset = jnl->jhdr->blhdr_size; 4074 4075 for (i = 1; i < blhdr->num_blocks; i++) { 4076 void (*func)(buf_t, void *); 4077 void *arg; 4078 int32_t bsize; 4079 4080 /* 4081 * finish preparing the shadow buf_t before 4082 * calculating the individual block checksums 4083 */ 4084 if (blhdr->binfo[i].bnum != (off_t)-1) { 4085 daddr64_t blkno; 4086 daddr64_t lblkno; 4087 4088 bp = blhdr->binfo[i].u.bp; 4089 4090 vp = buf_vnode(bp); 4091 blkno = buf_blkno(bp); 4092 lblkno = buf_lblkno(bp); 4093 4094 if (vp == NULL && lblkno == blkno) { 4095 printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n", 4096 jnl->jdev_name, bp, lblkno, blkno, tr, jnl); 4097 ret_val = -1; 4098 goto bad_journal; 4099 } 4100 4101 // if the lblkno is the same as blkno and this bp isn't 4102 // associated with the underlying file system device then 4103 // we need to call bmap() to get the actual physical block. 4104 // 4105 if ((lblkno == blkno) && (vp != jnl->fsdev)) { 4106 off_t f_offset; 4107 size_t contig_bytes; 4108 4109 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) { 4110 printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); 4111 ret_val = -1; 4112 goto bad_journal; 4113 } 4114 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) { 4115 printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); 4116 ret_val = -1; 4117 goto bad_journal; 4118 } 4119 if ((uint32_t)contig_bytes < buf_count(bp)) { 4120 printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl); 4121 ret_val = -1; 4122 goto bad_journal; 4123 } 4124 buf_setblkno(bp, blkno); 4125 } 4126 // update this so we write out the correct physical block number! 4127 blhdr->binfo[i].bnum = (off_t)(blkno); 4128 4129 /* 4130 * pick up the FS hook function (if any) and prepare 4131 * to fire this buffer off in the next pass 4132 */ 4133 buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg); 4134 4135 if (func) { 4136 /* 4137 * call the hook function supplied by the filesystem... 4138 * this needs to happen BEFORE cacl_checksum in case 4139 * the FS morphs the data in the buffer 4140 */ 4141 func(bp, arg); 4142 } 4143 bparray[i] = bp; 4144 bsize = buf_size(bp); 4145 blhdr->binfo[i].u.bi.bsize = bsize; 4146 blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize); 4147 } else { 4148 bparray[i] = NULL; 4149 bsize = blhdr->binfo[i].u.bi.bsize; 4150 blhdr->binfo[i].u.bi.b.cksum = 0; 4151 } 4152 tbuffer_offset += bsize; 4153 } 4154 /* 4155 * if we fired off the journal_write_header asynchronously in 4156 * 'end_transaction', we need to wait for its completion 4157 * before writing the actual journal data 4158 */ 4159 wait_condition(jnl, &jnl->writing_header, "finish_end_transaction"); 4160 4161 if (jnl->write_header_failed == FALSE) 4162 ret = write_journal_data(jnl, &end, blhdr, amt); 4163 else 4164 ret_val = -1; 4165 /* 4166 * put the bp pointers back so that we can 4167 * make the final pass on them 4168 */ 4169 for (i = 1; i < blhdr->num_blocks; i++) 4170 blhdr->binfo[i].u.bp = bparray[i]; 4171 4172 kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *)); 4173 4174 if (ret_val == -1) 4175 goto bad_journal; 4176 4177 if (ret != amt) { 4178 printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n", 4179 jnl->jdev_name, ret, amt); 4180 4181 ret_val = -1; 4182 goto bad_journal; 4183 } 4184 } 4185 jnl->jhdr->end = end; // update where the journal now ends 4186 tr->journal_end = end; // the transaction ends here too 4187 4188 if (tr->journal_start == 0 || tr->journal_end == 0) { 4189 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n", 4190 tr->journal_start, tr->journal_end); 4191 } 4192 4193 if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) { 4194 ret_val = -1; 4195 goto bad_journal; 4196 } 4197 /* 4198 * If the caller supplied a callback, call it now that the blocks have been 4199 * written to the journal. This is used by journal_relocate so, for example, 4200 * the file system can change its pointer to the new journal. 4201 */ 4202 if (callback != NULL && callback(callback_arg) != 0) { 4203 ret_val = -1; 4204 goto bad_journal; 4205 } 4206 4207 // 4208 // Send a DKIOCUNMAP for the extents trimmed by this transaction, and 4209 // free up the extent list. 4210 // 4211 journal_trim_flush(jnl, tr); 4212 4213 // the buffer_flushed_callback will only be called for the 4214 // real blocks that get flushed so we have to account for 4215 // the block_list_headers here. 4216 // 4217 tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size; 4218 4219 lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction"); 4220 4221 // 4222 // setup for looping through all the blhdr's. 4223 // 4224 for (blhdr = tr->blhdr; blhdr; blhdr = next) { 4225 uint16_t num_blocks; 4226 4227 /* 4228 * grab this info ahead of issuing the buf_bawrites... 4229 * once the last one goes out, its possible for blhdr 4230 * to be freed (especially if we get preempted) before 4231 * we do the last check of num_blocks or 4232 * grab the next blhdr pointer... 4233 */ 4234 next = (block_list_header *)((long)blhdr->binfo[0].bnum); 4235 num_blocks = blhdr->num_blocks; 4236 4237 /* 4238 * we can re-order the buf ptrs because everything is written out already 4239 */ 4240 qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp); 4241 4242 /* 4243 * need to make sure that the loop issuing the buf_bawrite's 4244 * does not touch blhdr once the last buf_bawrite has been 4245 * issued... at that point, we no longer have a legitmate 4246 * reference on the associated storage since it will be 4247 * released upon the completion of that last buf_bawrite 4248 */ 4249 for (i = num_blocks-1; i >= 1; i--) { 4250 if (blhdr->binfo[i].bnum != (off_t)-1) 4251 break; 4252 num_blocks--; 4253 } 4254 for (i = 1; i < num_blocks; i++) { 4255 4256 if ((bp = blhdr->binfo[i].u.bp)) { 4257 vp = buf_vnode(bp); 4258 4259 buf_bawrite(bp); 4260 4261 // this undoes the vnode_ref() in journal_modify_block_end() 4262 vnode_rele_ext(vp, 0, 1); 4263 4264 bufs_written++; 4265 } 4266 } 4267 } 4268 if (bufs_written == 0) { 4269 /* 4270 * since we didn't issue any buf_bawrite's, there is no 4271 * async trigger to cause the memory associated with this 4272 * transaction to be freed... so, move it to the garbage 4273 * list now 4274 */ 4275 lock_oldstart(jnl); 4276 4277 tr->next = jnl->tr_freeme; 4278 jnl->tr_freeme = tr; 4279 4280 unlock_oldstart(jnl); 4281 4282 unlock_condition(jnl, &jnl->asyncIO); 4283 } 4284 4285 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n", 4286 // tr, tr->journal_start, tr->journal_end); 4287 4288bad_journal: 4289 if (ret_val == -1) { 4290 /* 4291 * 'flush_aborted' is protected by the flushing condition... we need to 4292 * set it before dropping the condition so that it will be 4293 * noticed in 'end_transaction'... we add this additional 4294 * aborted condition so that we can drop the 'flushing' condition 4295 * before grabbing the journal lock... this avoids a deadlock 4296 * in 'end_transaction' which is holding the journal lock while 4297 * waiting for the 'flushing' condition to clear... 4298 * everyone else will notice the JOURNAL_INVALID flag 4299 */ 4300 jnl->flush_aborted = TRUE; 4301 4302 unlock_condition(jnl, &jnl->flushing); 4303 journal_lock(jnl); 4304 4305 jnl->flags |= JOURNAL_INVALID; 4306 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; 4307 abort_transaction(jnl, tr); // cleans up list of extents to be trimmed 4308 4309 journal_unlock(jnl); 4310 } else 4311 unlock_condition(jnl, &jnl->flushing); 4312 4313 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0); 4314 4315 return (ret_val); 4316} 4317 4318 4319static void 4320lock_condition(journal *jnl, boolean_t *condition, const char *condition_name) 4321{ 4322 4323 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0); 4324 4325 lock_flush(jnl); 4326 4327 while (*condition == TRUE) 4328 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); 4329 4330 *condition = TRUE; 4331 unlock_flush(jnl); 4332 4333 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0); 4334} 4335 4336static void 4337wait_condition(journal *jnl, boolean_t *condition, const char *condition_name) 4338{ 4339 4340 if (*condition == FALSE) 4341 return; 4342 4343 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0); 4344 4345 lock_flush(jnl); 4346 4347 while (*condition == TRUE) 4348 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); 4349 4350 unlock_flush(jnl); 4351 4352 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0); 4353} 4354 4355static void 4356unlock_condition(journal *jnl, boolean_t *condition) 4357{ 4358 lock_flush(jnl); 4359 4360 *condition = FALSE; 4361 wakeup(condition); 4362 4363 unlock_flush(jnl); 4364} 4365 4366static void 4367abort_transaction(journal *jnl, transaction *tr) 4368{ 4369 block_list_header *blhdr, *next; 4370 4371 // for each block list header, iterate over the blocks then 4372 // free up the memory associated with the block list. 4373 // 4374 // find each of the primary blocks (i.e. the list could 4375 // contain a mix of shadowed and real buf_t's depending 4376 // on when the abort condition was detected) and mark them 4377 // clean and locked in the cache... this at least allows 4378 // the FS a consistent view between it's incore data structures 4379 // and the meta-data held in the cache 4380 // 4381 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0); 4382 4383 for (blhdr = tr->blhdr; blhdr; blhdr = next) { 4384 int i; 4385 4386 for (i = 1; i < blhdr->num_blocks; i++) { 4387 buf_t bp, tbp, sbp; 4388 vnode_t bp_vp; 4389 errno_t errno; 4390 4391 if (blhdr->binfo[i].bnum == (off_t)-1) 4392 continue; 4393 4394 tbp = blhdr->binfo[i].u.bp; 4395 4396 bp_vp = buf_vnode(tbp); 4397 4398 buf_setfilter(tbp, NULL, NULL, NULL, NULL); 4399 4400 if (buf_shadow(tbp)) 4401 sbp = tbp; 4402 else 4403 sbp = NULL; 4404 4405 if (bp_vp) { 4406 errno = buf_meta_bread(bp_vp, 4407 buf_lblkno(tbp), 4408 buf_size(tbp), 4409 NOCRED, 4410 &bp); 4411 if (errno == 0) { 4412 if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) { 4413 panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n", 4414 bp, tbp, jnl); 4415 } 4416 /* 4417 * once the journal has been marked INVALID and aborted, 4418 * NO meta data can be written back to the disk, so 4419 * mark the buf_t clean and make sure it's locked in the cache 4420 * note: if we found a shadow, the real buf_t needs to be relocked 4421 */ 4422 buf_setflags(bp, B_LOCKED); 4423 buf_markclean(bp); 4424 buf_brelse(bp); 4425 4426 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0); 4427 4428 /* 4429 * this undoes the vnode_ref() in journal_modify_block_end() 4430 */ 4431 vnode_rele_ext(bp_vp, 0, 1); 4432 } else { 4433 printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n", 4434 jnl->jdev_name, blhdr->binfo[i].bnum, tbp); 4435 if (bp) { 4436 buf_brelse(bp); 4437 } 4438 } 4439 } 4440 if (sbp) 4441 buf_brelse(sbp); 4442 } 4443 next = (block_list_header *)((long)blhdr->binfo[0].bnum); 4444 4445 // we can free blhdr here since we won't need it any more 4446 blhdr->binfo[0].bnum = 0xdeadc0de; 4447 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); 4448 } 4449 4450 /* 4451 * If the transaction we're aborting was the async transaction, then 4452 * tell the current transaction that there is no pending trim 4453 * any more. 4454 */ 4455 lck_rw_lock_exclusive(&jnl->trim_lock); 4456 if (jnl->async_trim == &tr->trim) 4457 jnl->async_trim = NULL; 4458 lck_rw_unlock_exclusive(&jnl->trim_lock); 4459 4460 4461 if (tr->trim.extents) { 4462 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); 4463 } 4464 tr->trim.allocated_count = 0; 4465 tr->trim.extent_count = 0; 4466 tr->trim.extents = NULL; 4467 tr->tbuffer = NULL; 4468 tr->blhdr = NULL; 4469 tr->total_bytes = 0xdbadc0de; 4470 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); 4471 4472 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0); 4473} 4474 4475 4476int 4477journal_end_transaction(journal *jnl) 4478{ 4479 int ret; 4480 transaction *tr; 4481 4482 CHECK_JOURNAL(jnl); 4483 4484 free_old_stuff(jnl); 4485 4486 if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { 4487 return 0; 4488 } 4489 4490 if (jnl->owner != current_thread()) { 4491 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n", 4492 jnl, jnl->owner, current_thread()); 4493 } 4494 jnl->nested_count--; 4495 4496 if (jnl->nested_count > 0) { 4497 return 0; 4498 } else if (jnl->nested_count < 0) { 4499 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); 4500 } 4501 4502 if (jnl->flags & JOURNAL_INVALID) { 4503 if (jnl->active_tr) { 4504 if (jnl->cur_tr != NULL) { 4505 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n", 4506 jnl, jnl->active_tr, jnl->cur_tr); 4507 } 4508 tr = jnl->active_tr; 4509 jnl->active_tr = NULL; 4510 4511 abort_transaction(jnl, tr); 4512 } 4513 journal_unlock(jnl); 4514 4515 return EINVAL; 4516 } 4517 4518 tr = jnl->active_tr; 4519 CHECK_TRANSACTION(tr); 4520 4521 // clear this out here so that when check_free_space() calls 4522 // the FS flush function, we don't panic in journal_flush() 4523 // if the FS were to call that. note: check_free_space() is 4524 // called from end_transaction(). 4525 // 4526 jnl->active_tr = NULL; 4527 4528 /* Examine the force-journal-flush state in the active txn */ 4529 if (tr->flush_on_completion == TRUE) { 4530 /* 4531 * If the FS requested it, disallow group commit and force the 4532 * transaction out to disk immediately. 4533 */ 4534 ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE); 4535 } 4536 else { 4537 /* in the common path we can simply use the double-buffered journal */ 4538 ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE); 4539 } 4540 4541 return ret; 4542} 4543 4544 4545/* 4546 * Flush the contents of the journal to the disk. 4547 * 4548 * Input: 4549 * wait_for_IO - 4550 * If TRUE, wait to write in-memory journal to the disk 4551 * consistently, and also wait to write all asynchronous 4552 * metadata blocks to its corresponding locations 4553 * consistently on the disk. This means that the journal 4554 * is empty at this point and does not contain any 4555 * transactions. This is overkill in normal scenarios 4556 * but is useful whenever the metadata blocks are required 4557 * to be consistent on-disk instead of just the journal 4558 * being consistent; like before live verification 4559 * and live volume resizing. 4560 * 4561 * If FALSE, only wait to write in-memory journal to the 4562 * disk consistently. This means that the journal still 4563 * contains uncommitted transactions and the file system 4564 * metadata blocks in the journal transactions might be 4565 * written asynchronously to the disk. But there is no 4566 * guarantee that they are written to the disk before 4567 * returning to the caller. Note that this option is 4568 * sufficient for file system data integrity as it 4569 * guarantees consistent journal content on the disk. 4570 */ 4571int 4572journal_flush(journal *jnl, boolean_t wait_for_IO) 4573{ 4574 boolean_t drop_lock = FALSE; 4575 4576 CHECK_JOURNAL(jnl); 4577 4578 free_old_stuff(jnl); 4579 4580 if (jnl->flags & JOURNAL_INVALID) { 4581 return -1; 4582 } 4583 4584 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0); 4585 4586 if (jnl->owner != current_thread()) { 4587 journal_lock(jnl); 4588 drop_lock = TRUE; 4589 } 4590 4591 // if we're not active, flush any buffered transactions 4592 if (jnl->active_tr == NULL && jnl->cur_tr) { 4593 transaction *tr = jnl->cur_tr; 4594 4595 jnl->cur_tr = NULL; 4596 4597 if (wait_for_IO) { 4598 wait_condition(jnl, &jnl->flushing, "journal_flush"); 4599 wait_condition(jnl, &jnl->asyncIO, "journal_flush"); 4600 } 4601 /* 4602 * "end_transction" will wait for any current async flush 4603 * to complete, before flushing "cur_tr"... because we've 4604 * specified the 'must_wait' arg as TRUE, it will then 4605 * synchronously flush the "cur_tr" 4606 */ 4607 end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed 4608 4609 } else { 4610 if (drop_lock == TRUE) { 4611 journal_unlock(jnl); 4612 } 4613 4614 /* Because of pipelined journal, the journal transactions 4615 * might be in process of being flushed on another thread. 4616 * If there is nothing to flush currently, we should 4617 * synchronize ourselves with the pipelined journal thread 4618 * to ensure that all inflight transactions, if any, are 4619 * flushed before we return success to caller. 4620 */ 4621 wait_condition(jnl, &jnl->flushing, "journal_flush"); 4622 } 4623 if (wait_for_IO) { 4624 wait_condition(jnl, &jnl->asyncIO, "journal_flush"); 4625 } 4626 4627 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl, 0, 0, 0, 0); 4628 4629 return 0; 4630} 4631 4632int 4633journal_active(journal *jnl) 4634{ 4635 if (jnl->flags & JOURNAL_INVALID) { 4636 return -1; 4637 } 4638 4639 return (jnl->active_tr == NULL) ? 0 : 1; 4640} 4641 4642void * 4643journal_owner(journal *jnl) 4644{ 4645 return jnl->owner; 4646} 4647 4648int journal_uses_fua(journal *jnl) 4649{ 4650 if (jnl->flags & JOURNAL_DO_FUA_WRITES) 4651 return 1; 4652 return 0; 4653} 4654 4655/* 4656 * Relocate the journal. 4657 * 4658 * You provide the new starting offset and size for the journal. You may 4659 * optionally provide a new tbuffer_size; passing zero defaults to not 4660 * changing the tbuffer size except as needed to fit within the new journal 4661 * size. 4662 * 4663 * You must have already started a transaction. The transaction may contain 4664 * modified blocks (such as those needed to deallocate the old journal, 4665 * allocate the new journal, and update the location and size of the journal 4666 * in filesystem-private structures). Any transactions prior to the active 4667 * transaction will be flushed to the old journal. The new journal will be 4668 * initialized, and the blocks from the active transaction will be written to 4669 * the new journal. 4670 * 4671 * The caller will need to update the structures that identify the location 4672 * and size of the journal. These updates should be made in the supplied 4673 * callback routine. These updates must NOT go into a transaction. You should 4674 * force these updates to the media before returning from the callback. In the 4675 * even of a crash, either the old journal will be found, with an empty journal, 4676 * or the new journal will be found with the contents of the active transaction. 4677 * 4678 * Upon return from the callback, the blocks from the active transaction are 4679 * written to their normal locations on disk. 4680 * 4681 * (Remember that we have to ensure that blocks get committed to the journal 4682 * before being committed to their normal locations. But the blocks don't count 4683 * as committed until the new journal is pointed at.) 4684 * 4685 * Upon return, there is still an active transaction: newly allocated, and 4686 * with no modified blocks. Call journal_end_transaction as normal. You may 4687 * modifiy additional blocks before calling journal_end_transaction, and those 4688 * blocks will (eventually) go to the relocated journal. 4689 * 4690 * Inputs: 4691 * jnl The (opened) journal to relocate. 4692 * offset The new journal byte offset (from start of the journal device). 4693 * journal_size The size, in bytes, of the new journal. 4694 * tbuffer_size The new desired transaction buffer size. Pass zero to keep 4695 * the same size as the current journal. The size will be 4696 * modified as needed to fit the new journal. 4697 * callback Routine called after the new journal has been initialized, 4698 * and the active transaction written to the new journal, but 4699 * before the blocks are written to their normal locations. 4700 * Pass NULL for no callback. 4701 * callback_arg An argument passed to the callback routine. 4702 * 4703 * Result: 4704 * 0 No errors 4705 * EINVAL The offset is not block aligned 4706 * EINVAL The journal_size is not a multiple of the block size 4707 * EINVAL The journal is invalid 4708 * (any) An error returned by journal_flush. 4709 * 4710 */ 4711int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size, 4712 errno_t (*callback)(void *), void *callback_arg) 4713{ 4714 int ret; 4715 transaction *tr; 4716 size_t i = 0; 4717 4718 /* 4719 * Sanity check inputs, and adjust the size of the transaction buffer. 4720 */ 4721 if ((offset % jnl->jhdr->jhdr_size) != 0) { 4722 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n", 4723 jnl->jdev_name, offset, jnl->jhdr->jhdr_size); 4724 return EINVAL; 4725 } 4726 if ((journal_size % jnl->jhdr->jhdr_size) != 0) { 4727 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n", 4728 jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size); 4729 return EINVAL; 4730 } 4731 4732 CHECK_JOURNAL(jnl); 4733 4734 /* Guarantee we own the active transaction. */ 4735 if (jnl->flags & JOURNAL_INVALID) { 4736 return EINVAL; 4737 } 4738 if (jnl->owner != current_thread()) { 4739 panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n", 4740 jnl, jnl->owner, current_thread()); 4741 } 4742 4743 if (tbuffer_size == 0) 4744 tbuffer_size = jnl->tbuffer_size; 4745 size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size); 4746 4747 /* 4748 * Flush any non-active transactions. We have to temporarily hide the 4749 * active transaction to make journal_flush flush out non-active but 4750 * current (unwritten) transactions. 4751 */ 4752 tr = jnl->active_tr; 4753 CHECK_TRANSACTION(tr); 4754 jnl->active_tr = NULL; 4755 ret = journal_flush(jnl, TRUE); 4756 jnl->active_tr = tr; 4757 4758 if (ret) { 4759 return ret; 4760 } 4761 wait_condition(jnl, &jnl->flushing, "end_transaction"); 4762 4763 /* 4764 * At this point, we have completely flushed the contents of the current 4765 * journal to disk (and have asynchronously written all of the txns to 4766 * their actual desired locations). As a result, we can (and must) clear 4767 * out the old_start array. If we do not, then if the last written transaction 4768 * started at the beginning of the journal (starting 1 block into the 4769 * journal file) it could confuse the buffer_flushed callback. This is 4770 * because we're about to reset the start/end pointers of the journal header 4771 * below. 4772 */ 4773 lock_oldstart(jnl); 4774 for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) { 4775 jnl->old_start[i] = 0; 4776 } 4777 unlock_oldstart(jnl); 4778 4779 /* Update the journal's offset and size in memory. */ 4780 jnl->jdev_offset = offset; 4781 jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size; 4782 jnl->jhdr->size = journal_size; 4783 jnl->active_start = jnl->jhdr->start; 4784 4785 /* 4786 * Force the active transaction to be written to the new journal. Call the 4787 * supplied callback after the blocks have been written to the journal, but 4788 * before they get written to their normal on-disk locations. 4789 */ 4790 jnl->active_tr = NULL; 4791 ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE); 4792 if (ret) { 4793 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret); 4794 goto bad_journal; 4795 } 4796 4797 /* 4798 * Create a new, empty transaction to be the active transaction. This way 4799 * our caller can use journal_end_transaction as usual. 4800 */ 4801 ret = journal_allocate_transaction(jnl); 4802 if (ret) { 4803 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret); 4804 goto bad_journal; 4805 } 4806 4807 return 0; 4808 4809bad_journal: 4810 jnl->flags |= JOURNAL_INVALID; 4811 abort_transaction(jnl, tr); 4812 return ret; 4813} 4814 4815 4816#else // !JOURNALING - so provide stub functions 4817 4818int journal_uses_fua(__unused journal *jnl) 4819{ 4820 return 0; 4821} 4822 4823journal * 4824journal_create(__unused struct vnode *jvp, 4825 __unused off_t offset, 4826 __unused off_t journal_size, 4827 __unused struct vnode *fsvp, 4828 __unused size_t min_fs_blksz, 4829 __unused int32_t flags, 4830 __unused int32_t tbuffer_size, 4831 __unused void (*flush)(void *arg), 4832 __unused void *arg, 4833 __unused struct mount *fsmount) 4834{ 4835 return NULL; 4836} 4837 4838journal * 4839journal_open(__unused struct vnode *jvp, 4840 __unused off_t offset, 4841 __unused off_t journal_size, 4842 __unused struct vnode *fsvp, 4843 __unused size_t min_fs_blksz, 4844 __unused int32_t flags, 4845 __unused int32_t tbuffer_size, 4846 __unused void (*flush)(void *arg), 4847 __unused void *arg, 4848 __unused struct mount *fsmount) 4849{ 4850 return NULL; 4851} 4852 4853 4854int 4855journal_modify_block_start(__unused journal *jnl, __unused struct buf *bp) 4856{ 4857 return EINVAL; 4858} 4859 4860int 4861journal_modify_block_end(__unused journal *jnl, 4862 __unused struct buf *bp, 4863 __unused void (*func)(struct buf *bp, void *arg), 4864 __unused void *arg) 4865{ 4866 return EINVAL; 4867} 4868 4869int 4870journal_kill_block(__unused journal *jnl, __unused struct buf *bp) 4871{ 4872 return EINVAL; 4873} 4874 4875int journal_relocate(__unused journal *jnl, 4876 __unused off_t offset, 4877 __unused off_t journal_size, 4878 __unused int32_t tbuffer_size, 4879 __unused errno_t (*callback)(void *), 4880 __unused void *callback_arg) 4881{ 4882 return EINVAL; 4883} 4884 4885void 4886journal_close(__unused journal *jnl) 4887{ 4888} 4889 4890int 4891journal_start_transaction(__unused journal *jnl) 4892{ 4893 return EINVAL; 4894} 4895 4896int 4897journal_end_transaction(__unused journal *jnl) 4898{ 4899 return EINVAL; 4900} 4901 4902int 4903journal_flush(__unused journal *jnl, __unused boolean_t wait_for_IO) 4904{ 4905 return EINVAL; 4906} 4907 4908int 4909journal_is_clean(__unused struct vnode *jvp, 4910 __unused off_t offset, 4911 __unused off_t journal_size, 4912 __unused struct vnode *fsvp, 4913 __unused size_t min_fs_block_size) 4914{ 4915 return 0; 4916} 4917 4918 4919void * 4920journal_owner(__unused journal *jnl) 4921{ 4922 return NULL; 4923} 4924 4925void 4926journal_lock(__unused journal *jnl) 4927{ 4928 return; 4929} 4930 4931void 4932journal_unlock(__unused journal *jnl) 4933{ 4934 return; 4935} 4936 4937__private_extern__ int 4938journal_trim_add_extent(__unused journal *jnl, 4939 __unused uint64_t offset, 4940 __unused uint64_t length) 4941{ 4942 return 0; 4943} 4944 4945int 4946journal_request_immediate_flush(__unused journal *jnl) 4947{ 4948 return 0; 4949} 4950 4951__private_extern__ int 4952journal_trim_remove_extent(__unused journal *jnl, 4953 __unused uint64_t offset, 4954 __unused uint64_t length) 4955{ 4956 return 0; 4957} 4958 4959int journal_trim_extent_overlap(__unused journal *jnl, 4960 __unused uint64_t offset, 4961 __unused uint64_t length, 4962 __unused uint64_t *end) 4963{ 4964 return 0; 4965} 4966 4967#endif // !JOURNALING 4968