1/* 2 * Copyright (c) 2010-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <stdio.h> 30#include <stddef.h> 31#include <stdlib.h> 32#include <string.h> 33#include <limits.h> 34#include <err.h> 35#include <errno.h> 36#include <fcntl.h> 37#include <unistd.h> 38#include <stdarg.h> 39#include <sys/types.h> 40#include <sys/param.h> 41#include <sys/stat.h> 42#include <sys/ioctl.h> 43#include <sys/disk.h> 44#include <sys/param.h> 45 46#include "../fsck_hfs.h" 47#include "fsck_journal.h" 48 49#define DEBUG_JOURNAL 0 50 51extern char debug; 52 53#include <hfs/hfs_format.h> 54#include <libkern/OSByteOrder.h> 55 56typedef struct SwapType { 57 const char *name; 58 uint16_t (^swap16)(uint16_t); 59 uint32_t (^swap32)(uint32_t); 60 uint64_t (^swap64)(uint64_t); 61} swapper_t; 62 63static swapper_t nativeEndian = { 64 "native endian", 65 ^(uint16_t x) { return x; }, 66 ^(uint32_t x) { return x; }, 67 ^(uint64_t x) { return x; } 68}; 69 70static swapper_t swappedEndian = { 71 "swapped endian", 72 ^(uint16_t x) { return OSSwapInt16(x); }, 73 ^(uint32_t x) { return OSSwapInt32(x); }, 74 ^(uint64_t x) { return OSSwapInt64(x); } 75}; 76 77typedef int (^journal_write_block_t)(off_t, void *, size_t); 78 79// 80// this isn't a great checksum routine but it will do for now. 81// we use it to checksum the journal header and the block list 82// headers that are at the start of each transaction. 83// 84static uint32_t 85calc_checksum(char *ptr, int len) 86{ 87 int i; 88 uint32_t cksum = 0; 89 90 // this is a lame checksum but for now it'll do 91 for(i = 0; i < len; i++, ptr++) { 92 cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr); 93 } 94 95 return (~cksum); 96} 97 98typedef struct JournalIOInfo { 99 int jfd; // File descriptor for journal buffer 100 int wrapCount; // Incremented when it wraps around. 101 size_t bSize; // Block size. I/O needs to be done in that amount. 102 uint64_t base; // Base offset of journal buffer, past the header 103 uint64_t size; // Size of the journal, minus the header size 104 uint64_t end; // End of the journal (initially the "end" field from the journal header) 105 uint64_t current; // Current offset; starts at "start" 106} JournalIOInfo_t; 107 108/* 109 * Attempt to read <length> bytes from the journal buffer. 110 * Since this is a wrapped buffer, it may have to start at the 111 * beginning. info->{base, size, end} are read-only; info->current 112 * is updated with the current offset. It returns the number of bytes 113 * it read, or -1 on error. 114 */ 115static ssize_t 116journalRead(JournalIOInfo_t *info, uint8_t *buffer, size_t length) 117{ 118 size_t nread = 0; 119 uint8_t *ptr = buffer; 120 121// fprintf(stderr, "%s(%p, %p, %zu)\n", __FUNCTION__, info, buffer, length); 122 if (info->wrapCount > 1) { 123 fplog(stderr, "%s(%p, %p, %zu): journal buffer wrap count = %d\n", __FUNCTION__, info, buffer, length, info->wrapCount); 124 return -1; 125 } 126 while (nread < length) { 127 off_t end; 128 size_t amt; 129 ssize_t n; 130 131 if (info->end < info->current) { 132 // It wraps, so we max out at bse+size 133 end = info->base + info->size; 134 } else { 135 end = info->end; 136 } 137 amt = MIN((length - nread), (end - info->current)); 138 if (amt == 0) { 139 if (debug) { 140 fplog(stderr, "Journal read amount is 0, is that right?\n"); 141 } 142 goto done; 143 } 144 145 n = pread(info->jfd, ptr, amt, info->current); 146 if (n == -1) { 147 warn("pread(%d, %p, %zu, %llu)", info->jfd, ptr, amt, info->current); 148 goto done; 149 } 150 if (n != amt) { 151 if (debug) { 152 fplog(stderr, "%s(%d): Wanted to read %zu, but only read %zd\n", __FUNCTION__, __LINE__, amt, n); 153 } 154 } 155 nread += n; 156 ptr += n; 157 info->current += n; 158 if (info->current == (info->base + info->size)) { 159 info->current = info->base; 160 info->wrapCount++; 161 } 162 } 163done: 164 return nread; 165} 166 167/* 168 * Read a transaction from the journal buffer. 169 * A transaction is a list of block_list_headers, and their 170 * associated data. It needs to read all of the block_lists in 171 * a transaction, or it fails. It returns NULL if there are 172 * no transactions, and on error. (Maybe that should change?) 173 */ 174static block_list_header * 175getJournalTransaction(JournalIOInfo_t *jinfo, swapper_t *swap) 176{ 177 block_list_header *retval = NULL; 178 uint8_t block[jinfo->bSize]; 179 block_list_header *hdr = (void*)█ 180 ssize_t nread; 181 ssize_t amt; 182 183 memset(block, 0, sizeof(block)); 184 nread = journalRead(jinfo, block, sizeof(block)); 185 if (nread == -1 || 186 (size_t)nread != sizeof(block)) { 187 if (debug) 188 plog("%s: wanted %zd, got %zd\n", __FUNCTION__, sizeof(block), nread); 189 return NULL; 190 } 191 if (swap->swap32(hdr->num_blocks) == 0) { 192 /* 193 * Either there really are no blocks, or this is not a valid 194 * transaction. Either way, there's nothing for us to do here. 195 */ 196#if DEBUG_JOURNAL 197 if (debug) 198 fplog(stderr, "%s(%d): hdr->num_blocks == 0\n", __FUNCTION__, __LINE__); 199#endif 200 return NULL; 201 } 202 /* 203 * Now we check the checksum to see if this is a valid header. 204 * Note that we verify the checksum before reading any more -- if 205 * it's not a valid header, we don't want to read more than a block 206 * size. 207 */ 208 uint32_t tmpChecksum = swap->swap32(hdr->checksum); 209 uint32_t compChecksum; 210 hdr->checksum = 0; 211 compChecksum = calc_checksum((void*)hdr, sizeof(*hdr)); 212 hdr->checksum = swap->swap32(tmpChecksum); 213 214 if (compChecksum != tmpChecksum) { 215 if (debug) 216 fplog(stderr, "%s(%d): hdr has bad checksum, returning NULL\n", __FUNCTION__, __LINE__); 217 return NULL; 218 } 219 220 if (swap->swap32(hdr->bytes_used) < sizeof(block)) { 221#if DEBUG_JOURNAL 222 if (debug) { 223 fplog(stderr, "%s(%d): hdr has bytes_used (%u) less than sizeof block (%zd)\n", 224 __FUNCTION__, __LINE__, swap->swap32(hdr->bytes_used), sizeof(block)); 225 } 226#endif 227 return NULL; 228 } 229 230 retval = malloc(swap->swap32(hdr->bytes_used)); 231 if (retval == NULL) 232 return NULL; 233 234 memset(retval, 0, swap->swap32(hdr->bytes_used)); 235 memcpy(retval, block, sizeof(block)); 236 amt = swap->swap32(hdr->bytes_used) - sizeof(block); 237 nread = journalRead(jinfo, ((uint8_t*)retval) + sizeof(block), amt); 238 if (nread != amt) { 239 free(retval); 240 return NULL; 241 } 242 243 return retval; 244} 245 246/* 247 * Replay a transaction. 248 * Transactions have a blockListSize amount of block_list_header, and 249 * are then followed by data. We read it in, verify the checksum, and 250 * if it's good, we call the block that was passed in to do something 251 * with it. Maybe write it out. Maybe laugh about it. 252 * 253 * It returns -1 if there was an error before it wrote anything out, 254 * and -2 if there was an error after it wrote something out. 255 * 256 * The arguments are: 257 * txn -- a block_list_header pointer, which has the description and data 258 * to be replayed. 259 * blSize -- the size of the block_list for this journal. (The data 260 * are after the block_list, but part of the same buffer.) 261 * blkSize -- The block size used to convert block numbers to offsets. This 262 * is defined to be the size of the journal header. 263 * swap -- A pointer to a swapper_t used to swap journal data structure elements. 264 * writer -- A block-of-code that does writing. 265 * 266 * "writer" should return -1 to stop the replay (this propagates an error up). 267 */ 268static int 269replayTransaction(block_list_header *txn, size_t blSize, size_t blkSize, swapper_t *swap, journal_write_block_t writer) 270{ 271 uint32_t i; 272 uint8_t *endPtr = ((uint8_t*)txn) + swap->swap32(txn->bytes_used); 273 uint8_t *dataPtr = ((uint8_t*)txn) + blSize; 274 int retval = -1; 275 for (i = 1; i < swap->swap32(txn->num_blocks); i++) { 276#if DEBUG_JOURNAL 277 if (debug) 278 plog("\tBlock %d: blkNum %llu, size %u, data offset = %zd\n", i, swap->swap64(txn->binfo[i].bnum), swap->swap32(txn->binfo[i].bsize), dataPtr - (uint8_t*)txn); 279#endif 280 /* 281 * XXX 282 * Check with security types on these checks. Need to ensure 283 * that the fields don't take us off into the dark scary woods. 284 * It's mostly the second one that I am unsure about. 285 */ 286 if (dataPtr > endPtr) { 287 if (debug) 288 plog("\tData out of range for block_list_header\n"); 289 return retval; 290 } 291 if ((endPtr - dataPtr) < swap->swap32(txn->binfo[i].bsize)) { 292 if (debug) 293 plog("\tData size for block %d out of range for block_list_header\n", i); 294 return retval; 295 } 296 if ((dataPtr + swap->swap32(txn->binfo[i].bsize)) > endPtr) { 297 if (debug) 298 plog("\tData end out of range for block_list_header\n"); 299 return retval; 300 } 301#if DEBUG_JOURNAL 302 // Just for debugging 303 if (debug) { 304 if (swap->swap64(txn->binfo[i].bnum) == 2) { 305 HFSPlusVolumeHeader *vp = (void*)dataPtr; 306 plog("vp->signature = %#x, version = %#x\n", vp->signature, vp->version); 307 } 308 } 309#endif 310 // It's in the spec, and I saw it come up once on a live volume. 311 if (swap->swap64(txn->binfo[i].bnum) == ~(uint64_t)0) { 312#if DEBUG_JOURNAL 313 if (debug) 314 plog("\tSkipping this block due to magic skip number\n"); 315#endif 316 } else { 317 // Should we set retval to -2 here? 318 if (writer) { 319 if ((writer)(swap->swap64(txn->binfo[i].bnum) * blkSize, dataPtr, swap->swap32(txn->binfo[i].bsize)) == -1) 320 return retval; 321 } 322 } 323 dataPtr += swap->swap32(txn->binfo[i].bsize); 324 retval = -2; 325 } 326 return 0; 327} 328 329/* 330 * Read a journal header in from the journal device. 331 */ 332static int 333loadJournalHeader(int jfd, off_t offset, size_t blockSize, journal_header *jhp) 334{ 335 uint8_t buffer[blockSize]; 336 ssize_t nread; 337 338 nread = pread(jfd, buffer, sizeof(buffer), offset); 339 if (nread == -1 || 340 (size_t)nread != sizeof(buffer)) { 341 warn("tried to read %zu for journal header buffer, got %zd", sizeof(buffer), nread); 342 return -1; 343 } 344 *jhp = *(journal_header*)buffer; 345 return 0; 346} 347 348/* 349 * Replay a journal (called "journal_open" because you have to 350 * to replay it as part of opening it). At this point, all it 351 * is useful for is replaying the journal. 352 * 353 * It is passed in: 354 * jfd -- file descriptor for the journal device 355 * offset -- offset (in bytes) of the journal on the journal device 356 * journal_size -- size of the jorunal (in bytes) 357 * min_fs_blksize -- Blocksize of the data filesystem 358 * flags -- unused for now 359 * jdev_name -- string name for the journal device. used for logging. 360 * do_write_b -- a block which does the actual writing. 361 * 362 * Currently, for fsck_hfs, the do_write_b block writes to the cache. It could also 363 * just print out the block numbers, or just check their integrity, as much as is 364 * possible. 365 * 366 * The function works by loading the journal header. From there, it then starts 367 * loading transactions, via block_list_header groups. When it gets to the end 368 * of the journal, it tries continuing, in case there were transactions that 369 * didn't get updated in the header (this apparently happens). 370 * 371 * It returns 0 on success, and -1 on error. Note that there's not a lot 372 * fsck_hfs can probably do in the event of error. 373 * 374 */ 375int 376journal_open(int jfd, 377 off_t offset, // Offset of journal 378 off_t journal_size, // Size, in bytes, of the entire journal 379 size_t min_fs_blksize, // Blocksize of the data filesystem, journal blocksize must be at least this size 380 uint32_t flags __unused, // Not used in this implementation 381 const char *jdev_name, // The name of the journal device, for logging 382 int (^do_write_b)(off_t, void*, size_t)) 383{ 384 journal_header jhdr = { 0 }; 385 swapper_t *jnlSwap; // Used to swap fields of the journal 386 uint32_t tempCksum; // Temporary checksum value 387 uint32_t jBlkSize = 0; 388 389 if (ioctl(jfd, DKIOCGETBLOCKSIZE, &jBlkSize) == -1) { 390 jBlkSize = min_fs_blksize; 391 } else { 392 if (jBlkSize < min_fs_blksize) { 393 fplog(stderr, "%s: journal block size %u < min block size %zu for %s\n", __FUNCTION__, jBlkSize, min_fs_blksize, jdev_name); 394 return -1; 395 } 396 if ((jBlkSize % min_fs_blksize) != 0) { 397 fplog(stderr, "%s: journal block size %u is not a multiple of fs block size %zu for %s\n", __FUNCTION__, jBlkSize, min_fs_blksize, jdev_name); 398 return -1; 399 } 400 } 401 if (loadJournalHeader(jfd, offset, jBlkSize, &jhdr) != 0) { 402 fplog(stderr, "%s: unable to load journal header from %s\n", __FUNCTION__, jdev_name); 403 return -1; 404 } 405 406 /* 407 * Unlike the rest of the filesystem, the journal can be in native or 408 * non-native byte order. Barring moving a filesystem from one host 409 * to another, it'll almost always be in native byte order. 410 */ 411 if (jhdr.endian == ENDIAN_MAGIC) { 412 jnlSwap = &nativeEndian; 413 } else if (OSSwapInt32(jhdr.endian) == ENDIAN_MAGIC) { 414 jnlSwap = &swappedEndian; 415 } else { 416 fplog(stderr, "%s: Unknown journal endian magic number %#x from %s\n", __FUNCTION__, jhdr.endian, jdev_name); 417 return -1; 418 } 419 /* 420 * Two different magic numbers are valid. 421 * Do they mean different thigs, though? 422 */ 423 if (jnlSwap->swap32(jhdr.magic) != JOURNAL_HEADER_MAGIC && 424 jnlSwap->swap32(jhdr.magic) != OLD_JOURNAL_HEADER_MAGIC) { 425 fplog(stderr, "%s: Unknown journal header magic number %#x from %s\n", __FUNCTION__, jhdr.magic, jdev_name); 426 return -1; 427 } 428 429 /* 430 * Checksums have to be done with the checksum field set to 0. 431 * So we have to stash it aside for a bit, and set the field to 432 * 0, before we can compare. Afterwards, if it compares correctly, 433 * we put the original (swapped, if necessary) value back, just 434 * in case. 435 */ 436 tempCksum = jnlSwap->swap32(jhdr.checksum); 437 jhdr.checksum = 0; 438 if (jnlSwap->swap32(jhdr.magic) == JOURNAL_HEADER_MAGIC && 439 (calc_checksum((void*)&jhdr, JOURNAL_HEADER_CKSUM_SIZE) != tempCksum)) { 440 fplog(stderr, "%s: Invalid journal checksum from %s\n", __FUNCTION__, jdev_name); 441 return -1; 442 } 443 jhdr.checksum = jnlSwap->swap32(tempCksum); 444 445 /* 446 * Set up information about the journal which we use to do the I/O. 447 * The journal is a circular buffer. However, the start of the journal 448 * buffer is past the journal header. See the JournalIOInfo structure above. 449 */ 450 off_t startOffset = jnlSwap->swap64(jhdr.start); 451 off_t endOffset =jnlSwap->swap64(jhdr.end); 452 off_t journalStart = offset + jnlSwap->swap32(jhdr.jhdr_size); 453 454 /* 455 * The journal code was updated to be able to read past the "end" of the journal, 456 * to see if there were any valid transactions there. If we are peeking past the 457 * end, we don't care if we have checksum errors -- that just means they're not 458 * valid transactions. 459 * 460 */ 461 int into_the_weeds = 0; 462 uint32_t last_sequence_number = 0; 463 464 JournalIOInfo_t jinfo = { 0 }; 465 466#if DEBUG_JOURNAL 467 if (debug) 468 plog("Journal start sequence number = %u\n", jnlSwap->swap32(jhdr.sequence_num)); 469#endif 470 471 /* 472 * Now set up the JournalIOInfo object with the file descriptor, 473 * the block size, start and end of the journal buffer, and where 474 * the journal pointer currently is. 475 */ 476 jinfo.jfd = jfd; 477 jinfo.bSize = jnlSwap->swap32(jhdr.jhdr_size); 478 jinfo.base = journalStart; 479 jinfo.size = journal_size - jinfo.bSize; 480 jinfo.end = offset + endOffset; 481 jinfo.current = offset + startOffset; 482 483 const char *state = ""; 484 int bad_journal = 0; 485 block_list_header *txn = NULL; 486 487 /* 488 * Loop while getting transactions. We exit when we hit a checksum 489 * error, or when the sequence number for a transaction doesn't match 490 * what we expect it to. (That's the trickiest part -- the into_the_weeds 491 * portion of the code. It doesn't match the TN11150 documentation, so 492 * I've had to go by both my experience with real-world journals and by 493 * looking at the kernel code.) 494 */ 495 while (1) { 496 int rv; 497 498 if (jinfo.current == jinfo.end && into_the_weeds == 0) { 499 /* 500 * This is a bit weird, but it works: if current == end, but gone_into_weeds is 1, 501 * then this code will not execute. If it does execute, it'll go to get a transaction. 502 * That will put the pointer past end. 503 */ 504 if (jhdr.sequence_num == 0) { 505 /* 506 * XXX 507 * I am not sure about this; this behaviour is not in TN1150 at all, 508 * but I _think_ this is what the kernel is doing. 509 */ 510 plog("Journal sequence number is 0, is going into the end okay?\n"); 511 } 512 into_the_weeds = 1; 513#if DEBUG_JOURNAL 514 if (debug) 515 plog("Attempting to read past stated end of journal\n"); 516#endif 517 state = "tentative "; 518 jinfo.end = (jinfo.base + startOffset - jinfo.bSize); 519 continue; 520 } 521#if DEBUG_JOURNAL 522 if (debug) 523 plog("Before getting %stransaction: jinfo.current = %llu\n", state, jinfo.current); 524#endif 525 /* 526 * Note that getJournalTransaction verifies the checksum on the block_list_header, so 527 * if it's bad, it'll return NULL. 528 */ 529 txn = getJournalTransaction(&jinfo, jnlSwap); 530 if (txn == NULL) { 531#if DEBUG_JOURNAL 532 if (debug) 533 plog("txn is NULL, jinfo.current = %llu\n", jinfo.current); 534#endif 535 if (into_the_weeds) { 536#if DEBUG_JOURNAL 537 if (debug) 538 plog("\tBut we do not care, since it is past the end of the journal\n"); 539#endif 540 } else { 541 bad_journal = 1; 542 } 543 break; 544 } 545#if DEBUG_JOURNAL 546 if (debug) { 547 plog("After getting %stransaction: jinfo.current = %llu\n", state, jinfo.current); 548 plog("%stxn = { %u max_blocks, %u num_blocks, %u bytes_used, binfo[0].next = %u }\n", state, jnlSwap->swap32(txn->max_blocks), jnlSwap->swap32(txn->num_blocks), jnlSwap->swap32(txn->bytes_used), jnlSwap->swap32(txn->binfo[0].next)); 549 } 550#endif 551 if (into_the_weeds) { 552 /* 553 * This seems to be what the kernel was checking: if the 554 * last_sequence_number was set, and the txn sequence number 555 * is set, and the txn sequence number doesn't match either 556 * last_sequence_number _or_ an incremented version of it, then 557 * the transaction isn't worth looking at, and we've reached 558 * the end of the journal. 559 */ 560 if (last_sequence_number != 0 && 561 txn->binfo[0].next != 0 && 562 jnlSwap->swap32(txn->binfo[0].next) != last_sequence_number && 563 jnlSwap->swap32(txn->binfo[0].next) != (last_sequence_number + 1)) { 564 // Probably not a valid transaction 565#if DEBUG_JOURNAL 566 if (debug) 567 plog("\tTentative txn sequence %u is not expected %u, stopping journal replay\n", jnlSwap->swap32(txn->binfo[0].next), last_sequence_number + 1); 568#endif 569 break; 570 } 571 } 572 /* 573 * If we've got a valid transaction, then we replay it. 574 * If there was an error, we're done with the journal replay. 575 * (If the error occurred after the "end," then we don't care, 576 * and it's not a bad journal.) 577 */ 578 rv = replayTransaction(txn, 579 jnlSwap->swap32(jhdr.blhdr_size), 580 jnlSwap->swap32(jhdr.jhdr_size), 581 jnlSwap, 582 do_write_b); 583 584 if (rv < 0) { 585 if (debug) 586 plog("\tTransaction replay failed, returned %d\n", rv); 587 if (into_the_weeds) { 588 if (debug) 589 plog("\t\tAnd we don't care\n"); 590 } else { 591 bad_journal = 1; 592 } 593 break; 594 } 595 last_sequence_number = jnlSwap->swap32(txn->binfo[0].next); 596 free(txn); 597 txn = NULL; 598 } 599 if (txn) 600 free(txn); 601 if (bad_journal) { 602 if (debug) 603 plog("Journal was bad, stopped replaying\n"); 604 return -1; 605 } 606 607 return 0; 608} 609