1/* 2 * linux/fs/jbd/journal.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 5 * 6 * Copyright 1998 Red Hat corp --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Generic filesystem journal-writing code; part of the ext2fs 13 * journaling system. 14 * 15 * This file manages journals: areas of disk reserved for logging 16 * transactional updates. This includes the kernel journaling thread 17 * which is responsible for scheduling updates to the log. 18 * 19 * We do not actually manage the physical storage of the journal in this 20 * file: that is left to a per-journal policy function, which allows us 21 * to store the journal within a filesystem-specified area for ext2 22 * journaling (ext2 can use a reserved inode for storing the log). 23 */ 24 25#include <linux/module.h> 26#include <linux/time.h> 27#include <linux/fs.h> 28#include <linux/jbd.h> 29#include <linux/errno.h> 30#include <linux/slab.h> 31#include <linux/init.h> 32#include <linux/mm.h> 33#include <linux/freezer.h> 34#include <linux/pagemap.h> 35#include <linux/kthread.h> 36#include <linux/poison.h> 37#include <linux/proc_fs.h> 38#include <linux/debugfs.h> 39 40#include <asm/uaccess.h> 41#include <asm/page.h> 42 43EXPORT_SYMBOL(journal_start); 44EXPORT_SYMBOL(journal_restart); 45EXPORT_SYMBOL(journal_extend); 46EXPORT_SYMBOL(journal_stop); 47EXPORT_SYMBOL(journal_lock_updates); 48EXPORT_SYMBOL(journal_unlock_updates); 49EXPORT_SYMBOL(journal_get_write_access); 50EXPORT_SYMBOL(journal_get_create_access); 51EXPORT_SYMBOL(journal_get_undo_access); 52EXPORT_SYMBOL(journal_dirty_data); 53EXPORT_SYMBOL(journal_dirty_metadata); 54EXPORT_SYMBOL(journal_release_buffer); 55EXPORT_SYMBOL(journal_forget); 56EXPORT_SYMBOL(journal_flush); 57EXPORT_SYMBOL(journal_revoke); 58 59EXPORT_SYMBOL(journal_init_dev); 
EXPORT_SYMBOL(journal_init_inode);
EXPORT_SYMBOL(journal_update_format);
EXPORT_SYMBOL(journal_check_used_features);
EXPORT_SYMBOL(journal_check_available_features);
EXPORT_SYMBOL(journal_set_features);
EXPORT_SYMBOL(journal_create);
EXPORT_SYMBOL(journal_load);
EXPORT_SYMBOL(journal_destroy);
EXPORT_SYMBOL(journal_abort);
EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
EXPORT_SYMBOL(journal_clear_err);
EXPORT_SYMBOL(log_wait_commit);
EXPORT_SYMBOL(log_start_commit);
EXPORT_SYMBOL(journal_start_commit);
EXPORT_SYMBOL(journal_force_commit_nested);
EXPORT_SYMBOL(journal_wipe);
EXPORT_SYMBOL(journal_blocks_per_page);
EXPORT_SYMBOL(journal_invalidatepage);
EXPORT_SYMBOL(journal_try_to_free_buffers);
EXPORT_SYMBOL(journal_force_commit);

static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);

/*
 * Helper function used to manage commit timeouts.
 *
 * Timer callback: __data is the task_struct of the kjournald thread
 * (packed into the timer in kjournald via setup_timer()).  All we do
 * here is wake that thread so it re-evaluates whether the running
 * transaction's commit interval has expired.
 */

static void commit_timeout(unsigned long __data)
{
	struct task_struct * p = (struct task_struct *) __data;

	wake_up_process(p);
}

/*
 * kjournald: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 */

static int kjournald(void *arg)
{
	journal_t *journal = arg;
	transaction_t *transaction;

	/*
	 * Set up an interval timer which can be used to trigger a commit wakeup
	 * after the commit interval expires
	 */
	setup_timer(&journal->j_commit_timer, commit_timeout,
			(unsigned long)current);

	/* Record that the journal thread is running */
	journal->j_task = current;
	wake_up(&journal->j_wait_done_commit);

	printk(KERN_INFO "kjournald starting.  Commit interval %ld seconds\n",
			journal->j_commit_interval / HZ);

	/*
	 * And now, wait forever for commit wakeup events.
	 *
	 * The loop below runs with j_state_lock held except around the
	 * actual commit, the freezer, and the schedule() sleep.
	 */
	spin_lock(&journal->j_state_lock);

loop:
	if (journal->j_flags & JFS_UNMOUNT)
		goto end_loop;

	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
		journal->j_commit_sequence, journal->j_commit_request);

	if (journal->j_commit_sequence != journal->j_commit_request) {
		jbd_debug(1, "OK, requests differ\n");
		/* Drop the lock and the pending timer while committing */
		spin_unlock(&journal->j_state_lock);
		del_timer_sync(&journal->j_commit_timer);
		journal_commit_transaction(journal);
		spin_lock(&journal->j_state_lock);
		goto loop;
	}

	wake_up(&journal->j_wait_done_commit);
	if (freezing(current)) {
		/*
		 * The simpler the better. Flushing journal isn't a
		 * good idea, because that depends on threads that may
		 * be already stopped.
		 */
		jbd_debug(1, "Now suspending kjournald\n");
		spin_unlock(&journal->j_state_lock);
		refrigerator();
		spin_lock(&journal->j_state_lock);
	} else {
		/*
		 * We assume on resume that commits are already there,
		 * so we don't sleep
		 */
		DEFINE_WAIT(wait);
		int should_sleep = 1;

		/*
		 * Re-check all wakeup conditions after queueing ourselves
		 * on j_wait_commit, to close the race against a wakeup
		 * arriving between the checks and schedule().
		 */
		prepare_to_wait(&journal->j_wait_commit, &wait,
				TASK_INTERRUPTIBLE);
		if (journal->j_commit_sequence != journal->j_commit_request)
			should_sleep = 0;
		transaction = journal->j_running_transaction;
		if (transaction && time_after_eq(jiffies,
						transaction->t_expires))
			should_sleep = 0;
		if (journal->j_flags & JFS_UNMOUNT)
			should_sleep = 0;
		if (should_sleep) {
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
		}
		finish_wait(&journal->j_wait_commit, &wait);
	}

	jbd_debug(1, "kjournald wakes\n");

	/*
	 * Were we woken up by a commit wakeup event?
	 */
	transaction = journal->j_running_transaction;
	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
		journal->j_commit_request = transaction->t_tid;
		jbd_debug(1, "woke because of timeout\n");
	}
	goto loop;

end_loop:
	spin_unlock(&journal->j_state_lock);
	del_timer_sync(&journal->j_commit_timer);
	/* Clearing j_task signals journal_kill_thread() that we are gone */
	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd_debug(1, "Journal thread exiting.\n");
	return 0;
}

/*
 * Launch kjournald for this journal and wait until the thread has
 * recorded itself in journal->j_task.  Returns 0 or -errno from
 * kthread_run().
 */
static int journal_start_thread(journal_t *journal)
{
	struct task_struct *t;

	t = kthread_run(kjournald, journal, "kjournald");
	if (IS_ERR(t))
		return PTR_ERR(t);

	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
	return 0;
}

/*
 * Ask kjournald to exit (by setting JFS_UNMOUNT) and wait until it has
 * cleared journal->j_task on its way out.
 */
static void journal_kill_thread(journal_t *journal)
{
	spin_lock(&journal->j_state_lock);
	journal->j_flags |= JFS_UNMOUNT;

	while (journal->j_task) {
		wake_up(&journal->j_wait_commit);
		spin_unlock(&journal->j_state_lock);
		wait_event(journal->j_wait_done_commit,
				journal->j_task == NULL);
		spin_lock(&journal->j_state_lock);
	}
	spin_unlock(&journal->j_state_lock);
}

/*
 * journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO.  If we end up using the existing buffer_head's data
 * for the write, then we *have* to lock the buffer to prevent anyone
 * else from using and possibly modifying it while the IO is in
 * progress.
 *
 * The function returns a pointer to the buffer_heads to be used for IO.
 *
 * We assume that the journal has already been locked in this function.
 *
 * Return value:
 *  <0: Error
 * >=0: Finished OK
 *
 * On success:
 *   Bit 0 set == escape performed on the data
 *   Bit 1 set == buffer copy-out performed (kfree the data after IO)
 */

int journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head *jh_in,
				  struct journal_head **jh_out,
				  unsigned int blocknr)
{
	int need_copy_out = 0;
	int done_copy_out = 0;
	int do_escape = 0;
	char *mapped_data;
	struct buffer_head *new_bh;
	struct journal_head *new_jh;
	struct page *new_page;
	unsigned int new_offset;
	struct buffer_head *bh_in = jh2bh(jh_in);
	journal_t *journal = transaction->t_journal;

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
	/* keep subsequent assertions sane */
	new_bh->b_state = 0;
	init_buffer(new_bh, NULL, NULL);
	atomic_set(&new_bh->b_count, 1);
	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */

	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	jbd_lock_bh_state(bh_in);
repeat:
	if (jh_in->b_frozen_data) {
		done_copy_out = 1;
		new_page = virt_to_page(jh_in->b_frozen_data);
		new_offset = offset_in_page(jh_in->b_frozen_data);
	} else {
		new_page = jh2bh(jh_in)->b_page;
		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
	}

	mapped_data = kmap_atomic(new_page, KM_USER0);
	/*
	 * Check for escaping: a block whose first word is the journal
	 * magic number must not be written to the log verbatim.
	 */
	if (*((__be32 *)(mapped_data + new_offset)) ==
				cpu_to_be32(JFS_MAGIC_NUMBER)) {
		need_copy_out = 1;
		do_escape = 1;
	}
	kunmap_atomic(mapped_data, KM_USER0);

	/*
	 * Do we need to do a data copy?
	 *
	 * We must drop the bh_state lock to allocate, so another thread
	 * may install b_frozen_data meanwhile; if so, free our buffer
	 * and restart from "repeat" to pick up the winner's copy.
	 */
	if (need_copy_out && !done_copy_out) {
		char *tmp;

		jbd_unlock_bh_state(bh_in);
		tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
		jbd_lock_bh_state(bh_in);
		if (jh_in->b_frozen_data) {
			jbd_free(tmp, bh_in->b_size);
			goto repeat;
		}

		jh_in->b_frozen_data = tmp;
		mapped_data = kmap_atomic(new_page, KM_USER0);
		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
		kunmap_atomic(mapped_data, KM_USER0);

		new_page = virt_to_page(tmp);
		new_offset = offset_in_page(tmp);
		done_copy_out = 1;
	}

	/*
	 * Did we need to do an escaping?  Now we've done all the
	 * copying, we can finally do so.
	 */
	if (do_escape) {
		mapped_data = kmap_atomic(new_page, KM_USER0);
		*((unsigned int *)(mapped_data + new_offset)) = 0;
		kunmap_atomic(mapped_data, KM_USER0);
	}

	set_bh_page(new_bh, new_page, new_offset);
	new_jh->b_transaction = NULL;
	new_bh->b_size = jh2bh(jh_in)->b_size;
	new_bh->b_bdev = transaction->t_journal->j_dev;
	new_bh->b_blocknr = blocknr;
	set_buffer_mapped(new_bh);
	set_buffer_dirty(new_bh);

	*jh_out = new_jh;

	/*
	 * The to-be-written buffer needs to get moved to the io queue,
	 * and the original buffer whose contents we are shadowing or
	 * copying is moved to the transaction's shadow queue.
	 */
	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
	spin_lock(&journal->j_list_lock);
	__journal_file_buffer(jh_in, transaction, BJ_Shadow);
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh_in);

	JBUFFER_TRACE(new_jh, "file as BJ_IO");
	journal_file_buffer(new_jh, transaction, BJ_IO);

	return do_escape | (done_copy_out << 1);
}

/*
 * Allocation code for the journal file.  Manage the space left in the
 * journal, so that we can begin checkpointing when appropriate.
 */

/*
 * __log_space_left: Return the number of free blocks left in the journal.
 *
 * Called with the journal already locked.
 *
 * Called under j_state_lock
 */

int __log_space_left(journal_t *journal)
{
	int left = journal->j_free;

	assert_spin_locked(&journal->j_state_lock);

	/*
	 * Be pessimistic here about the number of those free blocks which
	 * might be required for log descriptor control blocks.
	 */

#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */

	left -= MIN_LOG_RESERVED_BLOCKS;

	if (left <= 0)
		return 0;
	/* Knock a further 1/8 off as headroom for descriptor blocks */
	left -= (left >> 3);
	return left;
}

/*
 * Called under j_state_lock.  Returns true if a transaction commit was started.
 */
int __log_start_commit(journal_t *journal, tid_t target)
{
	/*
	 * Are we already doing a recent enough commit?
	 */
	if (!tid_geq(journal->j_commit_request, target)) {
		/*
		 * We want a new commit: OK, mark the request and wakeup the
		 * commit thread.  We do _not_ do the commit ourselves.
		 */

		journal->j_commit_request = target;
		jbd_debug(1, "JBD: requesting commit %d/%d\n",
			  journal->j_commit_request,
			  journal->j_commit_sequence);
		wake_up(&journal->j_wait_commit);
		return 1;
	}
	return 0;
}

/*
 * Locked wrapper for __log_start_commit(): request a commit up to @tid.
 * Returns true if a commit was newly requested.
 */
int log_start_commit(journal_t *journal, tid_t tid)
{
	int ret;

	spin_lock(&journal->j_state_lock);
	ret = __log_start_commit(journal, tid);
	spin_unlock(&journal->j_state_lock);
	return ret;
}

/*
 * Force and wait upon a commit if the calling process is not within
 * transaction.  This is used for forcing out undo-protected data which
 * contains bitmaps, when the fs is running out of space.
 *
 * We can only force the running transaction if we don't have an active handle;
 * otherwise, we will deadlock.
 *
 * Returns true if a transaction was started.
 */
int journal_force_commit_nested(journal_t *journal)
{
	transaction_t *transaction = NULL;
	tid_t tid;

	spin_lock(&journal->j_state_lock);
	if (journal->j_running_transaction && !current->journal_info) {
		transaction = journal->j_running_transaction;
		__log_start_commit(journal, transaction->t_tid);
	} else if (journal->j_committing_transaction)
		transaction = journal->j_committing_transaction;

	if (!transaction) {
		spin_unlock(&journal->j_state_lock);
		return 0;	/* Nothing to retry */
	}

	tid = transaction->t_tid;
	spin_unlock(&journal->j_state_lock);
	log_wait_commit(journal, tid);
	return 1;
}

/*
 * Start a commit of the current running transaction (if any).
 * Returns true
 * if a transaction is going to be committed (or is currently already
 * committing), and fills its tid in at *ptid
 */
int journal_start_commit(journal_t *journal, tid_t *ptid)
{
	int ret = 0;

	spin_lock(&journal->j_state_lock);
	if (journal->j_running_transaction) {
		tid_t tid = journal->j_running_transaction->t_tid;

		__log_start_commit(journal, tid);
		/* There's a running transaction and we've just made sure
		 * it's commit has been scheduled. */
		if (ptid)
			*ptid = tid;
		ret = 1;
	} else if (journal->j_committing_transaction) {
		/*
		 * If ext3_write_super() recently started a commit, then we
		 * have to wait for completion of that transaction
		 */
		if (ptid)
			*ptid = journal->j_committing_transaction->t_tid;
		ret = 1;
	}
	spin_unlock(&journal->j_state_lock);
	return ret;
}

/*
 * Wait for a specified commit to complete.
 * The caller may not hold the journal lock.
 */
int log_wait_commit(journal_t *journal, tid_t tid)
{
	int err = 0;

#ifdef CONFIG_JBD_DEBUG
	/* Sanity check: someone should already have requested this commit */
	spin_lock(&journal->j_state_lock);
	if (!tid_geq(journal->j_commit_request, tid)) {
		printk(KERN_EMERG
		       "%s: error: j_commit_request=%d, tid=%d\n",
		       __func__, journal->j_commit_request, tid);
	}
	spin_unlock(&journal->j_state_lock);
#endif
	spin_lock(&journal->j_state_lock);
	while (tid_gt(tid, journal->j_commit_sequence)) {
		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
			  tid, journal->j_commit_sequence);
		/* Prod kjournald, then sleep until it catches up to tid */
		wake_up(&journal->j_wait_commit);
		spin_unlock(&journal->j_state_lock);
		wait_event(journal->j_wait_done_commit,
				!tid_gt(tid, journal->j_commit_sequence));
		spin_lock(&journal->j_state_lock);
	}
	spin_unlock(&journal->j_state_lock);

	if (unlikely(is_journal_aborted(journal))) {
		printk(KERN_EMERG "journal commit I/O error\n");
		err = -EIO;
	}
	return err;
}

/*
 * Return 1 if a given transaction has not yet sent barrier request
 * connected with a transaction commit. If 0 is returned, transaction
 * may or may not have sent the barrier. Used to avoid sending barrier
 * twice in common cases.
 */
int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
	int ret = 0;
	transaction_t *commit_trans;

	if (!(journal->j_flags & JFS_BARRIER))
		return 0;
	spin_lock(&journal->j_state_lock);
	/* Transaction already committed? */
	if (tid_geq(journal->j_commit_sequence, tid))
		goto out;
	/*
	 * Transaction is being committed and we already proceeded to
	 * writing commit record?
	 */
	commit_trans = journal->j_committing_transaction;
	if (commit_trans && commit_trans->t_tid == tid &&
	    commit_trans->t_state >= T_COMMIT_RECORD)
		goto out;
	ret = 1;
out:
	spin_unlock(&journal->j_state_lock);
	return ret;
}
EXPORT_SYMBOL(journal_trans_will_send_data_barrier);

/*
 * Log buffer allocation routines:
 */

/*
 * Hand out the next free log block: advance j_head (wrapping back to
 * j_first at the end of the log area), decrement the free count, and
 * translate the logical block to a physical one via journal_bmap().
 */
int journal_next_log_block(journal_t *journal, unsigned int *retp)
{
	unsigned int blocknr;

	spin_lock(&journal->j_state_lock);
	J_ASSERT(journal->j_free > 1);

	blocknr = journal->j_head;
	journal->j_head++;
	journal->j_free--;
	if (journal->j_head == journal->j_last)
		journal->j_head = journal->j_first;
	spin_unlock(&journal->j_state_lock);
	return journal_bmap(journal, blocknr, retp);
}

/*
 * Conversion of logical to physical block numbers for the journal
 *
 * On external journals the journal blocks are identity-mapped, so
 * this is a no-op.  If needed, we can use j_blk_offset - everything is
 * ready.
 */
int journal_bmap(journal_t *journal, unsigned int blocknr,
		 unsigned int *retp)
{
	int err = 0;
	unsigned int ret;

	if (journal->j_inode) {
		/* Inode-backed journal: map through the filesystem.  A
		 * hole here means the journal file is corrupt - abort. */
		ret = bmap(journal->j_inode, blocknr);
		if (ret)
			*retp = ret;
		else {
			char b[BDEVNAME_SIZE];

			printk(KERN_ALERT "%s: journal block not found "
					"at offset %u on %s\n",
				__func__,
				blocknr,
				bdevname(journal->j_dev, b));
			err = -EIO;
			__journal_abort_soft(journal, err);
		}
	} else {
		*retp = blocknr; /* +journal->j_blk_offset */
	}
	return err;
}

/*
 * We play buffer_head aliasing tricks to write data/metadata blocks to
 * the journal without copying their contents, but for journal
 * descriptor blocks we do need to generate bona fide buffers.
 *
 * After the caller of journal_get_descriptor_buffer() has finished modifying
 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
 * But we don't bother doing that, so there will be coherency problems with
 * mmaps of blockdevs which hold live JBD-controlled filesystems.
 */
struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
{
	struct buffer_head *bh;
	unsigned int blocknr;
	int err;

	err = journal_next_log_block(journal, &blocknr);

	if (err)
		return NULL;

	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
	if (!bh)
		return NULL;
	lock_buffer(bh);
	memset(bh->b_data, 0, journal->j_blocksize);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	BUFFER_TRACE(bh, "return this buffer");
	return journal_add_journal_head(bh);
}

/*
 * Management for journal control blocks: functions to create and
 * destroy journal_t structures, and to initialise and read existing
 * journal blocks from disk.  */

/* First: create and setup a journal_t object in memory.
We initialise 688 * very few fields yet: that has to wait until we have created the 689 * journal structures from from scratch, or loaded them from disk. */ 690 691static journal_t * journal_init_common (void) 692{ 693 journal_t *journal; 694 int err; 695 696 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 697 if (!journal) 698 goto fail; 699 700 init_waitqueue_head(&journal->j_wait_transaction_locked); 701 init_waitqueue_head(&journal->j_wait_logspace); 702 init_waitqueue_head(&journal->j_wait_done_commit); 703 init_waitqueue_head(&journal->j_wait_checkpoint); 704 init_waitqueue_head(&journal->j_wait_commit); 705 init_waitqueue_head(&journal->j_wait_updates); 706 mutex_init(&journal->j_barrier); 707 mutex_init(&journal->j_checkpoint_mutex); 708 spin_lock_init(&journal->j_revoke_lock); 709 spin_lock_init(&journal->j_list_lock); 710 spin_lock_init(&journal->j_state_lock); 711 712 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); 713 714 /* The journal is marked for error until we succeed with recovery! */ 715 journal->j_flags = JFS_ABORT; 716 717 /* Set up a default-sized revoke table for the new mount. */ 718 err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 719 if (err) { 720 kfree(journal); 721 goto fail; 722 } 723 return journal; 724fail: 725 return NULL; 726} 727 728/* journal_init_dev and journal_init_inode: 729 * 730 * Create a journal structure assigned some fixed set of disk blocks to 731 * the journal. We don't actually touch those disk blocks yet, but we 732 * need to set up all of the mapping information to tell the journaling 733 * system where the journal blocks are. 734 * 735 */ 736 737/** 738 * journal_t * journal_init_dev() - creates and initialises a journal structure 739 * @bdev: Block device on which to create the journal 740 * @fs_dev: Device which hold journalled filesystem for this journal. 741 * @start: Block nr Start of journal. 742 * @len: Length of the journal in blocks. 
743 * @blocksize: blocksize of journalling device 744 * 745 * Returns: a newly created journal_t * 746 * 747 * journal_init_dev creates a journal which maps a fixed contiguous 748 * range of blocks on an arbitrary block device. 749 * 750 */ 751journal_t * journal_init_dev(struct block_device *bdev, 752 struct block_device *fs_dev, 753 int start, int len, int blocksize) 754{ 755 journal_t *journal = journal_init_common(); 756 struct buffer_head *bh; 757 int n; 758 759 if (!journal) 760 return NULL; 761 762 /* journal descriptor can store up to n blocks -bzzz */ 763 journal->j_blocksize = blocksize; 764 n = journal->j_blocksize / sizeof(journal_block_tag_t); 765 journal->j_wbufsize = n; 766 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 767 if (!journal->j_wbuf) { 768 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 769 __func__); 770 goto out_err; 771 } 772 journal->j_dev = bdev; 773 journal->j_fs_dev = fs_dev; 774 journal->j_blk_offset = start; 775 journal->j_maxlen = len; 776 777 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 778 if (!bh) { 779 printk(KERN_ERR 780 "%s: Cannot get buffer for journal superblock\n", 781 __func__); 782 goto out_err; 783 } 784 journal->j_sb_buffer = bh; 785 journal->j_superblock = (journal_superblock_t *)bh->b_data; 786 787 return journal; 788out_err: 789 kfree(journal->j_wbuf); 790 kfree(journal); 791 return NULL; 792} 793 794/** 795 * journal_t * journal_init_inode () - creates a journal which maps to a inode. 796 * @inode: An inode to create the journal in 797 * 798 * journal_init_inode creates a journal which maps an on-disk inode as 799 * the journal. The inode must exist already, must support bmap() and 800 * must have all data blocks preallocated. 
801 */ 802journal_t * journal_init_inode (struct inode *inode) 803{ 804 struct buffer_head *bh; 805 journal_t *journal = journal_init_common(); 806 int err; 807 int n; 808 unsigned int blocknr; 809 810 if (!journal) 811 return NULL; 812 813 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 814 journal->j_inode = inode; 815 jbd_debug(1, 816 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 817 journal, inode->i_sb->s_id, inode->i_ino, 818 (long long) inode->i_size, 819 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); 820 821 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; 822 journal->j_blocksize = inode->i_sb->s_blocksize; 823 824 /* journal descriptor can store up to n blocks -bzzz */ 825 n = journal->j_blocksize / sizeof(journal_block_tag_t); 826 journal->j_wbufsize = n; 827 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 828 if (!journal->j_wbuf) { 829 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 830 __func__); 831 goto out_err; 832 } 833 834 err = journal_bmap(journal, 0, &blocknr); 835 /* If that failed, give up */ 836 if (err) { 837 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 838 __func__); 839 goto out_err; 840 } 841 842 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 843 if (!bh) { 844 printk(KERN_ERR 845 "%s: Cannot get buffer for journal superblock\n", 846 __func__); 847 goto out_err; 848 } 849 journal->j_sb_buffer = bh; 850 journal->j_superblock = (journal_superblock_t *)bh->b_data; 851 852 return journal; 853out_err: 854 kfree(journal->j_wbuf); 855 kfree(journal); 856 return NULL; 857} 858 859/* 860 * If the journal init or create aborts, we need to mark the journal 861 * superblock as being NULL to prevent the journal destroy from writing 862 * back a bogus superblock. 
 */
static void journal_fail_superblock (journal_t *journal)
{
	struct buffer_head *bh = journal->j_sb_buffer;
	brelse(bh);
	journal->j_sb_buffer = NULL;
}

/*
 * Given a journal_t structure, initialise the various fields for
 * startup of a new journaling session.  We use this both when creating
 * a journal, and after recovering an old journal to reset it for
 * subsequent use.
 */

static int journal_reset(journal_t *journal)
{
	journal_superblock_t *sb = journal->j_superblock;
	unsigned int first, last;

	first = be32_to_cpu(sb->s_first);
	last = be32_to_cpu(sb->s_maxlen);
	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
		printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
		       first, last);
		journal_fail_superblock(journal);
		return -EINVAL;
	}

	journal->j_first = first;
	journal->j_last = last;

	/* Start with an empty log: head and tail both at the first block */
	journal->j_head = first;
	journal->j_tail = first;
	journal->j_free = last - first;

	journal->j_tail_sequence = journal->j_transaction_sequence;
	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
	journal->j_commit_request = journal->j_commit_sequence;

	journal->j_max_transaction_buffers = journal->j_maxlen / 4;

	/* Add the dynamic fields and write it to disk. */
	journal_update_superblock(journal, 1);
	return journal_start_thread(journal);
}

/**
 * int journal_create() - Initialise the new journal file
 * @journal: Journal to create. This structure must have been initialised
 *
 * Given a journal_t structure which tells us which disk blocks we can
 * use, create a new journal superblock and initialise all of the
 * journal fields from scratch.
 **/
int journal_create(journal_t *journal)
{
	unsigned int blocknr;
	struct buffer_head *bh;
	journal_superblock_t *sb;
	int i, err;

	if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
			journal->j_maxlen);
		journal_fail_superblock(journal);
		return -EINVAL;
	}

	if (journal->j_inode == NULL) {
		/*
		 * We don't know what block to start at!
		 */
		printk(KERN_EMERG
		       "%s: creation of journal on external device!\n",
		       __func__);
		BUG();
	}

	/* Zero out the entire journal on disk.  We cannot afford to
	   have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
	for (i = 0; i < journal->j_maxlen; i++) {
		err = journal_bmap(journal, i, &blocknr);
		if (err)
			return err;
		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
		lock_buffer(bh);
		memset (bh->b_data, 0, journal->j_blocksize);
		BUFFER_TRACE(bh, "marking dirty");
		mark_buffer_dirty(bh);
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);
		__brelse(bh);
	}

	sync_blockdev(journal->j_dev);
	jbd_debug(1, "JBD: journal cleared.\n");

	/* OK, fill in the initial static fields in the new superblock */
	sb = journal->j_superblock;

	sb->s_header.h_magic	 = cpu_to_be32(JFS_MAGIC_NUMBER);
	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);

	sb->s_blocksize	= cpu_to_be32(journal->j_blocksize);
	sb->s_maxlen	= cpu_to_be32(journal->j_maxlen);
	sb->s_first	= cpu_to_be32(1);

	journal->j_transaction_sequence = 1;

	/* Journal is usable from here on: clear the init-time abort flag */
	journal->j_flags &= ~JFS_ABORT;
	journal->j_format_version = 2;

	return journal_reset(journal);
}

/**
 * void journal_update_superblock() - Update journal sb on disk.
 * @journal: The journal to update.
 * @wait: Set to '0' if you don't want to wait for IO completion.
 *
 * Update a journal's dynamic superblock fields and write it to disk,
 * optionally waiting for the IO to complete.
 */
void journal_update_superblock(journal_t *journal, int wait)
{
	journal_superblock_t *sb = journal->j_superblock;
	struct buffer_head *bh = journal->j_sb_buffer;

	/*
	 * As a special case, if the on-disk copy is already marked as needing
	 * no recovery (s_start == 0) and there are no outstanding transactions
	 * in the filesystem, then we can safely defer the superblock update
	 * until the next commit by setting JFS_FLUSHED.  This avoids
	 * attempting a write to a potential-readonly device.
	 */
	if (sb->s_start == 0 && journal->j_tail_sequence ==
				journal->j_transaction_sequence) {
		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
			"(start %u, seq %d, errno %d)\n",
			journal->j_tail, journal->j_tail_sequence,
			journal->j_errno);
		goto out;
	}

	spin_lock(&journal->j_state_lock);
	jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);

	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
	sb->s_start    = cpu_to_be32(journal->j_tail);
	sb->s_errno    = cpu_to_be32(journal->j_errno);
	spin_unlock(&journal->j_state_lock);

	BUFFER_TRACE(bh, "marking dirty");
	mark_buffer_dirty(bh);
	if (wait)
		sync_dirty_buffer(bh);
	else
		write_dirty_buffer(bh, WRITE);

out:
	/* If we have just flushed the log (by marking s_start==0), then
	 * any future commit will have to be careful to update the
	 * superblock again to re-record the true start of the log. */

	spin_lock(&journal->j_state_lock);
	if (sb->s_start)
		journal->j_flags &= ~JFS_FLUSHED;
	else
		journal->j_flags |= JFS_FLUSHED;
	spin_unlock(&journal->j_state_lock);
}

/*
 * Read the superblock for a given journal, performing initial
 * validation of the format.
 */

static int journal_get_superblock(journal_t *journal)
{
	struct buffer_head *bh;
	journal_superblock_t *sb;
	int err = -EIO;

	bh = journal->j_sb_buffer;

	J_ASSERT(bh != NULL);
	if (!buffer_uptodate(bh)) {
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			printk (KERN_ERR
				"JBD: IO error reading journal superblock\n");
			goto out;
		}
	}

	sb = journal->j_superblock;

	err = -EINVAL;

	if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
		printk(KERN_WARNING "JBD: no valid journal superblock found\n");
		goto out;
	}

	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
	case JFS_SUPERBLOCK_V1:
		journal->j_format_version = 1;
		break;
	case JFS_SUPERBLOCK_V2:
		journal->j_format_version = 2;
		break;
	default:
		printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
		goto out;
	}

	/* Trust the smaller of the on-disk length and the caller's j_maxlen;
	 * an on-disk length larger than the device/file area is fatal. */
	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
		printk (KERN_WARNING "JBD: journal file too short\n");
		goto out;
	}

	return 0;

out:
	journal_fail_superblock(journal);
	return err;
}

/*
 * Load the on-disk journal superblock and read the key fields into the
 * journal_t.
 */

static int load_superblock(journal_t *journal)
{
	int err;
	journal_superblock_t *sb;

	err = journal_get_superblock(journal);
	if (err)
		return err;

	sb = journal->j_superblock;

	/* Copy the dynamic superblock fields (big-endian on disk) into the
	 * in-memory journal_t. */
	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
	journal->j_tail = be32_to_cpu(sb->s_start);
	journal->j_first = be32_to_cpu(sb->s_first);
	journal->j_last = be32_to_cpu(sb->s_maxlen);
	journal->j_errno = be32_to_cpu(sb->s_errno);

	return 0;
}


/**
 * int journal_load() - Read journal from disk.
 * @journal: Journal to act on.
 *
 * Given a journal_t structure which tells us which disk blocks contain
 * a journal, read the journal from disk to initialise the in-memory
 * structures.
 *
 * Returns 0 on success, -EINVAL for unrecognised feature flags, or
 * -EIO if recovery or reset fails.
 */
int journal_load(journal_t *journal)
{
	int err;
	journal_superblock_t *sb;

	err = load_superblock(journal);
	if (err)
		return err;

	sb = journal->j_superblock;
	/* If this is a V2 superblock, then we have to check the
	 * features flags on it. */

	if (journal->j_format_version >= 2) {
		/* Note: only ro-compat and incompat features are checked
		 * here; unknown compat features are harmless by definition. */
		if ((sb->s_feature_ro_compat &
		     ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
		    (sb->s_feature_incompat &
		     ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
			printk (KERN_WARNING
				"JBD: Unrecognised features on journal\n");
			return -EINVAL;
		}
	}

	/* Let the recovery code check whether it needs to recover any
	 * data from the journal. */
	if (journal_recover(journal))
		goto recovery_error;

	/* OK, we've finished with the dynamic journal bits:
	 * reinitialise the dynamic contents of the superblock in memory
	 * and reset them on disk. */
	if (journal_reset(journal))
		goto recovery_error;

	journal->j_flags &= ~JFS_ABORT;
	journal->j_flags |= JFS_LOADED;
	return 0;

recovery_error:
	printk (KERN_WARNING "JBD: recovery failed\n");
	return -EIO;
}

/**
 * int journal_destroy() - Release a journal_t structure.
 * @journal: Journal to act on.
 *
 * Release a journal_t structure once it is no longer in use by the
 * journaled object.  Commits any running transaction, checkpoints all
 * outstanding ones, marks the journal empty on disk (unless aborted)
 * and frees all in-memory state.
 * Return <0 if we couldn't clean up the journal.
 */
int journal_destroy(journal_t *journal)
{
	int err = 0;


	/* Wait for the commit thread to wake up and die. */
	journal_kill_thread(journal);

	/* Force a final log commit */
	if (journal->j_running_transaction)
		journal_commit_transaction(journal);

	/* Force any old transactions to disk */

	/* Totally anal locking here... */
	spin_lock(&journal->j_list_lock);
	/* log_do_checkpoint() must run unlocked; re-take the lock each
	 * iteration to re-test the checkpoint list safely. */
	while (journal->j_checkpoint_transactions != NULL) {
		spin_unlock(&journal->j_list_lock);
		log_do_checkpoint(journal);
		spin_lock(&journal->j_list_lock);
	}

	J_ASSERT(journal->j_running_transaction == NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);
	J_ASSERT(journal->j_checkpoint_transactions == NULL);
	spin_unlock(&journal->j_list_lock);

	if (journal->j_sb_buffer) {
		if (!is_journal_aborted(journal)) {
			/* We can now mark the journal as empty:
			 * j_tail == 0 is the "no recovery needed" magic. */
			journal->j_tail = 0;
			journal->j_tail_sequence =
				++journal->j_transaction_sequence;
			journal_update_superblock(journal, 1);
		} else {
			/* Aborted journal: leave the sb alone so recovery
			 * information survives; report failure to caller. */
			err = -EIO;
		}
		brelse(journal->j_sb_buffer);
	}

	if (journal->j_inode)
		iput(journal->j_inode);
	if (journal->j_revoke)
		journal_destroy_revoke(journal);
	kfree(journal->j_wbuf);
	kfree(journal);

	return err;
}


/**
 * int journal_check_used_features () - Check if features specified are used.
 * @journal: Journal to check.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Check whether the journal uses all of a given set of
 * features.  Return true (non-zero) if it does.
 **/

int journal_check_used_features (journal_t *journal, unsigned long compat,
				 unsigned long ro, unsigned long incompat)
{
	journal_superblock_t *sb;

	if (!compat && !ro && !incompat)
		return 1;
	/* V1 superblocks have no feature fields at all. */
	if (journal->j_format_version == 1)
		return 0;

	sb = journal->j_superblock;

	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
		return 1;

	return 0;
}

/**
 * int journal_check_available_features() - Check feature set in journalling layer
 * @journal: Journal to check.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Check whether the journaling code supports the use of
 * all of a given set of features on this journal.  Return true
 * (non-zero) if it can.
 */

int journal_check_available_features (journal_t *journal, unsigned long compat,
				      unsigned long ro, unsigned long incompat)
{
	if (!compat && !ro && !incompat)
		return 1;

	/* We can support any known requested features iff the
	 * superblock is in version 2.  Otherwise we fail to support any
	 * extended sb features. */

	if (journal->j_format_version != 2)
		return 0;

	if ((compat   & JFS_KNOWN_COMPAT_FEATURES) == compat &&
	    (ro       & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
	    (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
		return 1;

	return 0;
}

/**
 * int journal_set_features () - Mark a given journal feature in the superblock
 * @journal: Journal to act on.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Mark a given journal feature as present on the
 * superblock.  Returns true if the requested features could be set.
 *
 * Note: this only updates the in-memory superblock copy; the caller is
 * responsible for getting it written out (e.g. via a later
 * journal_update_superblock()).
 */

int journal_set_features (journal_t *journal, unsigned long compat,
			  unsigned long ro, unsigned long incompat)
{
	journal_superblock_t *sb;

	/* Already set?  Nothing to do. */
	if (journal_check_used_features(journal, compat, ro, incompat))
		return 1;

	if (!journal_check_available_features(journal, compat, ro, incompat))
		return 0;

	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
		  compat, ro, incompat);

	sb = journal->j_superblock;

	sb->s_feature_compat    |= cpu_to_be32(compat);
	sb->s_feature_ro_compat |= cpu_to_be32(ro);
	sb->s_feature_incompat  |= cpu_to_be32(incompat);

	return 1;
}


/**
 * int journal_update_format () - Update on-disk journal structure.
 * @journal: Journal to act on.
 *
 * Given an initialised but unloaded journal struct, poke about in the
 * on-disk structure to update it to the most recent supported version.
 */
int journal_update_format (journal_t *journal)
{
	journal_superblock_t *sb;
	int err;

	err = journal_get_superblock(journal);
	if (err)
		return err;

	sb = journal->j_superblock;

	switch (be32_to_cpu(sb->s_header.h_blocktype)) {
	case JFS_SUPERBLOCK_V2:
		/* Already current format: nothing to do. */
		return 0;
	case JFS_SUPERBLOCK_V1:
		return journal_convert_superblock_v1(journal, sb);
	default:
		break;
	}
	return -EINVAL;
}

/*
 * Convert a V1 superblock to V2 in place and write it out synchronously.
 * Always returns 0.
 */
static int journal_convert_superblock_v1(journal_t *journal,
					 journal_superblock_t *sb)
{
	int offset, blocksize;
	struct buffer_head *bh;

	printk(KERN_WARNING
		"JBD: Converting superblock from version 1 to 2.\n");

	/* Pre-initialise new fields to zero: everything from
	 * s_feature_compat to the end of the superblock block. */
	offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
	blocksize = be32_to_cpu(sb->s_blocksize);
	memset(&sb->s_feature_compat, 0, blocksize-offset);

	sb->s_nr_users = cpu_to_be32(1);
	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
	journal->j_format_version = 2;

	bh = journal->j_sb_buffer;
	BUFFER_TRACE(bh, "marking dirty");
	mark_buffer_dirty(bh);
	sync_dirty_buffer(bh);
	return 0;
}


/**
 * int journal_flush () - Flush journal
 * @journal: Journal to act on.
 *
 * Flush all data for a given journal to disk and empty the journal.
 * Filesystems can use this when remounting readonly to ensure that
 * recovery does not need to happen on remount.
 *
 * Returns 0 on success, -EIO if the journal has been aborted.
 */

int journal_flush(journal_t *journal)
{
	int err = 0;
	transaction_t *transaction = NULL;
	unsigned int old_tail;

	spin_lock(&journal->j_state_lock);

	/* Force everything buffered to the log... */
	if (journal->j_running_transaction) {
		transaction = journal->j_running_transaction;
		__log_start_commit(journal, transaction->t_tid);
	} else if (journal->j_committing_transaction)
		transaction = journal->j_committing_transaction;

	/* Wait for the log commit to complete... */
	if (transaction) {
		/* Grab the tid before dropping the lock: the transaction
		 * may be freed once the commit finishes. */
		tid_t tid = transaction->t_tid;

		spin_unlock(&journal->j_state_lock);
		log_wait_commit(journal, tid);
	} else {
		spin_unlock(&journal->j_state_lock);
	}

	/* ...and flush everything in the log out to disk. */
	spin_lock(&journal->j_list_lock);
	while (!err && journal->j_checkpoint_transactions != NULL) {
		spin_unlock(&journal->j_list_lock);
		mutex_lock(&journal->j_checkpoint_mutex);
		err = log_do_checkpoint(journal);
		mutex_unlock(&journal->j_checkpoint_mutex);
		spin_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (is_journal_aborted(journal))
		return -EIO;

	cleanup_journal_tail(journal);

	/* Finally, mark the journal as really needing no recovery.
	 * This sets s_start==0 in the underlying superblock, which is
	 * the magic code for a fully-recovered superblock.  Any future
	 * commits of data to the journal will restore the current
	 * s_start value. */
	spin_lock(&journal->j_state_lock);
	old_tail = journal->j_tail;
	journal->j_tail = 0;
	spin_unlock(&journal->j_state_lock);
	journal_update_superblock(journal, 1);
	spin_lock(&journal->j_state_lock);
	journal->j_tail = old_tail;

	J_ASSERT(!journal->j_running_transaction);
	J_ASSERT(!journal->j_committing_transaction);
	J_ASSERT(!journal->j_checkpoint_transactions);
	J_ASSERT(journal->j_head == journal->j_tail);
	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
	spin_unlock(&journal->j_state_lock);
	return 0;
}

/**
 * int journal_wipe() - Wipe journal contents
 * @journal: Journal to act on.
 * @write: flag (see below)
 *
 * Wipe out all of the contents of a journal, safely.  This will produce
 * a warning if the journal contains any valid recovery information.
 * Must be called between journal_init_*() and journal_load().
 *
 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
 * we merely suppress recovery.
 */

int journal_wipe(journal_t *journal, int write)
{
	int err = 0;

	J_ASSERT (!(journal->j_flags & JFS_LOADED));

	err = load_superblock(journal);
	if (err)
		return err;

	/* j_tail == 0 means the on-disk sb already says "no recovery
	 * needed" — nothing to wipe. */
	if (!journal->j_tail)
		goto no_recovery;

	printk (KERN_WARNING "JBD: %s recovery information on journal\n",
		write ? "Clearing" : "Ignoring");

	/* NOTE(review): the superblock is written even if
	 * journal_skip_recovery() failed; confirm that is intended. */
	err = journal_skip_recovery(journal);
	if (write)
		journal_update_superblock(journal, 1);

 no_recovery:
	return err;
}

/*
 * journal_dev_name: format a character string to describe on what
 * device this journal is present.
 */

static const char *journal_dev_name(journal_t *journal, char *buffer)
{
	struct block_device *bdev;

	/* Inode-backed journals live on the filesystem's own device;
	 * otherwise use the dedicated journal device. */
	if (journal->j_inode)
		bdev = journal->j_inode->i_sb->s_bdev;
	else
		bdev = journal->j_dev;

	return bdevname(bdev, buffer);
}

/*
 * Journal abort has very specific semantics, which we describe
 * for journal abort.
 *
 * Two internal functions, which provide abort to the jbd layer
 * itself are here.
 */

/*
 * Quick version for internal journal use (doesn't lock the journal).
 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
 * and don't attempt to make any other journal updates.
 */
static void __journal_abort_hard(journal_t *journal)
{
	transaction_t *transaction;
	char b[BDEVNAME_SIZE];

	if (journal->j_flags & JFS_ABORT)
		return;

	printk(KERN_ERR "Aborting journal on device %s.\n",
		journal_dev_name(journal, b));

	spin_lock(&journal->j_state_lock);
	journal->j_flags |= JFS_ABORT;
	transaction = journal->j_running_transaction;
	/* Kick the running transaction into commit so waiters see the
	 * abort promptly. */
	if (transaction)
		__log_start_commit(journal, transaction->t_tid);
	spin_unlock(&journal->j_state_lock);
}

/* Soft abort: record the abort error status in the journal superblock,
 * but don't do any other IO.
 *
 * Note the ordering: j_errno must be set before JFS_ABORT is raised by
 * __journal_abort_hard(), so readers that see the flag also see the
 * errno.  A zero errno means "record nothing on disk". */
static void __journal_abort_soft (journal_t *journal, int errno)
{
	if (journal->j_flags & JFS_ABORT)
		return;

	if (!journal->j_errno)
		journal->j_errno = errno;

	__journal_abort_hard(journal);

	if (errno)
		journal_update_superblock(journal, 1);
}

/**
 * void journal_abort () - Shutdown the journal immediately.
 * @journal: the journal to shutdown.
 * @errno:   an error number to record in the journal indicating
 *           the reason for the shutdown.
 *
 * Perform a complete, immediate shutdown of the ENTIRE
 * journal (not of a single transaction).  This operation cannot be
 * undone without closing and reopening the journal.
 *
 * The journal_abort function is intended to support higher level error
 * recovery mechanisms such as the ext2/ext3 remount-readonly error
 * mode.
 *
 * Journal abort has very specific semantics.  Any existing dirty,
 * unjournaled buffers in the main filesystem will still be written to
 * disk by bdflush, but the journaling mechanism will be suspended
 * immediately and no further transaction commits will be honoured.
 *
 * Any dirty, journaled buffers will be written back to disk without
 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
 * filesystem, but we _do_ attempt to leave as much data as possible
 * behind for fsck to use for cleanup.
 *
 * Any attempt to get a new transaction handle on a journal which is in
 * ABORT state will just result in an -EROFS error return.  A
 * journal_stop on an existing handle will return -EIO if we have
 * entered abort state during the update.
 *
 * Recursive transactions are not disturbed by journal abort until the
 * final journal_stop, which will receive the -EIO error.
 *
 * Finally, the journal_abort call allows the caller to supply an errno
 * which will be recorded (if possible) in the journal superblock.  This
 * allows a client to record failure conditions in the middle of a
 * transaction without having to complete the transaction to record the
 * failure to disk.  ext3_error, for example, now uses this
 * functionality.
 *
 * Errors which originate from within the journaling layer will NOT
 * supply an errno; a null errno implies that absolutely no further
 * writes are done to the journal (unless there are any already in
 * progress).
 *
 */

void journal_abort(journal_t *journal, int errno)
{
	__journal_abort_soft(journal, errno);
}

/**
 * int journal_errno () - returns the journal's error state.
 * @journal: journal to examine.
 *
 * This is the errno number set with journal_abort(), the last
 * time the journal was mounted - if the journal was stopped
 * without calling abort this will be 0.
 *
 * If the journal has been aborted on this mount time -EROFS will
 * be returned.
 */
int journal_errno(journal_t *journal)
{
	int err;

	spin_lock(&journal->j_state_lock);
	if (journal->j_flags & JFS_ABORT)
		err = -EROFS;
	else
		err = journal->j_errno;
	spin_unlock(&journal->j_state_lock);
	return err;
}

/**
 * int journal_clear_err () - clears the journal's error state
 * @journal: journal to act on.
 *
 * An error must be cleared or Acked to take a FS out of readonly
 * mode.  Fails with -EROFS if the journal itself has been aborted.
 */
int journal_clear_err(journal_t *journal)
{
	int err = 0;

	spin_lock(&journal->j_state_lock);
	if (journal->j_flags & JFS_ABORT)
		err = -EROFS;
	else
		journal->j_errno = 0;
	spin_unlock(&journal->j_state_lock);
	return err;
}

/**
 * void journal_ack_err() - Ack journal err.
 * @journal: journal to act on.
 *
 * An error must be cleared or Acked to take a FS out of readonly
 * mode.
 */
void journal_ack_err(journal_t *journal)
{
	spin_lock(&journal->j_state_lock);
	/* Only ack if there is actually an error recorded. */
	if (journal->j_errno)
		journal->j_flags |= JFS_ACK_ERR;
	spin_unlock(&journal->j_state_lock);
}

/* Number of filesystem blocks that fit in one page. */
int journal_blocks_per_page(struct inode *inode)
{
	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
}

/*
 * Journal_head storage management
 */
static struct kmem_cache *journal_head_cache;
#ifdef CONFIG_JBD_DEBUG
/* Debug-only counter of live journal_heads, checked at module exit. */
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif

/* Create the slab cache for journal_head objects.  Returns 0 or -ENOMEM. */
static int journal_init_journal_head_cache(void)
{
	int retval;

	J_ASSERT(journal_head_cache == NULL);
	journal_head_cache = kmem_cache_create("journal_head",
				sizeof(struct journal_head),
				0,		/* offset */
				SLAB_TEMPORARY,	/* flags */
				NULL);		/* ctor */
	retval = 0;
	if (!journal_head_cache) {
		retval = -ENOMEM;
		printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
	}
	return retval;
}

static void journal_destroy_journal_head_cache(void)
{
	if (journal_head_cache) {
		kmem_cache_destroy(journal_head_cache);
		journal_head_cache = NULL;
	}
}

/*
 * journal_head splicing and dicing
 */

/*
 * Allocate a journal_head.  Never fails: on allocation failure it
 * retries forever (yielding between attempts), warning at most once
 * every five seconds.
 */
static struct journal_head *journal_alloc_journal_head(void)
{
	struct journal_head *ret;
	static unsigned long last_warning;

#ifdef CONFIG_JBD_DEBUG
	atomic_inc(&nr_journal_heads);
#endif
	ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
	if (ret == NULL) {
		jbd_debug(1, "out of memory for journal_head\n");
		if (time_after(jiffies, last_warning + 5*HZ)) {
			printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
			       __func__);
			last_warning = jiffies;
		}
		while (ret == NULL) {
			yield();
			ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
		}
	}
	return ret;
}

static void journal_free_journal_head(struct journal_head *jh)
{
#ifdef CONFIG_JBD_DEBUG
	atomic_dec(&nr_journal_heads);
	/* Poison freed journal_heads so use-after-free is detectable. */
	memset(jh, JBD_POISON_FREE, sizeof(*jh));
#endif
	kmem_cache_free(journal_head_cache, jh);
}


/*
 * Give a buffer_head a journal_head.
 *
 * Doesn't need the journal lock.
 * May sleep.
 *
 * Returns the buffer's journal_head with its b_jcount elevated.  The
 * speculative allocate-outside-the-lock / recheck-inside pattern below
 * exists because journal_alloc_journal_head() may sleep, which is not
 * allowed while holding the bh journal_head lock.
 */
struct journal_head *journal_add_journal_head(struct buffer_head *bh)
{
	struct journal_head *jh;
	struct journal_head *new_jh = NULL;

repeat:
	if (!buffer_jbd(bh)) {
		new_jh = journal_alloc_journal_head();
		memset(new_jh, 0, sizeof(*new_jh));
	}

	jbd_lock_bh_journal_head(bh);
	if (buffer_jbd(bh)) {
		/* Someone else attached a journal_head while we slept. */
		jh = bh2jh(bh);
	} else {
		J_ASSERT_BH(bh,
			(atomic_read(&bh->b_count) > 0) ||
			(bh->b_page && bh->b_page->mapping));

		/* Raced: the JBD bit was set when we allocated but is
		 * clear now — go back and allocate a journal_head. */
		if (!new_jh) {
			jbd_unlock_bh_journal_head(bh);
			goto repeat;
		}

		jh = new_jh;
		new_jh = NULL;		/* We consumed it */
		set_buffer_jbd(bh);
		bh->b_private = jh;
		jh->b_bh = bh;
		/* The journal_head holds a reference on the buffer_head. */
		get_bh(bh);
		BUFFER_TRACE(bh, "added journal_head");
	}
	jh->b_jcount++;
	jbd_unlock_bh_journal_head(bh);
	/* Drop the speculative allocation if it went unused. */
	if (new_jh)
		journal_free_journal_head(new_jh);
	return bh->b_private;
}

/*
 * Grab a ref against this buffer_head's journal_head.  If it ended up not
 * having a journal_head, return NULL
 */
struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
{
	struct journal_head *jh = NULL;

	jbd_lock_bh_journal_head(bh);
	if (buffer_jbd(bh)) {
		jh = bh2jh(bh);
		jh->b_jcount++;
	}
	jbd_unlock_bh_journal_head(bh);
	return jh;
}

/*
 * Detach and free bh's journal_head if it is unreferenced and not
 * attached to any transaction.  Called with the bh journal_head lock
 * held.  Takes an extra b_count reference on behalf of the caller
 * (see journal_remove_journal_head() below).
 */
static void __journal_remove_journal_head(struct buffer_head *bh)
{
	struct journal_head *jh = bh2jh(bh);

	J_ASSERT_JH(jh, jh->b_jcount >= 0);

	get_bh(bh);
	if (jh->b_jcount == 0) {
		if (jh->b_transaction == NULL &&
				jh->b_next_transaction == NULL &&
				jh->b_cp_transaction == NULL) {
			J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
			J_ASSERT_BH(bh, buffer_jbd(bh));
			J_ASSERT_BH(bh, jh2bh(jh) == bh);
			BUFFER_TRACE(bh, "remove journal_head");
			/* Leaked shadow copies should have been freed by
			 * commit; warn and reclaim them here. */
			if (jh->b_frozen_data) {
				printk(KERN_WARNING "%s: freeing "
						"b_frozen_data\n",
						__func__);
				jbd_free(jh->b_frozen_data, bh->b_size);
			}
			if (jh->b_committed_data) {
				printk(KERN_WARNING "%s: freeing "
						"b_committed_data\n",
						__func__);
				jbd_free(jh->b_committed_data, bh->b_size);
			}
			bh->b_private = NULL;
			jh->b_bh = NULL;	/* debug, really */
			clear_buffer_jbd(bh);
			/* Drop the reference the journal_head held on bh. */
			__brelse(bh);
			journal_free_journal_head(jh);
		} else {
			BUFFER_TRACE(bh, "journal_head was locked");
		}
	}
}

/*
 * journal_remove_journal_head(): if the buffer isn't attached to a transaction
 * and has a zero b_jcount then remove and release its journal_head.   If we did
 * see that the buffer is not used by any transaction we also "logically"
 * decrement ->b_count.
 *
 * We in fact take an additional increment on ->b_count as a convenience,
 * because the caller usually wants to do additional things with the bh
 * after calling here.
 * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
 * time.  Once the caller has run __brelse(), the buffer is eligible for
 * reaping by try_to_free_buffers().
 */
void journal_remove_journal_head(struct buffer_head *bh)
{
	jbd_lock_bh_journal_head(bh);
	__journal_remove_journal_head(bh);
	jbd_unlock_bh_journal_head(bh);
}

/*
 * Drop a reference on the passed journal_head.  If it fell to zero then try to
 * release the journal_head from the buffer_head.
 */
void journal_put_journal_head(struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);

	jbd_lock_bh_journal_head(bh);
	J_ASSERT_JH(jh, jh->b_jcount > 0);
	--jh->b_jcount;
	if (!jh->b_jcount && !jh->b_transaction) {
		__journal_remove_journal_head(bh);
		/* __journal_remove_journal_head() took a b_count reference
		 * on our behalf; drop it here since we don't need it. */
		__brelse(bh);
	}
	jbd_unlock_bh_journal_head(bh);
}

/*
 * debugfs tunables
 */
#ifdef CONFIG_JBD_DEBUG

/* Global debug verbosity level, exposed at /sys/kernel/debug/jbd/jbd-debug. */
u8 journal_enable_debug __read_mostly;
EXPORT_SYMBOL(journal_enable_debug);

static struct dentry *jbd_debugfs_dir;
static struct dentry *jbd_debug;

static void __init jbd_create_debugfs_entry(void)
{
	jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
	if (jbd_debugfs_dir)
		jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
					      jbd_debugfs_dir,
					      &journal_enable_debug);
}

static void __exit jbd_remove_debugfs_entry(void)
{
	debugfs_remove(jbd_debug);
	debugfs_remove(jbd_debugfs_dir);
}

#else

static inline void jbd_create_debugfs_entry(void)
{
}

static inline void jbd_remove_debugfs_entry(void)
{
}

#endif

/* Slab cache for handle_t objects; shared with the transaction code. */
struct kmem_cache *jbd_handle_cache;

static int __init journal_init_handle_cache(void)
{
	jbd_handle_cache = kmem_cache_create("journal_handle",
				sizeof(handle_t),
				0,		/* offset */
				SLAB_TEMPORARY,	/* flags */
				NULL);		/* ctor */
	if (jbd_handle_cache == NULL) {
		printk(KERN_EMERG "JBD: failed to create handle cache\n");
		return -ENOMEM;
	}
	return 0;
}

static void journal_destroy_handle_cache(void)
{
	if (jbd_handle_cache)
		kmem_cache_destroy(jbd_handle_cache);
}

/*
 * Module startup and shutdown
 */

/* Create all slab caches used by jbd; stops at the first failure. */
static int __init journal_init_caches(void)
{
	int ret;

	ret = journal_init_revoke_caches();
	if (ret == 0)
		ret = journal_init_journal_head_cache();
	if (ret == 0)
		ret = journal_init_handle_cache();
	return ret;
}

/* Tear down all slab caches; each helper tolerates a NULL cache. */
static void journal_destroy_caches(void)
{
	journal_destroy_revoke_caches();
	journal_destroy_journal_head_cache();
	journal_destroy_handle_cache();
}

static int __init journal_init(void)
{
	int ret;

	/* The on-disk superblock layout is fixed at 1024 bytes. */
	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);

	ret = journal_init_caches();
	if (ret != 0)
		journal_destroy_caches();
	/* NOTE(review): the debugfs entry is created even when cache init
	 * failed and an error is returned — confirm this is intended. */
	jbd_create_debugfs_entry();
	return ret;
}

static void __exit journal_exit(void)
{
#ifdef CONFIG_JBD_DEBUG
	int n = atomic_read(&nr_journal_heads);
	if (n)
		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
#endif
	jbd_remove_debugfs_entry();
	journal_destroy_caches();
}

MODULE_LICENSE("GPL");
module_init(journal_init);
module_exit(journal_exit);