/*
 * linux/fs/checkpoint.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
 *
 * Copyright 1999 Red Hat Software --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Checkpoint routines for the generic filesystem journaling code.
 * Part of the ext2fs journaling system.
 *
 * Checkpointing is the process of ensuring that a section of the log is
 * committed fully to disk, so that that portion of the log can be
 * reused.
 */

#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/locks.h>

/* Protects all checkpoint-list linkage below; defined elsewhere. */
extern spinlock_t journal_datalist_lock;

/*
 * Unlink a buffer from a transaction's checkpoint list.
 *
 * Called with journal_datalist_lock held.
 */
static inline void __buffer_unlink(struct journal_head *jh)
{
	transaction_t *transaction;

	transaction = jh->b_cp_transaction;
	jh->b_cp_transaction = NULL;

	/* Splice jh out of the circular doubly-linked list... */
	jh->b_cpnext->b_cpprev = jh->b_cpprev;
	jh->b_cpprev->b_cpnext = jh->b_cpnext;
	if (transaction->t_checkpoint_list == jh)
		transaction->t_checkpoint_list = jh->b_cpnext;
	/* ...and if jh was the sole element, the list is now empty. */
	if (transaction->t_checkpoint_list == jh)
		transaction->t_checkpoint_list = NULL;
}

/*
 * Try to release a checkpointed buffer from its transaction.
 * Returns 1 if we released it.
 * Requires journal_datalist_lock
 */
static int __try_to_free_cp_buf(struct journal_head *jh)
{
	int ret = 0;
	struct buffer_head *bh = jh2bh(jh);

	/* Only a clean, unlocked buffer on no journal list can go. */
	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
		JBUFFER_TRACE(jh, "remove from checkpoint list");
		__journal_remove_checkpoint(jh);
		__journal_remove_journal_head(bh);
		BUFFER_TRACE(bh, "release");
		/* BUF_LOCKED -> BUF_CLEAN (fwiw) */
		refile_buffer(bh);
		/* Drop the reference that the journal_head held. */
		__brelse(bh);
		ret = 1;
	}
	return ret;
}

/*
 * log_wait_for_space: wait until there is space in the journal.
 *
 * Called with the journal already locked, but it will be unlocked if we have
 * to wait for a checkpoint to free up some space in the log.
 */
void log_wait_for_space(journal_t *journal, int nblocks)
{
	while (log_space_left(journal) < nblocks) {
		if (journal->j_flags & JFS_ABORT)
			return;
		/* Drop the journal lock before sleeping on the
		 * checkpoint semaphore, then retake it in lock order. */
		unlock_journal(journal);
		down(&journal->j_checkpoint_sem);
		lock_journal(journal);

		/* Test again, another process may have checkpointed
		 * while we were waiting for the checkpoint lock */
		if (log_space_left(journal) < nblocks) {
			log_do_checkpoint(journal, nblocks);
		}
		up(&journal->j_checkpoint_sem);
	}
}

/*
 * Clean up a transaction's checkpoint list.
 *
 * We wait for any pending IO to complete and make sure any clean
 * buffers are removed from the transaction.
 *
 * Return 1 if we performed any actions which might have destroyed the
 * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
 * the last checkpoint buffer is cleansed)
 *
 * Called with the journal locked.
 * Called with journal_datalist_lock held.
 */
static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
{
	struct journal_head *jh, *next_jh, *last_jh;
	struct buffer_head *bh;
	int ret = 0;

	assert_spin_locked(&journal_datalist_lock);
	jh = transaction->t_checkpoint_list;
	if (!jh)
		return 0;

	last_jh = jh->b_cpprev;
	next_jh = jh;
	do {
		jh = next_jh;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			/* IO in flight: pin the bh so it cannot vanish,
			 * drop both locks and wait for it to complete. */
			atomic_inc(&bh->b_count);
			spin_unlock(&journal_datalist_lock);
			unlock_journal(journal);
			wait_on_buffer(bh);
			/* the journal_head may have gone by now */
			BUFFER_TRACE(bh, "brelse");
			__brelse(bh);
			goto out_return_1;
		}

		if (jh->b_transaction != NULL) {
			/* NOTE: this inner `transaction' deliberately
			 * shadows the parameter - it is the buffer's
			 * current transaction, not the one we are
			 * cleaning up.  Force it to commit and wait. */
			transaction_t *transaction = jh->b_transaction;
			tid_t tid = transaction->t_tid;

			spin_unlock(&journal_datalist_lock);
			log_start_commit(journal, transaction);
			unlock_journal(journal);
			log_wait_commit(journal, tid);
			goto out_return_1;
		}

		/*
		 * We used to test for (jh->b_list != BUF_CLEAN) here.
		 * But unmap_underlying_metadata() can place buffer onto
		 * BUF_CLEAN. Since refile_buffer() no longer takes buffers
		 * off checkpoint lists, we cope with it here
		 */
		/*
		 * AKPM: I think the buffer_jdirty test is redundant - it
		 * shouldn't have NULL b_transaction?
		 */
		next_jh = jh->b_cpnext;
		if (!buffer_dirty(bh) && !buffer_jdirty(bh)) {
			BUFFER_TRACE(bh, "remove from checkpoint");
			__journal_remove_checkpoint(jh);
			__journal_remove_journal_head(bh);
			refile_buffer(bh);
			__brelse(bh);
			ret = 1;
		}

		jh = next_jh;
	} while (jh != last_jh);

	return ret;
out_return_1:
	/* We dropped the locks above; retake them in lock order before
	 * returning.  Caller must rescan: the lists may have changed. */
	lock_journal(journal);
	spin_lock(&journal_datalist_lock);
	return 1;
}

/* Max number of buffers submitted per __flush_batch() call. */
#define NR_BATCH	64

/*
 * Submit the queued batch of buffers for write and drop the references
 * taken in __flush_buffer().  Temporarily drops journal_datalist_lock
 * around the block-layer calls, which may sleep.
 */
static void __flush_batch(struct buffer_head **bhs, int *batch_count)
{
	int i;

	spin_unlock(&journal_datalist_lock);
	ll_rw_block(WRITE, *batch_count, bhs);
	run_task_queue(&tq_disk);
	spin_lock(&journal_datalist_lock);
	for (i = 0; i < *batch_count; i++) {
		struct buffer_head *bh = bhs[i];
		/* Writeout submitted: clear the flag that
		 * __flush_buffer() set on this buffer. */
		clear_bit(BH_JWrite, &bh->b_state);
		BUFFER_TRACE(bh, "brelse");
		__brelse(bh);
	}
	*batch_count = 0;
}

/*
 * Try to flush one buffer from the checkpoint list to disk.
 *
 * Return 1 if something happened which requires us to abort the current
 * scan of the checkpoint list.
 *
 * Called with journal_datalist_lock held.
 */
static int __flush_buffer(journal_t *journal, struct journal_head *jh,
			struct buffer_head **bhs, int *batch_count,
			int *drop_count)
{
	struct buffer_head *bh = jh2bh(jh);
	int ret = 0;

	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
		J_ASSERT_JH(jh, jh->b_transaction == NULL);

		/*
		 * Important: we are about to write the buffer, and
		 * possibly block, while still holding the journal lock.
		 * We cannot afford to let the transaction logic start
		 * messing around with this buffer before we write it to
		 * disk, as that would break recoverability.
		 */
		BUFFER_TRACE(bh, "queue");
		atomic_inc(&bh->b_count);
		J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
		/* BH_JWrite marks the buffer as queued by us; cleared
		 * again in __flush_batch() once IO is submitted. */
		set_bit(BH_JWrite, &bh->b_state);
		bhs[*batch_count] = bh;
		(*batch_count)++;
		if (*batch_count == NR_BATCH) {
			__flush_batch(bhs, batch_count);
			ret = 1;
		}
	} else {
		int last_buffer = 0;
		if (jh->b_cpnext == jh) {
			/* We may be about to drop the transaction.  Tell the
			 * caller that the lists have changed.
			 */
			last_buffer = 1;
		}
		if (__try_to_free_cp_buf(jh)) {
			(*drop_count)++;
			ret = last_buffer;
		}
	}
	return ret;
}


/*
 * Perform an actual checkpoint.  We don't write out only enough to
 * satisfy the current blocked requests: rather we submit a reasonably
 * sized chunk of the outstanding data to disk at once for
 * efficiency.  log_wait_for_space() will retry if we didn't free enough.
 *
 * However, we _do_ take into account the amount requested so that once
 * the IO has been queued, we can return as soon as enough of it has
 * completed to disk.
 *
 * The journal should be locked before calling this function.
 */

/* @@@ `nblocks' is unused.  Should it be used? */
int log_do_checkpoint (journal_t *journal, int nblocks)
{
	transaction_t *transaction, *last_transaction, *next_transaction;
	int result;
	int target;
	int batch_count = 0;
	struct buffer_head *bhs[NR_BATCH];

	jbd_debug(1, "Start checkpoint\n");

	/*
	 * First thing: if there are any transactions in the log which
	 * don't need checkpointing, just eliminate them from the
	 * journal straight away.
	 */
	result = cleanup_journal_tail(journal);
	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
	if (result <= 0)
		return result;

	/*
	 * OK, we need to start writing disk blocks.  Try to free up a
	 * quarter of the log in a single checkpoint if we can.
	 */
	/*
	 * AKPM: check this code.  I had a feeling a while back that it
	 * degenerates into a busy loop at unmount time.
	 */
	target = (journal->j_last - journal->j_first) / 4;
	/* NOTE(review): `target' is computed but never read below;
	 * like `nblocks', it appears to be unused in this function. */

	spin_lock(&journal_datalist_lock);
repeat:
	transaction = journal->j_checkpoint_transactions;
	if (transaction == NULL)
		goto done;
	last_transaction = transaction->t_cpprev;
	next_transaction = transaction;

	do {
		struct journal_head *jh, *last_jh, *next_jh;
		int drop_count = 0;
		int cleanup_ret, retry = 0;

		transaction = next_transaction;
		next_transaction = transaction->t_cpnext;
		jh = transaction->t_checkpoint_list;
		last_jh = jh->b_cpprev;
		next_jh = jh;
		do {
			jh = next_jh;
			next_jh = jh->b_cpnext;
			retry = __flush_buffer(journal, jh, bhs, &batch_count,
						&drop_count);
		} while (jh != last_jh && !retry);
		if (batch_count) {
			__flush_batch(bhs, &batch_count);
			goto repeat;
		}
		if (retry)
			goto repeat;
		/*
		 * We have walked the whole transaction list without
		 * finding anything to write to disk.  We had better be
		 * able to make some progress or we are in trouble.
		 */
		cleanup_ret = __cleanup_transaction(journal, transaction);
		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
		/* NOTE(review): this unconditional goto means the
		 * while() condition below is never evaluated - each pass
		 * restarts from the head of the checkpoint list, and
		 * termination relies on the list draining. */
		goto repeat;	/* __cleanup may have dropped lock */
	} while (transaction != last_transaction);

done:
	spin_unlock(&journal_datalist_lock);
	result = cleanup_journal_tail(journal);
	if (result < 0)
		return result;

	return 0;
}

/*
 * Check the list of checkpoint transactions for the journal to see if
 * we have already got rid of any since the last update of the log tail
 * in the journal superblock.  If so, we can instantly roll the
 * superblock forward to remove those transactions from the log.
 *
 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
 *
 * Called with the journal lock held.
352 * 353 * This is the only part of the journaling code which really needs to be 354 * aware of transaction aborts. Checkpointing involves writing to the 355 * main filesystem area rather than to the journal, so it can proceed 356 * even in abort state, but we must not update the journal superblock if 357 * we have an abort error outstanding. 358 */ 359 360int cleanup_journal_tail(journal_t *journal) 361{ 362 transaction_t * transaction; 363 tid_t first_tid; 364 unsigned long blocknr, freed; 365 366 /* OK, work out the oldest transaction remaining in the log, and 367 * the log block it starts at. 368 * 369 * If the log is now empty, we need to work out which is the 370 * next transaction ID we will write, and where it will 371 * start. */ 372 373 /* j_checkpoint_transactions needs locking */ 374 spin_lock(&journal_datalist_lock); 375 transaction = journal->j_checkpoint_transactions; 376 if (transaction) { 377 first_tid = transaction->t_tid; 378 blocknr = transaction->t_log_start; 379 } else if ((transaction = journal->j_committing_transaction) != NULL) { 380 first_tid = transaction->t_tid; 381 blocknr = transaction->t_log_start; 382 } else if ((transaction = journal->j_running_transaction) != NULL) { 383 first_tid = transaction->t_tid; 384 blocknr = journal->j_head; 385 } else { 386 first_tid = journal->j_transaction_sequence; 387 blocknr = journal->j_head; 388 } 389 spin_unlock(&journal_datalist_lock); 390 J_ASSERT (blocknr != 0); 391 392 /* If the oldest pinned transaction is at the tail of the log 393 already then there's not much we can do right now. */ 394 if (journal->j_tail_sequence == first_tid) 395 return 1; 396 397 /* OK, update the superblock to recover the freed space. 398 * Physical blocks come first: have we wrapped beyond the end of 399 * the log? 
*/ 400 freed = blocknr - journal->j_tail; 401 if (blocknr < journal->j_tail) 402 freed = freed + journal->j_last - journal->j_first; 403 404 jbd_debug(1, 405 "Cleaning journal tail from %d to %d (offset %lu), " 406 "freeing %lu\n", 407 journal->j_tail_sequence, first_tid, blocknr, freed); 408 409 journal->j_free += freed; 410 journal->j_tail_sequence = first_tid; 411 journal->j_tail = blocknr; 412 if (!(journal->j_flags & JFS_ABORT)) 413 journal_update_superblock(journal, 1); 414 return 0; 415} 416 417 418/* Checkpoint list management */ 419 420/* 421 * journal_clean_checkpoint_list 422 * 423 * Find all the written-back checkpoint buffers in the journal and release them. 424 * 425 * Called with the journal locked. 426 * Called with journal_datalist_lock held. 427 * Returns number of bufers reaped (for debug) 428 */ 429 430int __journal_clean_checkpoint_list(journal_t *journal) 431{ 432 transaction_t *transaction, *last_transaction, *next_transaction; 433 int ret = 0; 434 435 transaction = journal->j_checkpoint_transactions; 436 if (transaction == 0) 437 goto out; 438 439 last_transaction = transaction->t_cpprev; 440 next_transaction = transaction; 441 do { 442 struct journal_head *jh; 443 444 transaction = next_transaction; 445 next_transaction = transaction->t_cpnext; 446 jh = transaction->t_checkpoint_list; 447 if (jh) { 448 struct journal_head *last_jh = jh->b_cpprev; 449 struct journal_head *next_jh = jh; 450 do { 451 jh = next_jh; 452 next_jh = jh->b_cpnext; 453 ret += __try_to_free_cp_buf(jh); 454 } while (jh != last_jh); 455 } 456 } while (transaction != last_transaction); 457out: 458 return ret; 459} 460 461/* 462 * journal_remove_checkpoint: called after a buffer has been committed 463 * to disk (either by being write-back flushed to disk, or being 464 * committed to the log). 465 * 466 * We cannot safely clean a transaction out of the log until all of the 467 * buffer updates committed in that transaction have safely been stored 468 * elsewhere on disk. 
 * To achieve this, all of the buffers in a
 * transaction need to be maintained on the transaction's checkpoint
 * list until they have been rewritten, at which point this function is
 * called to remove the buffer from the existing transaction's
 * checkpoint list.
 *
 * This function is called with the journal locked.
 * This function is called with journal_datalist_lock held.
 */
void __journal_remove_checkpoint(struct journal_head *jh)
{
	transaction_t *transaction;
	journal_t *journal;

	JBUFFER_TRACE(jh, "entry");

	/* Nothing to do if the buffer is not on any checkpoint list. */
	if ((transaction = jh->b_cp_transaction) == NULL) {
		JBUFFER_TRACE(jh, "not on transaction");
		goto out;
	}

	journal = transaction->t_journal;

	__buffer_unlink(jh);

	/* If other buffers remain checkpointed, the transaction stays. */
	if (transaction->t_checkpoint_list != NULL)
		goto out;
	JBUFFER_TRACE(jh, "transaction has no more buffers");

	/* There is one special case to worry about: if we have just
	   pulled the buffer off a committing transaction's forget list,
	   then even if the checkpoint list is empty, the transaction
	   obviously cannot be dropped! */

	if (transaction == journal->j_committing_transaction) {
		JBUFFER_TRACE(jh, "belongs to committing transaction");
		goto out;
	}

	/* OK, that was the last buffer for the transaction: we can now
	   safely remove this transaction from the log */

	__journal_drop_transaction(journal, transaction);

	/* Just in case anybody was waiting for more transactions to be
	   checkpointed... */
	wake_up(&journal->j_wait_logspace);
out:
	JBUFFER_TRACE(jh, "exit");
}

/* Locking wrapper: takes journal_datalist_lock around the unlink. */
void journal_remove_checkpoint(struct journal_head *jh)
{
	spin_lock(&journal_datalist_lock);
	__journal_remove_checkpoint(jh);
	spin_unlock(&journal_datalist_lock);
}

/*
 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
 * list so that we know when it is safe to clean the transaction out of
 * the log.
 *
 * Called with the journal locked.
 * Called with journal_datalist_lock held.
 */
void __journal_insert_checkpoint(struct journal_head *jh,
			       transaction_t *transaction)
{
	JBUFFER_TRACE(jh, "entry");
	/* Buffer must be dirty in some sense and not yet checkpointed. */
	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh)));
	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);

	assert_spin_locked(&journal_datalist_lock);
	jh->b_cp_transaction = transaction;

	if (!transaction->t_checkpoint_list) {
		/* First element of a circular list links to itself. */
		jh->b_cpnext = jh->b_cpprev = jh;
	} else {
		/* Insert immediately before the current list head. */
		jh->b_cpnext = transaction->t_checkpoint_list;
		jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
		jh->b_cpprev->b_cpnext = jh;
		jh->b_cpnext->b_cpprev = jh;
	}
	transaction->t_checkpoint_list = jh;
}

/* Locking wrapper: takes journal_datalist_lock around the insert. */
void journal_insert_checkpoint(struct journal_head *jh,
			      transaction_t *transaction)
{
	spin_lock(&journal_datalist_lock);
	__journal_insert_checkpoint(jh, transaction);
	spin_unlock(&journal_datalist_lock);
}

/*
 * We've finished with this transaction structure: adios...
 *
 * The transaction must have no links except for the checkpoint by this
 * point.
 *
 * Called with the journal locked.
 * Called with journal_datalist_lock held.
572 */ 573 574void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) 575{ 576 assert_spin_locked(&journal_datalist_lock); 577 if (transaction->t_cpnext) { 578 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 579 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 580 if (journal->j_checkpoint_transactions == transaction) 581 journal->j_checkpoint_transactions = 582 transaction->t_cpnext; 583 if (journal->j_checkpoint_transactions == transaction) 584 journal->j_checkpoint_transactions = NULL; 585 } 586 587 J_ASSERT (transaction->t_ilist == NULL); 588 J_ASSERT (transaction->t_buffers == NULL); 589 J_ASSERT (transaction->t_sync_datalist == NULL); 590 J_ASSERT (transaction->t_async_datalist == NULL); 591 J_ASSERT (transaction->t_forget == NULL); 592 J_ASSERT (transaction->t_iobuf_list == NULL); 593 J_ASSERT (transaction->t_shadow_list == NULL); 594 J_ASSERT (transaction->t_log_list == NULL); 595 J_ASSERT (transaction->t_checkpoint_list == NULL); 596 J_ASSERT (transaction->t_updates == 0); 597 J_ASSERT (list_empty(&transaction->t_jcb)); 598 599 J_ASSERT (transaction->t_journal->j_committing_transaction != 600 transaction); 601 602 jbd_debug (1, "Dropping transaction %d, all done\n", 603 transaction->t_tid); 604 kfree (transaction); 605} 606 607