1/* 2 * linux/fs/jbd/revoke.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 5 * 6 * Copyright 2000 Red Hat corp --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Journal revoke routines for the generic filesystem journaling code; 13 * part of the ext2fs journaling system. 14 * 15 * Revoke is the mechanism used to prevent old log records for deleted 16 * metadata from being replayed on top of newer data using the same 17 * blocks. The revoke mechanism is used in two separate places: 18 * 19 * + Commit: during commit we write the entire list of the current 20 * transaction's revoked blocks to the journal 21 * 22 * + Recovery: during recovery we record the transaction ID of all 23 * revoked blocks. If there are multiple revoke records in the log 24 * for a single block, only the last one counts, and if there is a log 25 * entry for a block beyond the last revoke, then that log entry still 26 * gets replayed. 27 * 28 * We can get interactions between revokes and new log data within a 29 * single transaction: 30 * 31 * Block is revoked and then journaled: 32 * The desired end result is the journaling of the new block, so we 33 * cancel the revoke before the transaction commits. 34 * 35 * Block is journaled and then revoked: 36 * The revoke must take precedence over the write of the block, so we 37 * need either to cancel the journal entry or to write the revoke 38 * later in the log than the log block. In this case, we choose the 39 * latter: journaling a block cancels any revoke record for that block 40 * in the current transaction, so any revoke for that block in the 41 * transaction must have happened after the block was journaled and so 42 * the revoke must take precedence. 43 * 44 * Block is revoked and then written as data: 45 * The data write is allowed to succeed, but the revoke is _not_ 46 * cancelled. We still need to prevent old log records from 47 * overwriting the new data. We don't even need to clear the revoke 48 * bit here. 49 * 50 * Revoke information on buffers is a tri-state value: 51 * 52 * RevokeValid clear: no cached revoke status, need to look it up 53 * RevokeValid set, Revoked clear: 54 * buffer has not been revoked, and cancel_revoke 55 * need do nothing. 56 * RevokeValid set, Revoked set: 57 * buffer has been revoked. 58 */ 59 60#ifndef __KERNEL__ 61#include "jfs_user.h" 62#else 63#include <linux/time.h> 64#include <linux/fs.h> 65#include <linux/jbd.h> 66#include <linux/errno.h> 67#include <linux/slab.h> 68#include <linux/list.h> 69#include <linux/init.h> 70#endif 71 72static struct kmem_cache *revoke_record_cache; 73static struct kmem_cache *revoke_table_cache; 74 75/* Each revoke record represents one single revoked block. During 76 journal replay, this involves recording the transaction ID of the 77 last transaction to revoke this block. */ 78 79struct jbd_revoke_record_s 80{ 81 struct list_head hash; 82 tid_t sequence; /* Used for recovery only */ 83 unsigned long blocknr; 84}; 85 86 87/* The revoke table is just a simple hash table of revoke records. */ 88struct jbd_revoke_table_s 89{ 90 /* It is conceivable that we might want a larger hash table 91 * for recovery. Must be a power of two. */ 92 int hash_size; 93 int hash_shift; 94 struct list_head *hash_table; 95}; 96 97 98#ifdef __KERNEL__ 99static void write_one_revoke_record(journal_t *, transaction_t *, 100 struct journal_head **, int *, 101 struct jbd_revoke_record_s *); 102static void flush_descriptor(journal_t *, struct journal_head *, int); 103#endif 104 105/* Utility functions to maintain the revoke table */ 106 107/* Borrowed from buffer.c: this is a tried and tested block hash function */ 108static inline int hash(journal_t *journal, unsigned long block) 109{ 110 struct jbd_revoke_table_s *table = journal->j_revoke; 111 int hash_shift = table->hash_shift; 112 113 return ((block << (hash_shift - 6)) ^ 114 (block >> 13) ^ 115 (block << (hash_shift - 12))) & (table->hash_size - 1); 116} 117 118static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, 119 tid_t seq) 120{ 121 struct list_head *hash_list; 122 struct jbd_revoke_record_s *record; 123 124repeat: 125 record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); 126 if (!record) 127 goto oom; 128 129 record->sequence = seq; 130 record->blocknr = blocknr; 131 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; 132 spin_lock(&journal->j_revoke_lock); 133 list_add(&record->hash, hash_list); 134 spin_unlock(&journal->j_revoke_lock); 135 return 0; 136 137oom: 138 if (!journal_oom_retry) 139 return -ENOMEM; 140 jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); 141 yield(); 142 goto repeat; 143} 144 145/* Find a revoke record in the journal's hash table. */ 146 147static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, 148 unsigned long blocknr) 149{ 150 struct list_head *hash_list; 151 struct jbd_revoke_record_s *record; 152 153 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; 154 155 spin_lock(&journal->j_revoke_lock); 156 record = (struct jbd_revoke_record_s *) hash_list->next; 157 while (&(record->hash) != hash_list) { 158 if (record->blocknr == blocknr) { 159 spin_unlock(&journal->j_revoke_lock); 160 return record; 161 } 162 record = (struct jbd_revoke_record_s *) record->hash.next; 163 } 164 spin_unlock(&journal->j_revoke_lock); 165 return NULL; 166} 167 168int __init journal_init_revoke_caches(void) 169{ 170 revoke_record_cache = kmem_cache_create("revoke_record", 171 sizeof(struct jbd_revoke_record_s), 172 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 173 if (revoke_record_cache == 0) 174 return -ENOMEM; 175 176 revoke_table_cache = kmem_cache_create("revoke_table", 177 sizeof(struct jbd_revoke_table_s), 178 0, 0, NULL, NULL); 179 if (revoke_table_cache == 0) { 180 kmem_cache_destroy(revoke_record_cache); 181 revoke_record_cache = NULL; 182 return -ENOMEM; 183 } 184 return 0; 185} 186 187void journal_destroy_revoke_caches(void) 188{ 189 kmem_cache_destroy(revoke_record_cache); 190 revoke_record_cache = NULL; 191 kmem_cache_destroy(revoke_table_cache); 192 revoke_table_cache = NULL; 193} 194 195/* Initialise the revoke table for a given journal to a given size. */ 196 197int journal_init_revoke(journal_t *journal, int hash_size) 198{ 199 int shift, tmp; 200 201 J_ASSERT (journal->j_revoke_table[0] == NULL); 202 203 shift = 0; 204 tmp = hash_size; 205 while((tmp >>= 1UL) != 0UL) 206 shift++; 207 208 journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); 209 if (!journal->j_revoke_table[0]) 210 return -ENOMEM; 211 journal->j_revoke = journal->j_revoke_table[0]; 212 213 /* Check that the hash_size is a power of two */ 214 J_ASSERT ((hash_size & (hash_size-1)) == 0); 215 216 journal->j_revoke->hash_size = hash_size; 217 218 journal->j_revoke->hash_shift = shift; 219 220 journal->j_revoke->hash_table = 221 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); 222 if (!journal->j_revoke->hash_table) { 223 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); 224 journal->j_revoke = NULL; 225 return -ENOMEM; 226 } 227 228 for (tmp = 0; tmp < hash_size; tmp++) 229 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); 230 231 journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); 232 if (!journal->j_revoke_table[1]) { 233 kfree(journal->j_revoke_table[0]->hash_table); 234 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); 235 return -ENOMEM; 236 } 237 238 journal->j_revoke = journal->j_revoke_table[1]; 239 240 /* Check that the hash_size is a power of two */ 241 J_ASSERT ((hash_size & (hash_size-1)) == 0); 242 243 journal->j_revoke->hash_size = hash_size; 244 245 journal->j_revoke->hash_shift = shift; 246 247 journal->j_revoke->hash_table = 248 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); 249 if (!journal->j_revoke->hash_table) { 250 kfree(journal->j_revoke_table[0]->hash_table); 251 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); 252 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); 253 journal->j_revoke = NULL; 254 return -ENOMEM; 255 } 256 257 for (tmp = 0; tmp < hash_size; tmp++) 258 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); 259 260 spin_lock_init(&journal->j_revoke_lock); 261 262 return 0; 263} 264 265/* Destoy a journal's revoke table. The table must already be empty! */ 266 267void journal_destroy_revoke(journal_t *journal) 268{ 269 struct jbd_revoke_table_s *table; 270 struct list_head *hash_list; 271 int i; 272 273 table = journal->j_revoke_table[0]; 274 if (!table) 275 return; 276 277 for (i=0; i<table->hash_size; i++) { 278 hash_list = &table->hash_table[i]; 279 J_ASSERT (list_empty(hash_list)); 280 } 281 282 kfree(table->hash_table); 283 kmem_cache_free(revoke_table_cache, table); 284 journal->j_revoke = NULL; 285 286 table = journal->j_revoke_table[1]; 287 if (!table) 288 return; 289 290 for (i=0; i<table->hash_size; i++) { 291 hash_list = &table->hash_table[i]; 292 J_ASSERT (list_empty(hash_list)); 293 } 294 295 kfree(table->hash_table); 296 kmem_cache_free(revoke_table_cache, table); 297 journal->j_revoke = NULL; 298} 299 300 301#ifdef __KERNEL__ 302 303/* 304 * journal_revoke: revoke a given buffer_head from the journal. This 305 * prevents the block from being replayed during recovery if we take a 306 * crash after this current transaction commits. Any subsequent 307 * metadata writes of the buffer in this transaction cancel the 308 * revoke. 309 * 310 * Note that this call may block --- it is up to the caller to make 311 * sure that there are no further calls to journal_write_metadata 312 * before the revoke is complete. In ext3, this implies calling the 313 * revoke before clearing the block bitmap when we are deleting 314 * metadata. 315 * 316 * Revoke performs a journal_forget on any buffer_head passed in as a 317 * parameter, but does _not_ forget the buffer_head if the bh was only 318 * found implicitly. 319 * 320 * bh_in may not be a journalled buffer - it may have come off 321 * the hash tables without an attached journal_head. 322 * 323 * If bh_in is non-zero, journal_revoke() will decrement its b_count 324 * by one. 325 */ 326 327int journal_revoke(handle_t *handle, unsigned long blocknr, 328 struct buffer_head *bh_in) 329{ 330 struct buffer_head *bh = NULL; 331 journal_t *journal; 332 struct block_device *bdev; 333 int err; 334 335 might_sleep(); 336 if (bh_in) 337 BUFFER_TRACE(bh_in, "enter"); 338 339 journal = handle->h_transaction->t_journal; 340 if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ 341 J_ASSERT (!"Cannot set revoke feature!"); 342 return -EINVAL; 343 } 344 345 bdev = journal->j_fs_dev; 346 bh = bh_in; 347 348 if (!bh) { 349 bh = __find_get_block(bdev, blocknr, journal->j_blocksize); 350 if (bh) 351 BUFFER_TRACE(bh, "found on hash"); 352 } 353#ifdef JBD_EXPENSIVE_CHECKING 354 else { 355 struct buffer_head *bh2; 356 357 /* If there is a different buffer_head lying around in 358 * memory anywhere... */ 359 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); 360 if (bh2) { 361 /* ... and it has RevokeValid status... */ 362 if (bh2 != bh && buffer_revokevalid(bh2)) 363 /* ...then it better be revoked too, 364 * since it's illegal to create a revoke 365 * record against a buffer_head which is 366 * not marked revoked --- that would 367 * risk missing a subsequent revoke 368 * cancel. */ 369 J_ASSERT_BH(bh2, buffer_revoked(bh2)); 370 put_bh(bh2); 371 } 372 } 373#endif 374 375 /* We really ought not ever to revoke twice in a row without 376 first having the revoke cancelled: it's illegal to free a 377 block twice without allocating it in between! */ 378 if (bh) { 379 if (!J_EXPECT_BH(bh, !buffer_revoked(bh), 380 "inconsistent data on disk")) { 381 if (!bh_in) 382 brelse(bh); 383 return -EIO; 384 } 385 set_buffer_revoked(bh); 386 set_buffer_revokevalid(bh); 387 if (bh_in) { 388 BUFFER_TRACE(bh_in, "call journal_forget"); 389 journal_forget(handle, bh_in); 390 } else { 391 BUFFER_TRACE(bh, "call brelse"); 392 __brelse(bh); 393 } 394 } 395 396 jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); 397 err = insert_revoke_hash(journal, blocknr, 398 handle->h_transaction->t_tid); 399 BUFFER_TRACE(bh_in, "exit"); 400 return err; 401} 402 403/* 404 * Cancel an outstanding revoke. For use only internally by the 405 * journaling code (called from journal_get_write_access). 406 * 407 * We trust buffer_revoked() on the buffer if the buffer is already 408 * being journaled: if there is no revoke pending on the buffer, then we 409 * don't do anything here. 410 * 411 * This would break if it were possible for a buffer to be revoked and 412 * discarded, and then reallocated within the same transaction. In such 413 * a case we would have lost the revoked bit, but when we arrived here 414 * the second time we would still have a pending revoke to cancel. So, 415 * do not trust the Revoked bit on buffers unless RevokeValid is also 416 * set. 417 * 418 * The caller must have the journal locked. 419 */ 420int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 421{ 422 struct jbd_revoke_record_s *record; 423 journal_t *journal = handle->h_transaction->t_journal; 424 int need_cancel; 425 int did_revoke = 0; /* akpm: debug */ 426 struct buffer_head *bh = jh2bh(jh); 427 428 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); 429 430 /* Is the existing Revoke bit valid? If so, we trust it, and 431 * only perform the full cancel if the revoke bit is set. If 432 * not, we can't trust the revoke bit, and we need to do the 433 * full search for a revoke record. */ 434 if (test_set_buffer_revokevalid(bh)) { 435 need_cancel = test_clear_buffer_revoked(bh); 436 } else { 437 need_cancel = 1; 438 clear_buffer_revoked(bh); 439 } 440 441 if (need_cancel) { 442 record = find_revoke_record(journal, bh->b_blocknr); 443 if (record) { 444 jbd_debug(4, "cancelled existing revoke on " 445 "blocknr %llu\n", (unsigned long long)bh->b_blocknr); 446 spin_lock(&journal->j_revoke_lock); 447 list_del(&record->hash); 448 spin_unlock(&journal->j_revoke_lock); 449 kmem_cache_free(revoke_record_cache, record); 450 did_revoke = 1; 451 } 452 } 453 454#ifdef JBD_EXPENSIVE_CHECKING 455 /* There better not be one left behind by now! */ 456 record = find_revoke_record(journal, bh->b_blocknr); 457 J_ASSERT_JH(jh, record == NULL); 458#endif 459 460 /* Finally, have we just cleared revoke on an unhashed 461 * buffer_head? If so, we'd better make sure we clear the 462 * revoked status on any hashed alias too, otherwise the revoke 463 * state machine will get very upset later on. */ 464 if (need_cancel) { 465 struct buffer_head *bh2; 466 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); 467 if (bh2) { 468 if (bh2 != bh) 469 clear_buffer_revoked(bh2); 470 __brelse(bh2); 471 } 472 } 473 return did_revoke; 474} 475 476/* journal_switch_revoke table select j_revoke for next transaction 477 * we do not want to suspend any processing until all revokes are 478 * written -bzzz 479 */ 480void journal_switch_revoke_table(journal_t *journal) 481{ 482 int i; 483 484 if (journal->j_revoke == journal->j_revoke_table[0]) 485 journal->j_revoke = journal->j_revoke_table[1]; 486 else 487 journal->j_revoke = journal->j_revoke_table[0]; 488 489 for (i = 0; i < journal->j_revoke->hash_size; i++) 490 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); 491} 492 493/* 494 * Write revoke records to the journal for all entries in the current 495 * revoke hash, deleting the entries as we go. 496 * 497 * Called with the journal lock held. 498 */ 499 500void journal_write_revoke_records(journal_t *journal, 501 transaction_t *transaction) 502{ 503 struct journal_head *descriptor; 504 struct jbd_revoke_record_s *record; 505 struct jbd_revoke_table_s *revoke; 506 struct list_head *hash_list; 507 int i, offset, count; 508 509 descriptor = NULL; 510 offset = 0; 511 count = 0; 512 513 /* select revoke table for committing transaction */ 514 revoke = journal->j_revoke == journal->j_revoke_table[0] ? 515 journal->j_revoke_table[1] : journal->j_revoke_table[0]; 516 517 for (i = 0; i < revoke->hash_size; i++) { 518 hash_list = &revoke->hash_table[i]; 519 520 while (!list_empty(hash_list)) { 521 record = (struct jbd_revoke_record_s *) 522 hash_list->next; 523 write_one_revoke_record(journal, transaction, 524 &descriptor, &offset, 525 record); 526 count++; 527 list_del(&record->hash); 528 kmem_cache_free(revoke_record_cache, record); 529 } 530 } 531 if (descriptor) 532 flush_descriptor(journal, descriptor, offset); 533 jbd_debug(1, "Wrote %d revoke records\n", count); 534} 535 536/* 537 * Write out one revoke record. We need to create a new descriptor 538 * block if the old one is full or if we have not already created one. 539 */ 540 541static void write_one_revoke_record(journal_t *journal, 542 transaction_t *transaction, 543 struct journal_head **descriptorp, 544 int *offsetp, 545 struct jbd_revoke_record_s *record) 546{ 547 struct journal_head *descriptor; 548 int offset; 549 journal_header_t *header; 550 551 /* If we are already aborting, this all becomes a noop. We 552 still need to go round the loop in 553 journal_write_revoke_records in order to free all of the 554 revoke records: only the IO to the journal is omitted. */ 555 if (is_journal_aborted(journal)) 556 return; 557 558 descriptor = *descriptorp; 559 offset = *offsetp; 560 561 /* Make sure we have a descriptor with space left for the record */ 562 if (descriptor) { 563 if (offset == journal->j_blocksize) { 564 flush_descriptor(journal, descriptor, offset); 565 descriptor = NULL; 566 } 567 } 568 569 if (!descriptor) { 570 descriptor = journal_get_descriptor_buffer(journal); 571 if (!descriptor) 572 return; 573 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; 574 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); 575 header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); 576 header->h_sequence = cpu_to_be32(transaction->t_tid); 577 578 /* Record it so that we can wait for IO completion later */ 579 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); 580 journal_file_buffer(descriptor, transaction, BJ_LogCtl); 581 582 offset = sizeof(journal_revoke_header_t); 583 *descriptorp = descriptor; 584 } 585 586 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 587 cpu_to_be32(record->blocknr); 588 offset += 4; 589 *offsetp = offset; 590} 591 592/* 593 * Flush a revoke descriptor out to the journal. If we are aborting, 594 * this is a noop; otherwise we are generating a buffer which needs to 595 * be waited for during commit, so it has to go onto the appropriate 596 * journal buffer list. 597 */ 598 599static void flush_descriptor(journal_t *journal, 600 struct journal_head *descriptor, 601 int offset) 602{ 603 journal_revoke_header_t *header; 604 struct buffer_head *bh = jh2bh(descriptor); 605 606 if (is_journal_aborted(journal)) { 607 put_bh(bh); 608 return; 609 } 610 611 header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; 612 header->r_count = cpu_to_be32(offset); 613 set_buffer_jwrite(bh); 614 BUFFER_TRACE(bh, "write"); 615 set_buffer_dirty(bh); 616 ll_rw_block(SWRITE, 1, &bh); 617} 618#endif 619 620/* 621 * Revoke support for recovery. 622 * 623 * Recovery needs to be able to: 624 * 625 * record all revoke records, including the tid of the latest instance 626 * of each revoke in the journal 627 * 628 * check whether a given block in a given transaction should be replayed 629 * (ie. has not been revoked by a revoke record in that or a subsequent 630 * transaction) 631 * 632 * empty the revoke table after recovery. 633 */ 634 635/* 636 * First, setting revoke records. We create a new revoke record for 637 * every block ever revoked in the log as we scan it for recovery, and 638 * we update the existing records if we find multiple revokes for a 639 * single block. 640 */ 641 642int journal_set_revoke(journal_t *journal, 643 unsigned long blocknr, 644 tid_t sequence) 645{ 646 struct jbd_revoke_record_s *record; 647 648 record = find_revoke_record(journal, blocknr); 649 if (record) { 650 /* If we have multiple occurrences, only record the 651 * latest sequence number in the hashed record */ 652 if (tid_gt(sequence, record->sequence)) 653 record->sequence = sequence; 654 return 0; 655 } 656 return insert_revoke_hash(journal, blocknr, sequence); 657} 658 659/* 660 * Test revoke records. For a given block referenced in the log, has 661 * that block been revoked? A revoke record with a given transaction 662 * sequence number revokes all blocks in that transaction and earlier 663 * ones, but later transactions still need replayed. 664 */ 665 666int journal_test_revoke(journal_t *journal, 667 unsigned long blocknr, 668 tid_t sequence) 669{ 670 struct jbd_revoke_record_s *record; 671 672 record = find_revoke_record(journal, blocknr); 673 if (!record) 674 return 0; 675 if (tid_gt(sequence, record->sequence)) 676 return 0; 677 return 1; 678} 679 680/* 681 * Finally, once recovery is over, we need to clear the revoke table so 682 * that it can be reused by the running filesystem. 683 */ 684 685void journal_clear_revoke(journal_t *journal) 686{ 687 int i; 688 struct list_head *hash_list; 689 struct jbd_revoke_record_s *record; 690 struct jbd_revoke_table_s *revoke; 691 692 revoke = journal->j_revoke; 693 694 for (i = 0; i < revoke->hash_size; i++) { 695 hash_list = &revoke->hash_table[i]; 696 while (!list_empty(hash_list)) { 697 record = (struct jbd_revoke_record_s*) hash_list->next; 698 list_del(&record->hash); 699 kmem_cache_free(revoke_record_cache, record); 700 } 701 } 702} 703