1/* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19#include <linux/fs.h> 20#include <linux/blkdev.h> 21#include <linux/scatterlist.h> 22#include <linux/swap.h> 23#include <linux/radix-tree.h> 24#include <linux/writeback.h> 25#include <linux/buffer_head.h> 26#include <linux/workqueue.h> 27#include <linux/kthread.h> 28#include <linux/freezer.h> 29#include <linux/crc32c.h> 30#include <linux/slab.h> 31#include "compat.h" 32#include "ctree.h" 33#include "disk-io.h" 34#include "transaction.h" 35#include "btrfs_inode.h" 36#include "volumes.h" 37#include "print-tree.h" 38#include "async-thread.h" 39#include "locking.h" 40#include "tree-log.h" 41#include "free-space-cache.h" 42 43static struct extent_io_ops btree_extent_io_ops; 44static void end_workqueue_fn(struct btrfs_work *work); 45static void free_fs_root(struct btrfs_root *root); 46 47/* 48 * end_io_wq structs are used to do processing in task context when an IO is 49 * complete. This is used during reads to verify checksums, and it is used 50 * by writes to insert metadata for new file extents after IO is complete. 
 */
struct end_io_wq {
	struct bio *bio;		/* the bio whose completion is being deferred */
	bio_end_io_t *end_io;		/* saved bi_end_io, invoked later from end_workqueue_fn */
	void *private;			/* saved bi_private from the original bio */
	struct btrfs_fs_info *info;
	int error;			/* completion error code captured in end_workqueue_bio */
	int metadata;			/* nonzero: metadata IO, routed to the meta endio workers */
	struct list_head list;
	struct btrfs_work work;		/* work item queued on the endio worker threads */
};

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads. They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
	struct inode *inode;
	struct bio *bio;
	struct list_head list;
	extent_submit_bio_hook_t *submit_bio_start;	/* runs in worker context before submit */
	extent_submit_bio_hook_t *submit_bio_done;	/* runs after ordered completion */
	int rw;
	int mirror_num;
	unsigned long bio_flags;
	/*
	 * bio_offset is optional, can be used if the pages in the bio
	 * can't tell us where in the file the bio should go
	 */
	u64 bio_offset;
	struct btrfs_work work;
};

/* These are used to set the lockdep class on the extent buffer locks.
 * The class is set by the readpage_end_io_hook after the buffer has
 * passed csum validation but before the pages are unlocked.
 *
 * The lockdep class is also set by btrfs_init_new_buffer on freshly
 * allocated blocks.
 *
 * The class is based on the level in the tree block, which allows lockdep
 * to know that lower nodes nest inside the locks of higher nodes.
 *
 * We also add a check to make sure the highest level of the tree is
 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
 * code needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
# error
# endif
static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
	/* leaf */
	"btrfs-extent-00",
	"btrfs-extent-01",
	"btrfs-extent-02",
	"btrfs-extent-03",
	"btrfs-extent-04",
	"btrfs-extent-05",
	"btrfs-extent-06",
	"btrfs-extent-07",
	/* highest possible level */
	"btrfs-extent-08",
};
#endif

/*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
 *
 * Returns the mapping, or an ERR_PTR on allocation failure or when a
 * racing insert left neither the new nor an overlapping mapping visible.
 */
static struct extent_map *btree_get_extent(struct inode *inode,
		struct page *page, size_t page_offset, u64 start, u64 len,
		int create)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	int ret;

	/* fast path: somebody already inserted the device-spanning extent */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (em) {
		em->bdev =
			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
		read_unlock(&em_tree->lock);
		goto out;
	}
	read_unlock(&em_tree->lock);

	em = alloc_extent_map(GFP_NOFS);
	if (!em) {
		em = ERR_PTR(-ENOMEM);
		goto out;
	}
	/* one extent covering the whole device */
	em->start = 0;
	em->len = (u64)-1;
	em->block_len = (u64)-1;
	em->block_start = 0;
	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	if (ret == -EEXIST) {
		/* lost a race with another inserter; use whatever is there now */
		u64 failed_start = em->start;
		u64 failed_len = em->len;

		free_extent_map(em);
		em = lookup_extent_mapping(em_tree, start, len);
		if (em) {
			ret = 0;
		} else {
			em = lookup_extent_mapping(em_tree, failed_start,
						   failed_len);
			ret = -EIO;
		}
	} else if (ret) {
		free_extent_map(em);
		em = NULL;
	}
	write_unlock(&em_tree->lock);

	if (ret)
		em = ERR_PTR(ret);
out:
	return em;
}

u32 btrfs_csum_data(struct
btrfs_root *root, char *data, u32 seed, size_t len)
{
	return crc32c(seed, data, len);
}

/* finalize the crc: invert and store it little-endian, per the on-disk format */
void btrfs_csum_final(u32 crc, char *result)
{
	*(__le32 *)result = ~cpu_to_le32(crc);
}

/*
 * compute the csum for a btree block, and either verify it or write it
 * into the csum field of the block.
 *
 * Returns 0 on success, 1 on mapping/allocation failure or csum mismatch.
 */
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
			   int verify)
{
	u16 csum_size =
		btrfs_super_csum_size(&root->fs_info->super_copy);
	char *result = NULL;
	unsigned long len;
	unsigned long cur_len;
	unsigned long offset = BTRFS_CSUM_SIZE;	/* csum covers everything after the csum field */
	char *map_token = NULL;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;
	u32 crc = ~(u32)0;
	unsigned long inline_result;

	/* walk the buffer one kmapped chunk at a time, folding into crc */
	len = buf->len - offset;
	while (len > 0) {
		err = map_private_extent_buffer(buf, offset, 32,
					&map_token, &kaddr,
					&map_start, &map_len, KM_USER0);
		if (err)
			return 1;
		cur_len = min(len, map_len - (offset - map_start));
		crc = btrfs_csum_data(root, kaddr + offset - map_start,
				      crc, cur_len);
		len -= cur_len;
		offset += cur_len;
		unmap_extent_buffer(buf, map_token, KM_USER0);
	}
	/* small csums fit in inline_result; larger ones need a heap buffer */
	if (csum_size > sizeof(inline_result)) {
		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
		if (!result)
			return 1;
	} else {
		result = (char *)&inline_result;
	}

	btrfs_csum_final(crc, result);

	if (verify) {
		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
			u32 val;
			u32 found = 0;
			memcpy(&found, result, csum_size);

			read_extent_buffer(buf, &val, 0, csum_size);
			if (printk_ratelimit()) {
				printk(KERN_INFO "btrfs: %s checksum verify "
				       "failed on %llu wanted %X found %X "
				       "level %d\n",
				       root->fs_info->sb->s_id,
				       (unsigned long long)buf->start, val, found,
				       btrfs_header_level(buf));
			}
			if (result != (char *)&inline_result)
				kfree(result);
			return 1;
		}
	} else {
		write_extent_buffer(buf, result, 0, csum_size);
	}
	if (result != (char *)&inline_result)
		kfree(result);
	return 0;
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer. This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
				 struct extent_buffer *eb, u64 parent_transid)
{
	struct extent_state *cached_state = NULL;
	int ret;

	/* parent_transid == 0 means the caller has nothing to check against */
	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 0;

	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
			 0, &cached_state, GFP_NOFS);
	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
	    btrfs_header_generation(eb) == parent_transid) {
		ret = 0;
		goto out;
	}
	if (printk_ratelimit()) {
		printk("parent transid verify failed on %llu wanted %llu "
		       "found %llu\n",
		       (unsigned long long)eb->start,
		       (unsigned long long)parent_transid,
		       (unsigned long long)btrfs_header_generation(eb));
	}
	ret = 1;
	/* force a re-read from another mirror by dropping the uptodate bit */
	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
out:
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
			     &cached_state, GFP_NOFS);
	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 */
static int btree_read_extent_buffer_pages(struct btrfs_root *root,
					  struct extent_buffer *eb,
					  u64 start, u64 parent_transid)
{
	struct extent_io_tree *io_tree;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;

	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
	while (1) {
		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
					       btree_get_extent, mirror_num);
		if (!ret &&
		    !verify_parent_transid(io_tree, eb, parent_transid))
			return ret;

		/* single-copy metadata: nothing else to try */
		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
					      eb->start, eb->len);
		if (num_copies == 1)
			return ret;

		mirror_num++;
		if (mirror_num > num_copies)
			return ret;
	}
	/* unreachable: the loop always returns */
	return -EIO;
}

/*
 * checksum a dirty tree block before IO. This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
 *
 * Always returns 0; failed sanity checks skip the csum but do not fail IO.
 */

static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
	struct extent_io_tree *tree;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 found_start;
	int found_level;
	unsigned long len;
	struct extent_buffer *eb;
	int ret;

	tree = &BTRFS_I(page->mapping->host)->io_tree;

	if (page->private == EXTENT_PAGE_PRIVATE)
		goto out;
	if (!page->private)
		goto out;
	/* page->private stores the buffer length shifted left by 2 */
	len = page->private >> 2;
	WARN_ON(len == 0);

	/*
	 * NOTE(review): alloc_extent_buffer's return is not NULL-checked
	 * before use, and the read below passes start + PAGE_CACHE_SIZE as
	 * the read offset with BUG_ON on failure -- looks suspicious,
	 * confirm against upstream history before touching.
	 */
	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
					     btrfs_header_generation(eb));
	BUG_ON(ret);
	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		WARN_ON(1);
		goto err;
	}
	/* only the first page of a multi-page block carries the csum */
	if (eb->first_page != page) {
		WARN_ON(1);
		goto err;
	}
	if (!PageUptodate(page)) {
		WARN_ON(1);
		goto err;
	}
	found_level = btrfs_header_level(eb);

	csum_tree_block(root, eb, 0);
err:
	free_extent_buffer(eb);
out:
	return 0;
}

/*
 * check that a tree block's fsid matches this filesystem (or one of its
 * seed devices). Returns 0 on match, 1 on mismatch.
 */
static int check_tree_block_fsid(struct btrfs_root *root,
				 struct extent_buffer *eb)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	u8 fsid[BTRFS_UUID_SIZE];
	int ret = 1;

	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
			   BTRFS_FSID_SIZE);
	/* walk the seed-device chain looking for a matching fsid */
	while (fs_devices) {
		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
			ret = 0;
			break;
		}
		fs_devices = fs_devices->seed;
	}
	return ret;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* give the eb's lock the per-level lockdep class (see table above) */
void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
{
	lockdep_set_class_and_name(&eb->lock,
				   &btrfs_eb_class[level],
				   btrfs_eb_name[level]);
}
#endif

/*
 * end-io verification for btree reads: check bytenr, fsid and csum of the
 * block that just landed in the page cache. Returns 0 or -EIO.
 */
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
				      struct extent_state *state)
{
	struct extent_io_tree *tree;
	u64 found_start;
	int found_level;
	unsigned long len;
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	int ret = 0;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (page->private == EXTENT_PAGE_PRIVATE)
		goto out;
	if (!page->private)
		goto out;

	len = page->private >> 2;
	WARN_ON(len == 0);

	/* NOTE(review): return value not NULL-checked -- confirm alloc_extent_buffer can't fail here */
	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);

	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "btrfs bad tree block start "
			       "%llu %llu\n",
			       (unsigned long long)found_start,
			       (unsigned long long)eb->start);
		}
		ret = -EIO;
		goto err;
	}
	if (eb->first_page != page) {
		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
		       eb->first_page->index, page->index);
		WARN_ON(1);
		ret = -EIO;
		goto err;
	}
	if (check_tree_block_fsid(root, eb)) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "btrfs bad fsid on block %llu\n",
			       (unsigned long long)eb->start);
		}
		ret = -EIO;
		goto err;
	}
	found_level = btrfs_header_level(eb);

	/* csum passed: safe to hand lockdep the level-based lock class */
	btrfs_set_buffer_lockdep_class(eb, found_level);

	ret = csum_tree_block(root, eb, 1);
	if (ret)
		ret = -EIO;

	/* NOTE(review): 'end' is recomputed here but never read afterwards -- looks like dead code */
	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
	end = eb->start + end - 1;
err:
	free_extent_buffer(eb);
out:
	return ret;
}

/*
 * bio completion callback; may run in interrupt context, so it only
 * captures the error and punts the real work to the endio workqueues.
 */
static void end_workqueue_bio(struct bio *bio, int err)
{
	struct end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;

	fs_info = end_io_wq->info;
	end_io_wq->error = err;
	end_io_wq->work.func = end_workqueue_fn;
	end_io_wq->work.flags = 0;

	/* route to the right worker pool: write/read x metadata/data */
	if (bio->bi_rw & REQ_WRITE) {
		if (end_io_wq->metadata)
			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
					   &end_io_wq->work);
		else
			btrfs_queue_worker(&fs_info->endio_write_workers,
					   &end_io_wq->work);
	} else {
		if (end_io_wq->metadata)
			btrfs_queue_worker(&fs_info->endio_meta_workers,
					   &end_io_wq->work);
		else
			btrfs_queue_worker(&fs_info->endio_workers,
					   &end_io_wq->work);
	}
}

/*
 * hook a bio so its completion is deferred to task context: the original
 * bi_end_io/bi_private are saved in an end_io_wq and restored later by
 * end_workqueue_fn. Returns 0 or -ENOMEM.
 */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			int metadata)
{
	struct end_io_wq *end_io_wq;
	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
	if (!end_io_wq)
		return -ENOMEM;

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
	end_io_wq->info = info;
	end_io_wq->error = 0;
	end_io_wq->bio = bio;
	end_io_wq->metadata = metadata;

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
	return 0;
}

/* cap on in-flight async bios, scaled by worker count and open devices */
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
	unsigned long limit = min_t(unsigned long,
				    info->workers.max_workers,
				    info->fs_devices->open_devices);
	return 256 * limit;
}

/* nonzero when async submission is backed up past the limit (iodone is unused) */
int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
{
	return atomic_read(&info->nr_async_bios) >
		btrfs_async_submit_limit(info);
}

/* ordered-work stage 1: run the checksumming half of an async submit */
static void run_one_async_start(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_submit_bio *async;

	async = container_of(work, struct async_submit_bio, work);
	fs_info = BTRFS_I(async->inode)->root->fs_info;
	async->submit_bio_start(async->inode, async->rw, async->bio,
				async->mirror_num, async->bio_flags,
				async->bio_offset);
}

/* ordered-work stage 2: account the completion, then actually submit */
static void run_one_async_done(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_submit_bio *async;
	int limit;

	async = container_of(work, struct async_submit_bio, work);
	fs_info = BTRFS_I(async->inode)->root->fs_info;

	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

	atomic_dec(&fs_info->nr_async_submits);

	/* wake throttled submitters once we're comfortably under the limit */
	if (atomic_read(&fs_info->nr_async_submits) < limit &&
	    waitqueue_active(&fs_info->async_submit_wait))
		wake_up(&fs_info->async_submit_wait);

	async->submit_bio_done(async->inode, async->rw, async->bio,
			       async->mirror_num, async->bio_flags,
			       async->bio_offset);
}

/* ordered-work stage 3: free the async_submit_bio */
static void run_one_async_free(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct async_submit_bio, work);
	kfree(async);
}

/*
 * queue a bio for async checksumming + submission on the worker threads.
 * submit_bio_start runs first (csum), submit_bio_done after ordered
 * completion (actual submit). Returns 0 or -ENOMEM.
 */
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
			unsigned long bio_flags,
			u64 bio_offset,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done)
{
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return -ENOMEM;

	async->inode = inode;
	async->rw = rw;
	async->bio = bio;
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;
	async->submit_bio_done = submit_bio_done;

	async->work.func = run_one_async_start;
	async->work.ordered_func = run_one_async_done;
	async->work.ordered_free = run_one_async_free;

	async->work.flags = 0;
	async->bio_flags = bio_flags;
	async->bio_offset = bio_offset;

	atomic_inc(&fs_info->nr_async_submits);

	if (rw & REQ_SYNC)
		btrfs_set_work_high_prio(&async->work);

	btrfs_queue_worker(&fs_info->workers, &async->work);

	/* if someone is draining the async queue, help them wait it out */
	while (atomic_read(&fs_info->async_submit_draining) &&
	      atomic_read(&fs_info->nr_async_submits)) {
		wait_event(fs_info->async_submit_wait,
			   (atomic_read(&fs_info->nr_async_submits) == 0));
	}

	return 0;
}

/* checksum every tree-block page carried by this bio */
static int btree_csum_one_bio(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int bio_index = 0;
	struct btrfs_root *root;

	WARN_ON(bio->bi_vcnt <= 0);
	while (bio_index < bio->bi_vcnt) {
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
		csum_dirty_buffer(root, bvec->bv_page);
		bio_index++;
		bvec++;
	}
	return 0;
}

/* async stage-1 hook for btree writes: just checksum, submission comes later */
static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context. Just jump into btrfs_map_bio
	 */
	btree_csum_one_bio(bio);
	return 0;
}

/* async stage-2 hook for btree writes: map and submit the bio */
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				   int mirror_num, unsigned long bio_flags,
				   u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context. Just jump into btrfs_map_bio
	 */
	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
}

/*
 * submission hook for btree-inode bios: reads go straight down with
 * deferred-endio csum verification; writes are checksummed asynchronously.
 */
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	int ret;

	ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
				  bio, 1);
	BUG_ON(ret);

	if (!(rw & REQ_WRITE)) {
		/*
		 * called for a read, do the setup so that checksum validation
		 * can happen in the async kernel threads
		 */
		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
				     mirror_num, 0);
	}

	/*
	 * kthread helpers are used to submit writes so that checksumming
	 * can happen in parallel across all CPUs
	 */
	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num, 0,
				   bio_offset,
				   __btree_submit_bio_start,
				   __btree_submit_bio_done);
}

/*
 * writepage for the btree inode. Under memory pressure (PF_MEMALLOC) we
 * refuse to write and redirty instead, accounting the page as dirty
 * metadata, because writing here could deadlock the allocator.
 */
static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
	struct extent_io_tree *tree;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct extent_buffer *eb;
	int was_dirty;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (!(current->flags & PF_MEMALLOC)) {
		return extent_write_full_page(tree, page,
					      btree_get_extent, wbc);
	}

	redirty_page_for_writepage(wbc, page);
	eb = btrfs_find_tree_block(root, page_offset(page),
				   PAGE_CACHE_SIZE);
	WARN_ON(!eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
	if (!was_dirty) {
		spin_lock(&root->fs_info->delalloc_lock);
		root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	free_extent_buffer(eb);

	unlock_page(page);
	return 0;
}

/* writepages for the btree inode; skips small background flushes */
static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(mapping->host)->io_tree;
	if (wbc->sync_mode == WB_SYNC_NONE) {
		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
		u64 num_dirty;
		unsigned long thresh = 32 * 1024 * 1024;

		if (wbc->for_kupdate)
			return 0;

		/* this is a bit racy, but that's ok */
		num_dirty = root->fs_info->dirty_metadata_bytes;
		if (num_dirty < thresh)
			return 0;
	}
	return extent_writepages(tree, mapping, btree_get_extent, wbc);
}

/* readpage for the btree inode */
static int btree_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btree_get_extent);
}

/*
 * try to release a btree page: only succeeds when the page is clean and
 * its extent state / extent buffer can be dropped.
 */
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct extent_io_tree *tree;
	struct extent_map_tree *map;
	int ret;

	if (PageWriteback(page) || PageDirty(page))
		return 0;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	map = &BTRFS_I(page->mapping->host)->extent_tree;

	ret = try_release_extent_state(map, tree, page, gfp_flags);
	if (!ret)
		return 0;

	ret = try_release_extent_buffer(tree, page);
	if (ret == 1) {
		/* drop the page's reference held via page->private */
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}

	return ret;
}

/* invalidatepage for the btree inode; warns if private state survives */
static void btree_invalidatepage(struct page *page, unsigned long offset)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	extent_invalidatepage(tree, page, offset);
	btree_releasepage(page, GFP_NOFS);
	if (PagePrivate(page)) {
		printk(KERN_WARNING "btrfs warning page private not zero "
		       "on page %llu\n", (unsigned long long)page_offset(page));
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
}

static const struct address_space_operations btree_aops = {
	.readpage	= btree_readpage,
	.writepage	= btree_writepage,
	.writepages	= btree_writepages,
	.releasepage	= btree_releasepage,
	.invalidatepage	= btree_invalidatepage,
	.sync_page	= block_sync_page,
};

/*
 * fire-and-forget readahead of a tree block; errors are ignored
 * (always returns 0).
 */
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
			 u64 parent_transid)
{
	struct extent_buffer *buf = NULL;
	struct inode *btree_inode = root->fs_info->btree_inode;
	int ret = 0;

	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return 0;
	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
				 buf, 0, 0, btree_get_extent, 0);
	free_extent_buffer(buf);
	return ret;
}

/* look up an already-cached tree block; NULL if not in the cache */
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
					    u64 bytenr, u32 blocksize)
{
	struct inode *btree_inode = root->fs_info->btree_inode;
	struct extent_buffer *eb;
	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
				bytenr, blocksize, GFP_NOFS);
	return eb;
}

/* find or allocate the extent buffer for a tree block (no IO) */
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
						   u64 bytenr, u32 blocksize)
{
	struct inode *btree_inode = root->fs_info->btree_inode;
	struct extent_buffer *eb;

	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
				 bytenr, blocksize, NULL, GFP_NOFS);
	return eb;
}


/* start writeback of a single tree block's byte range */
int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
					buf->start + buf->len - 1);
}

/* wait for writeback of a single tree block's byte range */
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	return filemap_fdatawait_range(buf->first_page->mapping,
				       buf->start, buf->start + buf->len - 1);
}

/*
 * read a tree block and verify it against parent_transid. Returns the
 * buffer (uptodate bit set only on success) or NULL on allocation failure.
 */
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
				      u32 blocksize, u64 parent_transid)
{
	struct extent_buffer *buf = NULL;
	struct inode *btree_inode = root->fs_info->btree_inode;
	struct extent_io_tree *io_tree;
	int ret;

	io_tree = &BTRFS_I(btree_inode)->io_tree;

	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return NULL;
	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);

	/* only mark uptodate when the read + transid check both passed */
	if (ret == 0)
		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
	return buf;

}

/*
 * clear the dirty bit on a tree block belonging to the running
 * transaction and fix up the dirty_metadata_bytes accounting.
 */
int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		     struct extent_buffer *buf)
{
	struct inode *btree_inode = root->fs_info->btree_inode;
	if (btrfs_header_generation(buf) ==
	    root->fs_info->running_transaction->transid) {
		btrfs_assert_tree_locked(buf);

		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
			spin_lock(&root->fs_info->delalloc_lock);
			if (root->fs_info->dirty_metadata_bytes >= buf->len)
				root->fs_info->dirty_metadata_bytes -= buf->len;
			else
				WARN_ON(1);	/* accounting underflow */
			spin_unlock(&root->fs_info->delalloc_lock);
		}

		/* ugh, clear_extent_buffer_dirty needs to lock the page */
		btrfs_set_lock_blocking(buf);
		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
					  buf);
	}
	return 0;
}

/*
 * initialize every field of a freshly-allocated btrfs_root.
 * Always returns 0.
 */
static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
			u32 stripesize, struct btrfs_root *root,
			struct btrfs_fs_info *fs_info,
			u64 objectid)
{
	root->node = NULL;
	root->commit_root = NULL;
	root->sectorsize = sectorsize;
	root->nodesize = nodesize;
	root->leafsize = leafsize;
	root->stripesize = stripesize;
	root->ref_cows = 0;
	root->track_dirty = 0;
	root->in_radix = 0;
	root->orphan_item_inserted = 0;
	root->orphan_cleanup_state = 0;

	root->fs_info = fs_info;
	root->objectid = objectid;
	root->last_trans = 0;
	root->highest_objectid = 0;
	root->name = NULL;
	root->in_sysfs = 0;
	root->inode_tree = RB_ROOT;
	root->block_rsv = NULL;
	root->orphan_block_rsv = NULL;

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->orphan_list);
	INIT_LIST_HEAD(&root->root_list);
	spin_lock_init(&root->node_lock);
	spin_lock_init(&root->orphan_lock);
	spin_lock_init(&root->inode_lock);
	spin_lock_init(&root->accounting_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
	root->log_batch = 0;
	root->log_transid = 0;
	root->last_log_commit = 0;
	extent_io_tree_init(&root->dirty_log_pages,
			    fs_info->btree_inode->i_mapping, GFP_NOFS);

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
	root->defrag_trans_start = fs_info->generation;
	init_completion(&root->kobj_unregister);
	root->defrag_running = 0;
	root->root_key.objectid = objectid;
	/* per-root anonymous super, used for the subvolume's own device number */
	root->anon_super.s_root = NULL;
	root->anon_super.s_dev = 0;
	INIT_LIST_HEAD(&root->anon_super.s_list);
	INIT_LIST_HEAD(&root->anon_super.s_instances);
	init_rwsem(&root->anon_super.s_umount);

	return 0;
}

/*
 * set up a root whose location is recorded in the root tree by objectid
 * alone (latest version). Returns 0 or -ENOENT; BUGs on read errors.
 */
static int find_and_setup_root(struct btrfs_root *tree_root,
			       struct btrfs_fs_info *fs_info,
			       u64 objectid,
			       struct btrfs_root *root)
{
	int ret;
	u32 blocksize;
	u64 generation;

	__setup_root(tree_root->nodesize, tree_root->leafsize,
		     tree_root->sectorsize, tree_root->stripesize,
		     root, fs_info, objectid);
	ret = btrfs_find_last_root(tree_root, objectid,
				   &root->root_item, &root->root_key);
	if (ret > 0)
		return -ENOENT;
	BUG_ON(ret);

	generation = btrfs_root_generation(&root->root_item);
	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
				     blocksize, generation);
	/* NOTE(review): read_tree_block failure is fatal here (no error return path) */
	BUG_ON(!root->node);
	root->commit_root =
btrfs_root_node(root); 985 return 0; 986} 987 988static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 989 struct btrfs_fs_info *fs_info) 990{ 991 struct btrfs_root *root; 992 struct btrfs_root *tree_root = fs_info->tree_root; 993 struct extent_buffer *leaf; 994 995 root = kzalloc(sizeof(*root), GFP_NOFS); 996 if (!root) 997 return ERR_PTR(-ENOMEM); 998 999 __setup_root(tree_root->nodesize, tree_root->leafsize, 1000 tree_root->sectorsize, tree_root->stripesize, 1001 root, fs_info, BTRFS_TREE_LOG_OBJECTID); 1002 1003 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1004 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1005 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 1006 /* 1007 * log trees do not get reference counted because they go away 1008 * before a real commit is actually done. They do store pointers 1009 * to file data extents, and those reference counts still get 1010 * updated (along with back refs to the log tree). 1011 */ 1012 root->ref_cows = 0; 1013 1014 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1015 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); 1016 if (IS_ERR(leaf)) { 1017 kfree(root); 1018 return ERR_CAST(leaf); 1019 } 1020 1021 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); 1022 btrfs_set_header_bytenr(leaf, leaf->start); 1023 btrfs_set_header_generation(leaf, trans->transid); 1024 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); 1025 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); 1026 root->node = leaf; 1027 1028 write_extent_buffer(root->node, root->fs_info->fsid, 1029 (unsigned long)btrfs_header_fsid(root->node), 1030 BTRFS_FSID_SIZE); 1031 btrfs_mark_buffer_dirty(root->node); 1032 btrfs_tree_unlock(root->node); 1033 return root; 1034} 1035 1036int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 1037 struct btrfs_fs_info *fs_info) 1038{ 1039 struct btrfs_root *log_root; 1040 1041 log_root = alloc_log_tree(trans, fs_info); 1042 if (IS_ERR(log_root)) 1043 return 
PTR_ERR(log_root); 1044 WARN_ON(fs_info->log_root_tree); 1045 fs_info->log_root_tree = log_root; 1046 return 0; 1047} 1048 1049int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 1050 struct btrfs_root *root) 1051{ 1052 struct btrfs_root *log_root; 1053 struct btrfs_inode_item *inode_item; 1054 1055 log_root = alloc_log_tree(trans, root->fs_info); 1056 if (IS_ERR(log_root)) 1057 return PTR_ERR(log_root); 1058 1059 log_root->last_trans = trans->transid; 1060 log_root->root_key.offset = root->root_key.objectid; 1061 1062 inode_item = &log_root->root_item.inode; 1063 inode_item->generation = cpu_to_le64(1); 1064 inode_item->size = cpu_to_le64(3); 1065 inode_item->nlink = cpu_to_le32(1); 1066 inode_item->nbytes = cpu_to_le64(root->leafsize); 1067 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 1068 1069 btrfs_set_root_node(&log_root->root_item, log_root->node); 1070 1071 WARN_ON(root->log_root); 1072 root->log_root = log_root; 1073 root->log_transid = 0; 1074 root->last_log_commit = 0; 1075 return 0; 1076} 1077 1078struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 1079 struct btrfs_key *location) 1080{ 1081 struct btrfs_root *root; 1082 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1083 struct btrfs_path *path; 1084 struct extent_buffer *l; 1085 u64 generation; 1086 u32 blocksize; 1087 int ret = 0; 1088 1089 root = kzalloc(sizeof(*root), GFP_NOFS); 1090 if (!root) 1091 return ERR_PTR(-ENOMEM); 1092 if (location->offset == (u64)-1) { 1093 ret = find_and_setup_root(tree_root, fs_info, 1094 location->objectid, root); 1095 if (ret) { 1096 kfree(root); 1097 return ERR_PTR(ret); 1098 } 1099 goto out; 1100 } 1101 1102 __setup_root(tree_root->nodesize, tree_root->leafsize, 1103 tree_root->sectorsize, tree_root->stripesize, 1104 root, fs_info, location->objectid); 1105 1106 path = btrfs_alloc_path(); 1107 BUG_ON(!path); 1108 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1109 if (ret == 0) { 1110 l = path->nodes[0]; 1111 
read_extent_buffer(l, &root->root_item,
		   btrfs_item_ptr_offset(l, path->slots[0]),
		   sizeof(root->root_item));
		memcpy(&root->root_key, location, sizeof(*location));
	}
	btrfs_free_path(path);
	if (ret) {
		/* ret > 0 from btrfs_search_slot means the key was not found */
		if (ret > 0)
			ret = -ENOENT;
		return ERR_PTR(ret);
	}

	generation = btrfs_root_generation(&root->root_item);
	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
				     blocksize, generation);
	root->commit_root = btrfs_root_node(root);
	BUG_ON(!root->node);
out:
	/* log trees are the only roots that are not reference counted */
	if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
		root->ref_cows = 1;

	return root;
}

/*
 * Fast lookup of an already-cached root by objectid.  The tree root
 * and extent root live directly in fs_info; everything else is looked
 * up in the fs_roots radix tree.  Returns NULL when the root is not
 * cached.  NOTE(review): no locking here — presumably callers hold
 * whatever excludes concurrent radix tree changes; verify at call sites.
 */
struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
					u64 root_objectid)
{
	struct btrfs_root *root;

	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
		return fs_info->tree_root;
	if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
		return fs_info->extent_root;

	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)root_objectid);
	return root;
}

/*
 * Main entry point for finding a root by key.  The well-known trees
 * are returned straight from fs_info; subvolume roots are looked up
 * in the radix tree and, on a miss, read from disk and inserted.
 * Returns a root or an ERR_PTR.
 */
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
					      struct btrfs_key *location)
{
	struct btrfs_root *root;
	int ret;

	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
		return fs_info->tree_root;
	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
		return fs_info->extent_root;
	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
		return fs_info->chunk_root;
	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
		return fs_info->dev_root;
	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
		return fs_info->csum_root;
again:
	spin_lock(&fs_info->fs_roots_radix_lock);
	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)location->objectid);
spin_unlock(&fs_info->fs_roots_radix_lock);
	if (root)
		return root;

	/* cache miss: read the root from disk and try to insert it */
	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
	if (IS_ERR(root))
		return root;

	set_anon_super(&root->anon_super, NULL);

	/* a zero refcount means the subvolume is in the middle of deletion */
	if (btrfs_root_refs(&root->root_item) == 0) {
		ret = -ENOENT;
		goto fail;
	}

	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
	if (ret < 0)
		goto fail;
	if (ret == 0)
		root->orphan_item_inserted = 1;

	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto fail;

	spin_lock(&fs_info->fs_roots_radix_lock);
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
	if (ret == 0)
		root->in_radix = 1;

	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();
	if (ret) {
		/* lost the race with another reader: drop ours, use theirs */
		if (ret == -EEXIST) {
			free_fs_root(root);
			goto again;
		}
		goto fail;
	}

	ret = btrfs_find_dead_roots(fs_info->tree_root,
				    root->root_key.objectid);
	WARN_ON(ret);
	return root;
fail:
	free_fs_root(root);
	return ERR_PTR(ret);
}

/*
 * Compatibility wrapper: the name/namelen arguments are unused and the
 * call is forwarded to btrfs_read_fs_root_no_name.
 */
struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
				      struct btrfs_key *location,
				      const char *name, int namelen)
{
	return btrfs_read_fs_root_no_name(fs_info, location);
}

/*
 * backing_dev_info congested_fn: the filesystem counts as congested as
 * soon as any one of its devices is congested.
 */
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{
	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
	int ret = 0;
	struct btrfs_device *device;
	struct backing_dev_info *bdi;

	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;
		bdi = blk_get_backing_dev_info(device->bdev);
		if (bdi && bdi_congested(bdi, bdi_bits)) {
			ret = 1;
			break;
		}
	}
	return ret;
}

/*
 * this unplugs every device on the
box, and it is only used when page
 * is null
 */
static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
	struct btrfs_device *device;
	struct btrfs_fs_info *info;

	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		/* forward the unplug to each member device's own bdi */
		bdi = blk_get_backing_dev_info(device->bdev);
		if (bdi->unplug_io_fn)
			bdi->unplug_io_fn(bdi, page);
	}
}

/*
 * Per-page unplug: would map the page to the one device that holds it
 * and unplug only that device.  NOTE(review): the "if (1 || !page)"
 * guard makes this function always take the unplug-everything path;
 * the per-page code below it is intentionally dead at the moment.
 */
static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
	struct inode *inode;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct address_space *mapping;
	u64 offset;

	/* the generic O_DIRECT read code does this */
	if (1 || !page) {
		__unplug_io_fn(bdi, page);
		return;
	}

	/*
	 * page->mapping may change at any time. Get a consistent copy
	 * and use that for everything below
	 */
	smp_mb();
	mapping = page->mapping;
	if (!mapping)
		return;

	inode = mapping->host;

	/*
	 * don't do the expensive searching for a small number of
	 * devices
	 */
	if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
		__unplug_io_fn(bdi, page);
		return;
	}

	offset = page_offset(page);

	em_tree = &BTRFS_I(inode)->extent_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
	read_unlock(&em_tree->lock);
	if (!em) {
		__unplug_io_fn(bdi, page);
		return;
	}

	/* holes and inline extents have no device to unplug */
	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
		free_extent_map(em);
		__unplug_io_fn(bdi, page);
		return;
	}
	offset = offset - em->start;
	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
			  em->block_start + offset, page);
	free_extent_map(em);
}

/*
 * If this fails, caller must call
bdi_destroy() to get rid of the
 * bdi again.
 */
static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
	int err;

	bdi->capabilities = BDI_CAP_MAP_COPY;
	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
	if (err)
		return err;

	/* wire our congestion/unplug callbacks into the bdi */
	bdi->ra_pages = default_backing_dev_info.ra_pages;
	bdi->unplug_io_fn = btrfs_unplug_io_fn;
	bdi->unplug_io_data = info;
	bdi->congested_fn = btrfs_congested_fn;
	bdi->congested_data = info;
	return 0;
}

/*
 * Decide whether a completed metadata read bio covers (or completes)
 * an entire tree block so its checksum can be verified.  Pages tagged
 * EXTENT_PAGE_PRIVATE or with no private data just contribute length;
 * the last "real" page gives us the buffer length (stored shifted in
 * page->private) and start offset.
 *
 * Returns 1 when the whole tree block is in ram and up to date,
 * otherwise the result of extent_range_uptodate for the remainder.
 */
static int bio_ready_for_csum(struct bio *bio)
{
	u64 length = 0;
	u64 buf_len = 0;
	u64 start = 0;
	struct page *page;
	struct extent_io_tree *io_tree = NULL;
	struct btrfs_fs_info *info = NULL;
	struct bio_vec *bvec;
	int i;
	int ret;

	bio_for_each_segment(bvec, bio, i) {
		page = bvec->bv_page;
		if (page->private == EXTENT_PAGE_PRIVATE) {
			length += bvec->bv_len;
			continue;
		}
		if (!page->private) {
			length += bvec->bv_len;
			continue;
		}
		length = bvec->bv_len;
		/* page->private stores the tree block length shifted left by 2 */
		buf_len = page->private >> 2;
		start = page_offset(page) + bvec->bv_offset;
		io_tree = &BTRFS_I(page->mapping->host)->io_tree;
		info = BTRFS_I(page->mapping->host)->root->fs_info;
	}
	/* are we fully contained in this bio? */
	if (buf_len <= length)
		return 1;

	ret = extent_range_uptodate(io_tree, start + length,
				    start + buf_len - 1);
	return ret;
}

/*
 * called by the kthread helper functions to finally call the bio end_io
 * functions.
This is where read checksum verification actually happens
 */
static void end_workqueue_fn(struct btrfs_work *work)
{
	struct bio *bio;
	struct end_io_wq *end_io_wq;
	struct btrfs_fs_info *fs_info;
	int error;

	end_io_wq = container_of(work, struct end_io_wq, work);
	bio = end_io_wq->bio;
	fs_info = end_io_wq->info;

	/* metadata bio reads are special because the whole tree block must
	 * be checksummed at once. This makes sure the entire block is in
	 * ram and up to date before trying to verify things. For
	 * blocksize <= pagesize, it is basically a noop
	 */
	if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
	    !bio_ready_for_csum(bio)) {
		/* not all of the block has landed yet: requeue and retry later */
		btrfs_queue_worker(&fs_info->endio_meta_workers,
				   &end_io_wq->work);
		return;
	}
	/* restore the original end_io/private and complete the bio */
	error = end_io_wq->error;
	bio->bi_private = end_io_wq->private;
	bio->bi_end_io = end_io_wq->end_io;
	kfree(end_io_wq);
	bio_endio(bio, error);
}

/*
 * Background thread that drops delayed iputs and deletes old snapshots.
 * Runs only when the fs is writable and the cleaner_mutex can be taken
 * without blocking; otherwise it just sleeps until woken.
 */
static int cleaner_kthread(void *arg)
{
	struct btrfs_root *root = arg;

	do {
		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);

		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
		    mutex_trylock(&root->fs_info->cleaner_mutex)) {
			btrfs_run_delayed_iputs(root);
			btrfs_clean_old_snapshots(root);
			mutex_unlock(&root->fs_info->cleaner_mutex);
		}

		if (freezing(current)) {
			refrigerator();
		} else {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop())
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	} while (!kthread_should_stop());
	return 0;
}

/*
 * Background thread that periodically commits the running transaction.
 * A transaction is committed when it is blocked or roughly 30 seconds
 * old; otherwise the thread re-checks after a shorter delay.
 */
static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
	u64 transid;
	unsigned long now;
	unsigned long delay;
	int ret;

	do {
		delay = HZ * 30;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
		mutex_lock(&root->fs_info->transaction_kthread_mutex);

		spin_lock(&root->fs_info->new_trans_lock);
		cur = root->fs_info->running_transaction;
		if (!cur) {
			spin_unlock(&root->fs_info->new_trans_lock);
			goto sleep;
		}

		now = get_seconds();
		/* young, unblocked transactions get another 5 seconds */
		if (!cur->blocked &&
		    (now < cur->start_time || now - cur->start_time < 30)) {
			spin_unlock(&root->fs_info->new_trans_lock);
			delay = HZ * 5;
			goto sleep;
		}
		transid = cur->transid;
		spin_unlock(&root->fs_info->new_trans_lock);

		trans = btrfs_join_transaction(root, 1);
		if (transid == trans->transid) {
			ret = btrfs_commit_transaction(trans, root);
			BUG_ON(ret);
		} else {
			/* someone else already committed it; just drop our handle */
			btrfs_end_transaction(trans, root);
		}
sleep:
		wake_up_process(root->fs_info->cleaner_kthread);
		mutex_unlock(&root->fs_info->transaction_kthread_mutex);

		if (freezing(current)) {
			refrigerator();
		} else {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop() &&
			    !btrfs_transaction_blocked(root->fs_info))
				schedule_timeout(delay);
			__set_current_state(TASK_RUNNING);
		}
	} while (!kthread_should_stop());
	return 0;
}

/*
 * Mount-time entry point: build the whole in-memory filesystem state.
 *
 * Roughly in order: allocate fs_info and the well-known roots, set up
 * the bdi and btree inode, initialize every lock/list/waitqueue, read
 * and validate the super block, start the worker thread pools, read
 * the chunk tree (so logical->physical mapping works), read the tree
 * root and the extent/dev/csum roots, load block groups, start the
 * cleaner and transaction kthreads, replay the tree log if one exists,
 * recover orphans/relocation, and finally look up the default
 * subvolume root.
 *
 * Returns the tree root on success or ERR_PTR(err) on failure; errors
 * unwind through the fail_* label chain at the bottom, which tears
 * down exactly the state built so far.
 */
struct btrfs_root *open_ctree(struct super_block *sb,
			      struct btrfs_fs_devices *fs_devices,
			      char *options)
{
	u32 sectorsize;
	u32 nodesize;
	u32 leafsize;
	u32 blocksize;
	u32 stripesize;
	u64 generation;
	u64 features;
	struct btrfs_key location;
	struct buffer_head *bh;
	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
						 GFP_NOFS);
	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
					       GFP_NOFS);
	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
					       GFP_NOFS);
	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
						GFP_NOFS);
	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
						GFP_NOFS);
	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
					      GFP_NOFS);
	struct btrfs_root *log_tree_root;

	int ret;
	int err = -EINVAL;

	struct btrfs_super_block *disk_super;

	if (!extent_root || !tree_root || !fs_info ||
	    !chunk_root || !dev_root || !csum_root) {
		err = -ENOMEM;
		goto fail;
	}

	ret = init_srcu_struct(&fs_info->subvol_srcu);
	if (ret) {
		err = ret;
		goto fail;
	}

	ret = setup_bdi(fs_info, &fs_info->bdi);
	if (ret) {
		err = ret;
		goto fail_srcu;
	}

	/* the btree inode backs the page cache for all metadata */
	fs_info->btree_inode = new_inode(sb);
	if (!fs_info->btree_inode) {
		err = -ENOMEM;
		goto fail_bdi;
	}

	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
	INIT_LIST_HEAD(&fs_info->trans_list);
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->delayed_iputs);
	INIT_LIST_HEAD(&fs_info->hashers);
	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
	INIT_LIST_HEAD(&fs_info->ordered_operations);
	INIT_LIST_HEAD(&fs_info->caching_block_groups);
	spin_lock_init(&fs_info->delalloc_lock);
	spin_lock_init(&fs_info->new_trans_lock);
	spin_lock_init(&fs_info->ref_cache_lock);
	spin_lock_init(&fs_info->fs_roots_radix_lock);
	spin_lock_init(&fs_info->delayed_iput_lock);

	init_completion(&fs_info->kobj_unregister);
	fs_info->tree_root = tree_root;
	fs_info->extent_root = extent_root;
	fs_info->csum_root = csum_root;
	fs_info->chunk_root = chunk_root;
	fs_info->dev_root = dev_root;
	fs_info->fs_devices = fs_devices;
	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
	INIT_LIST_HEAD(&fs_info->space_info);
	btrfs_mapping_init(&fs_info->mapping_tree);
	btrfs_init_block_rsv(&fs_info->global_block_rsv);
	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
	mutex_init(&fs_info->durable_block_rsv_mutex);
	atomic_set(&fs_info->nr_async_submits, 0);
	atomic_set(&fs_info->async_delalloc_pages, 0);
	atomic_set(&fs_info->async_submit_draining, 0);
	atomic_set(&fs_info->nr_async_bios, 0);
	fs_info->sb = sb;
	fs_info->max_inline = 8192 * 1024;
	fs_info->metadata_ratio = 0;

	fs_info->thread_pool_size = min_t(unsigned long,
					  num_online_cpus() + 2, 8);

	INIT_LIST_HEAD(&fs_info->ordered_extents);
	spin_lock_init(&fs_info->ordered_extent_lock);

	/* provisional blocksize; replaced by the real sectorsize below */
	sb->s_blocksize = 4096;
	sb->s_blocksize_bits = blksize_bits(4096);
	sb->s_bdi = &fs_info->bdi;

	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
	fs_info->btree_inode->i_nlink = 1;
	/*
	 * we set the i_size on the btree inode to the max possible int.
	 * the real end of the address space is determined by all of
	 * the devices in the system
	 */
	fs_info->btree_inode->i_size = OFFSET_MAX;
	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;

	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
			    fs_info->btree_inode->i_mapping,
			    GFP_NOFS);
	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
			     GFP_NOFS);

	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;

	BTRFS_I(fs_info->btree_inode)->root = tree_root;
	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
	       sizeof(struct btrfs_key));
	BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
	insert_inode_hash(fs_info->btree_inode);

	spin_lock_init(&fs_info->block_group_cache_lock);
	fs_info->block_group_cache_tree = RB_ROOT;

	extent_io_tree_init(&fs_info->freed_extents[0],
			    fs_info->btree_inode->i_mapping, GFP_NOFS);
	extent_io_tree_init(&fs_info->freed_extents[1],
			    fs_info->btree_inode->i_mapping, GFP_NOFS);
	fs_info->pinned_extents = &fs_info->freed_extents[0];
	fs_info->do_barriers = 1;


	mutex_init(&fs_info->trans_mutex);
	mutex_init(&fs_info->ordered_operations_mutex);
	mutex_init(&fs_info->tree_log_mutex);
	mutex_init(&fs_info->chunk_mutex);
	mutex_init(&fs_info->transaction_kthread_mutex);
	mutex_init(&fs_info->cleaner_mutex);
	mutex_init(&fs_info->volume_mutex);
	init_rwsem(&fs_info->extent_commit_sem);
	init_rwsem(&fs_info->cleanup_work_sem);
	init_rwsem(&fs_info->subvol_sem);

	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

	init_waitqueue_head(&fs_info->transaction_throttle);
	init_waitqueue_head(&fs_info->transaction_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);

	/* placeholder geometry until the super block tells us the real one */
	__setup_root(4096, 4096, 4096, 4096, tree_root,
		     fs_info, BTRFS_ROOT_TREE_OBJECTID);


	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
	if (!bh)
		goto fail_iput;

	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
	       sizeof(fs_info->super_for_commit));
	brelse(bh);

	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);

	disk_super = &fs_info->super_copy;
	/* a zero root bytenr means no valid filesystem here */
	if (!btrfs_super_root(disk_super))
		goto fail_iput;

	ret = btrfs_parse_options(tree_root, options);
	if (ret) {
		err = ret;
		goto fail_iput;
	}

	/* refuse to mount if the fs uses incompat features we don't know */
	features = btrfs_super_incompat_flags(disk_super) &
		~BTRFS_FEATURE_INCOMPAT_SUPP;
	if (features) {
		printk(KERN_ERR "BTRFS: couldn't mount because of "
		       "unsupported optional features (%Lx).\n",
		       (unsigned long long)features);
		err = -EINVAL;
		goto fail_iput;
	}

	features = btrfs_super_incompat_flags(disk_super);
	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
		btrfs_set_super_incompat_flags(disk_super, features);
	}

	/* compat_ro features only block read-write mounts */
	features = btrfs_super_compat_ro_flags(disk_super) &
		~BTRFS_FEATURE_COMPAT_RO_SUPP;
	if (!(sb->s_flags & MS_RDONLY) && features) {
		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
		       "unsupported option features (%Lx).\n",
		       (unsigned long long)features);
		err = -EINVAL;
		goto fail_iput;
	}

	btrfs_init_workers(&fs_info->generic_worker,
			   "genwork", 1, NULL);

	btrfs_init_workers(&fs_info->workers, "worker",
			   fs_info->thread_pool_size,
			   &fs_info->generic_worker);

	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
			   fs_info->thread_pool_size,
			   &fs_info->generic_worker);

	btrfs_init_workers(&fs_info->submit_workers, "submit",
			   min_t(u64, fs_devices->num_devices,
			   fs_info->thread_pool_size),
			   &fs_info->generic_worker);

	/* a higher idle thresh on the submit workers makes it much more
	 * likely that bios will be send down in a sane order to the
	 * devices
	 */
	fs_info->submit_workers.idle_thresh = 64;

	fs_info->workers.idle_thresh = 16;
	fs_info->workers.ordered = 1;

	fs_info->delalloc_workers.idle_thresh = 2;
	fs_info->delalloc_workers.ordered = 1;

	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
			   &fs_info->generic_worker);
	btrfs_init_workers(&fs_info->endio_workers, "endio",
			   fs_info->thread_pool_size,
			   &fs_info->generic_worker);
	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
			   fs_info->thread_pool_size,
			   &fs_info->generic_worker);
	btrfs_init_workers(&fs_info->endio_meta_write_workers,
			   "endio-meta-write", fs_info->thread_pool_size,
			   &fs_info->generic_worker);
	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
			   fs_info->thread_pool_size,
			   &fs_info->generic_worker);

	/*
	 * endios are largely parallel and should have a very
	 * low idle thresh
	 */
	fs_info->endio_workers.idle_thresh = 4;
	fs_info->endio_meta_workers.idle_thresh = 4;

	fs_info->endio_write_workers.idle_thresh = 2;
	fs_info->endio_meta_write_workers.idle_thresh = 2;

	btrfs_start_workers(&fs_info->workers, 1);
	btrfs_start_workers(&fs_info->generic_worker, 1);
	btrfs_start_workers(&fs_info->submit_workers, 1);
	btrfs_start_workers(&fs_info->delalloc_workers, 1);
	btrfs_start_workers(&fs_info->fixup_workers, 1);
	btrfs_start_workers(&fs_info->endio_workers, 1);
	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
	btrfs_start_workers(&fs_info->endio_write_workers, 1);

	/* scale readahead with device count, with a 4MB floor */
	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);

	nodesize = btrfs_super_nodesize(disk_super);
	leafsize = btrfs_super_leafsize(disk_super);
	sectorsize = btrfs_super_sectorsize(disk_super);
	stripesize = btrfs_super_stripesize(disk_super);
	tree_root->nodesize = nodesize;
	tree_root->leafsize = leafsize;
	tree_root->sectorsize = sectorsize;
	tree_root->stripesize = stripesize;

	sb->s_blocksize = sectorsize;
	sb->s_blocksize_bits = blksize_bits(sectorsize);

	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
		    sizeof(disk_super->magic))) {
		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
		goto fail_sb_buffer;
	}

	/* the sys array bootstraps enough chunk mapping to read the chunk tree */
	mutex_lock(&fs_info->chunk_mutex);
	ret = btrfs_read_sys_array(tree_root);
	mutex_unlock(&fs_info->chunk_mutex);
	if (ret) {
		printk(KERN_WARNING "btrfs: failed to read the system "
		       "array on %s\n", sb->s_id);
		goto fail_sb_buffer;
	}

	blocksize = btrfs_level_size(tree_root,
				     btrfs_super_chunk_root_level(disk_super));
	generation = btrfs_super_chunk_root_generation(disk_super);

	__setup_root(nodesize, leafsize, sectorsize, stripesize,
		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);

	chunk_root->node = read_tree_block(chunk_root,
					   btrfs_super_chunk_root(disk_super),
					   blocksize, generation);
	BUG_ON(!chunk_root->node);
	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
		       sb->s_id);
		goto fail_chunk_root;
	}
	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
	chunk_root->commit_root = btrfs_root_node(chunk_root);

	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
	   BTRFS_UUID_SIZE);

	mutex_lock(&fs_info->chunk_mutex);
	ret = btrfs_read_chunk_tree(chunk_root);
	mutex_unlock(&fs_info->chunk_mutex);
	if (ret) {
		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
		       sb->s_id);
		goto fail_chunk_root;
	}

	btrfs_close_extra_devices(fs_devices);

	blocksize = btrfs_level_size(tree_root,
				     btrfs_super_root_level(disk_super));
	generation = btrfs_super_generation(disk_super);

	tree_root->node = read_tree_block(tree_root,
					  btrfs_super_root(disk_super),
					  blocksize, generation);
	if (!tree_root->node)
		goto fail_chunk_root;
	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
		       sb->s_id);
		goto fail_tree_root;
	}
	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
	tree_root->commit_root = btrfs_root_node(tree_root);

	ret = find_and_setup_root(tree_root, fs_info,
				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
	if (ret)
		goto fail_tree_root;
	extent_root->track_dirty = 1;

	ret = find_and_setup_root(tree_root, fs_info,
				  BTRFS_DEV_TREE_OBJECTID, dev_root);
	if (ret)
		goto fail_extent_root;
	dev_root->track_dirty = 1;

	ret = find_and_setup_root(tree_root, fs_info,
				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
	if (ret)
		goto fail_dev_root;

	csum_root->track_dirty = 1;

	fs_info->generation = generation;
	fs_info->last_trans_committed = generation;
	fs_info->data_alloc_profile = (u64)-1;
	fs_info->metadata_alloc_profile = (u64)-1;
	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;

	ret = btrfs_read_block_groups(extent_root);
	if (ret) {
		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
		goto fail_block_groups;
	}

	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
					       "btrfs-cleaner");
	if (IS_ERR(fs_info->cleaner_kthread))
		goto fail_block_groups;

	fs_info->transaction_kthread = kthread_run(transaction_kthread,
						   tree_root,
						   "btrfs-transaction");
	if (IS_ERR(fs_info->transaction_kthread))
		goto fail_cleaner;

	/* auto-enable ssd mode unless the user asked for nossd */
	if (!btrfs_test_opt(tree_root, SSD) &&
	    !btrfs_test_opt(tree_root, NOSSD) &&
	    !fs_info->fs_devices->rotating) {
		printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
		       "mode\n");
		btrfs_set_opt(fs_info->mount_opt, SSD);
	}

	/* a non-zero log root means the fs was not cleanly unmounted: replay */
	if (btrfs_super_log_root(disk_super) != 0) {
		u64 bytenr = btrfs_super_log_root(disk_super);

		if (fs_devices->rw_devices == 0) {
			printk(KERN_WARNING "Btrfs log replay required "
			       "on RO media\n");
			err = -EIO;
			goto fail_trans_kthread;
		}
		blocksize =
		     btrfs_level_size(tree_root,
				      btrfs_super_log_root_level(disk_super));

		log_tree_root = kzalloc(sizeof(struct btrfs_root),
					GFP_NOFS);
		if (!log_tree_root) {
			err = -ENOMEM;
			goto fail_trans_kthread;
		}

		__setup_root(nodesize, leafsize, sectorsize, stripesize,
			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);

		log_tree_root->node = read_tree_block(tree_root, bytenr,
						      blocksize,
						      generation + 1);
		ret = btrfs_recover_log_trees(log_tree_root);
		BUG_ON(ret);

		if (sb->s_flags & MS_RDONLY) {
			ret = btrfs_commit_super(tree_root);
			BUG_ON(ret);
		}
	}

	ret = btrfs_find_orphan_roots(tree_root);
	BUG_ON(ret);

	if (!(sb->s_flags & MS_RDONLY)) {
		ret = btrfs_cleanup_fs_roots(fs_info);
		BUG_ON(ret);

		ret = btrfs_recover_relocation(tree_root);
		if (ret < 0) {
			printk(KERN_WARNING
			       "btrfs: failed to recover relocation\n");
			err = -EINVAL;
			goto fail_trans_kthread;
		}
	}

	/* finally, load the default subvolume root */
	location.objectid = BTRFS_FS_TREE_OBJECTID;
	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = (u64)-1;

	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
	if (!fs_info->fs_root)
		goto fail_trans_kthread;
	if (IS_ERR(fs_info->fs_root)) {
		err = PTR_ERR(fs_info->fs_root);
		goto fail_trans_kthread;
	}

	if (!(sb->s_flags & MS_RDONLY)) {
		down_read(&fs_info->cleanup_work_sem);
		btrfs_orphan_cleanup(fs_info->fs_root);
		up_read(&fs_info->cleanup_work_sem);
	}

	return tree_root;

fail_trans_kthread:
	kthread_stop(fs_info->transaction_kthread);
fail_cleaner:
	kthread_stop(fs_info->cleaner_kthread);

	/*
	 * make sure we're done with the btree inode before we stop our
	 * kthreads
	 */
	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);

fail_block_groups:
	btrfs_free_block_groups(fs_info);
	free_extent_buffer(csum_root->node);
	free_extent_buffer(csum_root->commit_root);
fail_dev_root:
	free_extent_buffer(dev_root->node);
	free_extent_buffer(dev_root->commit_root);
fail_extent_root:
	free_extent_buffer(extent_root->node);
	free_extent_buffer(extent_root->commit_root);
fail_tree_root:
	free_extent_buffer(tree_root->node);
	free_extent_buffer(tree_root->commit_root);
fail_chunk_root:
	free_extent_buffer(chunk_root->node);
	free_extent_buffer(chunk_root->commit_root);
fail_sb_buffer:
	btrfs_stop_workers(&fs_info->generic_worker);
	btrfs_stop_workers(&fs_info->fixup_workers);
	btrfs_stop_workers(&fs_info->delalloc_workers);
	btrfs_stop_workers(&fs_info->workers);
	btrfs_stop_workers(&fs_info->endio_workers);
	btrfs_stop_workers(&fs_info->endio_meta_workers);
	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
	btrfs_stop_workers(&fs_info->endio_write_workers);
	btrfs_stop_workers(&fs_info->submit_workers);
fail_iput:
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
	iput(fs_info->btree_inode);

	btrfs_close_devices(fs_info->fs_devices);
	btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
	bdi_destroy(&fs_info->bdi);
fail_srcu:
	cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
	kfree(extent_root);
	kfree(tree_root);
	kfree(fs_info);
	kfree(chunk_root);
	kfree(dev_root);
	kfree(csum_root);
	return ERR_PTR(err);
}

/*
 * buffer_head end_io handler for super block writes; just records
 * up-to-date state (errors are detected later by write_dev_supers'
 * wait pass) and drops the submit_bh reference.
 */
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];

	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
				       bdevname(bh->b_bdev, b));
		}
		/* note, we don't set_buffer_write_io_error because we have
		 * our own ways of dealing with the IO errors
		 */
clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

/*
 * Read the super block(s) from one device and return the buffer_head
 * holding the copy with the highest generation, or NULL if no valid
 * super was found.  The caller owns (and must brelse) the returned bh.
 */
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
{
	struct buffer_head *bh;
	struct buffer_head *latest = NULL;
	struct btrfs_super_block *super;
	int i;
	u64 transid = 0;
	u64 bytenr;

	/* we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	for (i = 0; i < 1; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
			break;
		bh = __bread(bdev, bytenr / 4096, 4096);
		if (!bh)
			continue;

		/* a valid super records its own bytenr and carries the magic */
		super = (struct btrfs_super_block *)bh->b_data;
		if (btrfs_super_bytenr(super) != bytenr ||
		    strncmp((char *)(&super->magic), BTRFS_MAGIC,
			    sizeof(super->magic))) {
			brelse(bh);
			continue;
		}

		if (!latest || btrfs_super_generation(super) > transid) {
			brelse(latest);
			latest = bh;
			transid = btrfs_super_generation(super);
		} else {
			brelse(bh);
		}
	}
	return latest;
}

/*
 * this should be called twice, once with wait == 0 and
 * once with wait == 1. When wait == 0 is done, all the buffer heads
 * we write are pinned.
 *
 * They are released when wait == 1 is done.
 * max_mirrors must be the same for both runs, and it indicates how
 * many supers on this one device should be written.
 *
 * max_mirrors == 0 means to write them all.
 */
static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb,
			    int do_barriers, int wait, int max_mirrors)
{
	struct buffer_head *bh;
	int i;
	int ret;
	int errors = 0;
	u32 crc;
	u64 bytenr;
	int last_barrier = 0;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	/* make sure only the last submit_bh does a barrier */
	if (do_barriers) {
		for (i = 0; i < max_mirrors; i++) {
			bytenr = btrfs_sb_offset(i);
			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
			    device->total_bytes)
				break;
			last_barrier = i;
		}
	}

	for (i = 0; i < max_mirrors; i++) {
		bytenr = btrfs_sb_offset(i);
		/* skip mirror copies that would fall past the device's end */
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		if (wait) {
			/* second pass: wait for the IO submitted by pass one */
			bh = __find_get_block(device->bdev, bytenr / 4096,
					      BTRFS_SUPER_INFO_SIZE);
			BUG_ON(!bh);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				errors++;

			/* drop our reference */
			brelse(bh);

			/* drop the reference from the wait == 0 run */
			brelse(bh);
			continue;
		} else {
			/* first pass: checksum the super and submit the write */
			btrfs_set_super_bytenr(sb, bytenr);

			crc = ~(u32)0;
			crc = btrfs_csum_data(NULL, (char *)sb +
					      BTRFS_CSUM_SIZE, crc,
					      BTRFS_SUPER_INFO_SIZE -
					      BTRFS_CSUM_SIZE);
			btrfs_csum_final(crc, sb->csum);

			/*
			 * one reference for us, and we leave it for the
			 * caller
			 */
			bh = __getblk(device->bdev, bytenr / 4096,
				      BTRFS_SUPER_INFO_SIZE);
			/* NOTE(review): __getblk can return NULL; unchecked here */
			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);

			/* one reference for submit_bh */
			get_bh(bh);

			set_buffer_uptodate(bh);
			lock_buffer(bh);
			bh->b_end_io = btrfs_end_buffer_write_sync;
		}

		if (i == last_barrier && do_barriers && device->barriers) {
			ret = submit_bh(WRITE_BARRIER, bh);
			if (ret == -EOPNOTSUPP) {
				/* device rejected barriers: retry without them */
				printk("btrfs: disabling barriers on dev %s\n",
				       device->name);
set_buffer_uptodate(bh); 2181 device->barriers = 0; 2182 /* one reference for submit_bh */ 2183 get_bh(bh); 2184 lock_buffer(bh); 2185 ret = submit_bh(WRITE_SYNC, bh); 2186 } 2187 } else { 2188 ret = submit_bh(WRITE_SYNC, bh); 2189 } 2190 2191 if (ret) 2192 errors++; 2193 } 2194 return errors < i ? 0 : -1; 2195} 2196 2197int write_all_supers(struct btrfs_root *root, int max_mirrors) 2198{ 2199 struct list_head *head; 2200 struct btrfs_device *dev; 2201 struct btrfs_super_block *sb; 2202 struct btrfs_dev_item *dev_item; 2203 int ret; 2204 int do_barriers; 2205 int max_errors; 2206 int total_errors = 0; 2207 u64 flags; 2208 2209 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2210 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2211 2212 sb = &root->fs_info->super_for_commit; 2213 dev_item = &sb->dev_item; 2214 2215 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2216 head = &root->fs_info->fs_devices->devices; 2217 list_for_each_entry(dev, head, dev_list) { 2218 if (!dev->bdev) { 2219 total_errors++; 2220 continue; 2221 } 2222 if (!dev->in_fs_metadata || !dev->writeable) 2223 continue; 2224 2225 btrfs_set_stack_device_generation(dev_item, 0); 2226 btrfs_set_stack_device_type(dev_item, dev->type); 2227 btrfs_set_stack_device_id(dev_item, dev->devid); 2228 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); 2229 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); 2230 btrfs_set_stack_device_io_align(dev_item, dev->io_align); 2231 btrfs_set_stack_device_io_width(dev_item, dev->io_width); 2232 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); 2233 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); 2234 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); 2235 2236 flags = btrfs_super_flags(sb); 2237 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); 2238 2239 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); 2240 if (ret) 2241 total_errors++; 2242 } 2243 if 
(total_errors > max_errors) { 2244 printk(KERN_ERR "btrfs: %d errors while writing supers\n", 2245 total_errors); 2246 BUG(); 2247 } 2248 2249 total_errors = 0; 2250 list_for_each_entry(dev, head, dev_list) { 2251 if (!dev->bdev) 2252 continue; 2253 if (!dev->in_fs_metadata || !dev->writeable) 2254 continue; 2255 2256 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); 2257 if (ret) 2258 total_errors++; 2259 } 2260 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2261 if (total_errors > max_errors) { 2262 printk(KERN_ERR "btrfs: %d errors while writing supers\n", 2263 total_errors); 2264 BUG(); 2265 } 2266 return 0; 2267} 2268 2269int write_ctree_super(struct btrfs_trans_handle *trans, 2270 struct btrfs_root *root, int max_mirrors) 2271{ 2272 int ret; 2273 2274 ret = write_all_supers(root, max_mirrors); 2275 return ret; 2276} 2277 2278int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 2279{ 2280 spin_lock(&fs_info->fs_roots_radix_lock); 2281 radix_tree_delete(&fs_info->fs_roots_radix, 2282 (unsigned long)root->root_key.objectid); 2283 spin_unlock(&fs_info->fs_roots_radix_lock); 2284 2285 if (btrfs_root_refs(&root->root_item) == 0) 2286 synchronize_srcu(&fs_info->subvol_srcu); 2287 2288 free_fs_root(root); 2289 return 0; 2290} 2291 2292static void free_fs_root(struct btrfs_root *root) 2293{ 2294 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 2295 if (root->anon_super.s_dev) { 2296 down_write(&root->anon_super.s_umount); 2297 kill_anon_super(&root->anon_super); 2298 } 2299 free_extent_buffer(root->node); 2300 free_extent_buffer(root->commit_root); 2301 kfree(root->name); 2302 kfree(root); 2303} 2304 2305static int del_fs_roots(struct btrfs_fs_info *fs_info) 2306{ 2307 int ret; 2308 struct btrfs_root *gang[8]; 2309 int i; 2310 2311 while (!list_empty(&fs_info->dead_roots)) { 2312 gang[0] = list_entry(fs_info->dead_roots.next, 2313 struct btrfs_root, root_list); 2314 list_del(&gang[0]->root_list); 2315 2316 if 
(gang[0]->in_radix) { 2317 btrfs_free_fs_root(fs_info, gang[0]); 2318 } else { 2319 free_extent_buffer(gang[0]->node); 2320 free_extent_buffer(gang[0]->commit_root); 2321 kfree(gang[0]); 2322 } 2323 } 2324 2325 while (1) { 2326 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 2327 (void **)gang, 0, 2328 ARRAY_SIZE(gang)); 2329 if (!ret) 2330 break; 2331 for (i = 0; i < ret; i++) 2332 btrfs_free_fs_root(fs_info, gang[i]); 2333 } 2334 return 0; 2335} 2336 2337int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 2338{ 2339 u64 root_objectid = 0; 2340 struct btrfs_root *gang[8]; 2341 int i; 2342 int ret; 2343 2344 while (1) { 2345 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 2346 (void **)gang, root_objectid, 2347 ARRAY_SIZE(gang)); 2348 if (!ret) 2349 break; 2350 2351 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2352 for (i = 0; i < ret; i++) { 2353 root_objectid = gang[i]->root_key.objectid; 2354 btrfs_orphan_cleanup(gang[i]); 2355 } 2356 root_objectid++; 2357 } 2358 return 0; 2359} 2360 2361int btrfs_commit_super(struct btrfs_root *root) 2362{ 2363 struct btrfs_trans_handle *trans; 2364 int ret; 2365 2366 mutex_lock(&root->fs_info->cleaner_mutex); 2367 btrfs_run_delayed_iputs(root); 2368 btrfs_clean_old_snapshots(root); 2369 mutex_unlock(&root->fs_info->cleaner_mutex); 2370 2371 /* wait until ongoing cleanup work done */ 2372 down_write(&root->fs_info->cleanup_work_sem); 2373 up_write(&root->fs_info->cleanup_work_sem); 2374 2375 trans = btrfs_join_transaction(root, 1); 2376 ret = btrfs_commit_transaction(trans, root); 2377 BUG_ON(ret); 2378 /* run commit again to drop the original snapshot */ 2379 trans = btrfs_join_transaction(root, 1); 2380 btrfs_commit_transaction(trans, root); 2381 ret = btrfs_write_and_wait_transaction(NULL, root); 2382 BUG_ON(ret); 2383 2384 ret = write_ctree_super(NULL, root, 0); 2385 return ret; 2386} 2387 2388int close_ctree(struct btrfs_root *root) 2389{ 2390 struct btrfs_fs_info *fs_info = root->fs_info; 2391 
int ret; 2392 2393 fs_info->closing = 1; 2394 smp_mb(); 2395 2396 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2397 ret = btrfs_commit_super(root); 2398 if (ret) 2399 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2400 } 2401 2402 kthread_stop(root->fs_info->transaction_kthread); 2403 kthread_stop(root->fs_info->cleaner_kthread); 2404 2405 fs_info->closing = 2; 2406 smp_mb(); 2407 2408 if (fs_info->delalloc_bytes) { 2409 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 2410 (unsigned long long)fs_info->delalloc_bytes); 2411 } 2412 if (fs_info->total_ref_cache_size) { 2413 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", 2414 (unsigned long long)fs_info->total_ref_cache_size); 2415 } 2416 2417 free_extent_buffer(fs_info->extent_root->node); 2418 free_extent_buffer(fs_info->extent_root->commit_root); 2419 free_extent_buffer(fs_info->tree_root->node); 2420 free_extent_buffer(fs_info->tree_root->commit_root); 2421 free_extent_buffer(root->fs_info->chunk_root->node); 2422 free_extent_buffer(root->fs_info->chunk_root->commit_root); 2423 free_extent_buffer(root->fs_info->dev_root->node); 2424 free_extent_buffer(root->fs_info->dev_root->commit_root); 2425 free_extent_buffer(root->fs_info->csum_root->node); 2426 free_extent_buffer(root->fs_info->csum_root->commit_root); 2427 2428 btrfs_free_block_groups(root->fs_info); 2429 2430 del_fs_roots(fs_info); 2431 2432 iput(fs_info->btree_inode); 2433 2434 btrfs_stop_workers(&fs_info->generic_worker); 2435 btrfs_stop_workers(&fs_info->fixup_workers); 2436 btrfs_stop_workers(&fs_info->delalloc_workers); 2437 btrfs_stop_workers(&fs_info->workers); 2438 btrfs_stop_workers(&fs_info->endio_workers); 2439 btrfs_stop_workers(&fs_info->endio_meta_workers); 2440 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2441 btrfs_stop_workers(&fs_info->endio_write_workers); 2442 btrfs_stop_workers(&fs_info->submit_workers); 2443 2444 btrfs_close_devices(fs_info->fs_devices); 2445 
btrfs_mapping_tree_free(&fs_info->mapping_tree); 2446 2447 bdi_destroy(&fs_info->bdi); 2448 cleanup_srcu_struct(&fs_info->subvol_srcu); 2449 2450 kfree(fs_info->extent_root); 2451 kfree(fs_info->tree_root); 2452 kfree(fs_info->chunk_root); 2453 kfree(fs_info->dev_root); 2454 kfree(fs_info->csum_root); 2455 return 0; 2456} 2457 2458int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) 2459{ 2460 int ret; 2461 struct inode *btree_inode = buf->first_page->mapping->host; 2462 2463 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, 2464 NULL); 2465 if (!ret) 2466 return ret; 2467 2468 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, 2469 parent_transid); 2470 return !ret; 2471} 2472 2473int btrfs_set_buffer_uptodate(struct extent_buffer *buf) 2474{ 2475 struct inode *btree_inode = buf->first_page->mapping->host; 2476 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, 2477 buf); 2478} 2479 2480void btrfs_mark_buffer_dirty(struct extent_buffer *buf) 2481{ 2482 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2483 u64 transid = btrfs_header_generation(buf); 2484 struct inode *btree_inode = root->fs_info->btree_inode; 2485 int was_dirty; 2486 2487 btrfs_assert_tree_locked(buf); 2488 if (transid != root->fs_info->generation) { 2489 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2490 "found %llu running %llu\n", 2491 (unsigned long long)buf->start, 2492 (unsigned long long)transid, 2493 (unsigned long long)root->fs_info->generation); 2494 WARN_ON(1); 2495 } 2496 was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 2497 buf); 2498 if (!was_dirty) { 2499 spin_lock(&root->fs_info->delalloc_lock); 2500 root->fs_info->dirty_metadata_bytes += buf->len; 2501 spin_unlock(&root->fs_info->delalloc_lock); 2502 } 2503} 2504 2505void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 2506{ 2507 /* 2508 * looks as though older kernels can get into trouble with 
2509 * this code, they end up stuck in balance_dirty_pages forever 2510 */ 2511 u64 num_dirty; 2512 unsigned long thresh = 32 * 1024 * 1024; 2513 2514 if (current->flags & PF_MEMALLOC) 2515 return; 2516 2517 num_dirty = root->fs_info->dirty_metadata_bytes; 2518 2519 if (num_dirty > thresh) { 2520 balance_dirty_pages_ratelimited_nr( 2521 root->fs_info->btree_inode->i_mapping, 1); 2522 } 2523 return; 2524} 2525 2526int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 2527{ 2528 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2529 int ret; 2530 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2531 if (ret == 0) 2532 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); 2533 return ret; 2534} 2535 2536int btree_lock_page_hook(struct page *page) 2537{ 2538 struct inode *inode = page->mapping->host; 2539 struct btrfs_root *root = BTRFS_I(inode)->root; 2540 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2541 struct extent_buffer *eb; 2542 unsigned long len; 2543 u64 bytenr = page_offset(page); 2544 2545 if (page->private == EXTENT_PAGE_PRIVATE) 2546 goto out; 2547 2548 len = page->private >> 2; 2549 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); 2550 if (!eb) 2551 goto out; 2552 2553 btrfs_tree_lock(eb); 2554 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2555 2556 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 2557 spin_lock(&root->fs_info->delalloc_lock); 2558 if (root->fs_info->dirty_metadata_bytes >= eb->len) 2559 root->fs_info->dirty_metadata_bytes -= eb->len; 2560 else 2561 WARN_ON(1); 2562 spin_unlock(&root->fs_info->delalloc_lock); 2563 } 2564 2565 btrfs_tree_unlock(eb); 2566 free_extent_buffer(eb); 2567out: 2568 lock_page(page); 2569 return 0; 2570} 2571 2572static struct extent_io_ops btree_extent_io_ops = { 2573 .write_cache_pages_lock_hook = btree_lock_page_hook, 2574 .readpage_end_io_hook = btree_readpage_end_io_hook, 2575 .submit_bio_hook = btree_submit_bio_hook, 
2576 /* note we're sharing with inode.c for the merge bio hook */ 2577 .merge_bio_hook = btrfs_merge_bio_hook, 2578}; 2579