1/* 2 * Copyright (C) 2005, 2006 3 * Avishay Traeger (avishay@gmail.com) 4 * Copyright (C) 2008, 2009 5 * Boaz Harrosh <bharrosh@panasas.com> 6 * 7 * Copyrights for code taken from ext2: 8 * Copyright (C) 1992, 1993, 1994, 1995 9 * Remy Card (card@masi.ibp.fr) 10 * Laboratoire MASI - Institut Blaise Pascal 11 * Universite Pierre et Marie Curie (Paris VI) 12 * from 13 * linux/fs/minix/inode.c 14 * Copyright (C) 1991, 1992 Linus Torvalds 15 * 16 * This file is part of exofs. 17 * 18 * exofs is free software; you can redistribute it and/or modify 19 * it under the terms of the GNU General Public License as published by 20 * the Free Software Foundation. Since it is based on ext2, and the only 21 * valid version of GPL for the Linux kernel is version 2, the only valid 22 * version of GPL for exofs is version 2. 23 * 24 * exofs is distributed in the hope that it will be useful, 25 * but WITHOUT ANY WARRANTY; without even the implied warranty of 26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 27 * GNU General Public License for more details. 28 * 29 * You should have received a copy of the GNU General Public License 30 * along with exofs; if not, write to the Free Software 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 32 */ 33 34#include <linux/slab.h> 35 36#include "exofs.h" 37 38#define EXOFS_DBGMSG2(M...) do {} while (0) 39 40enum { BIO_MAX_PAGES_KMALLOC = 41 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), 42 MAX_PAGES_KMALLOC = 43 PAGE_SIZE / sizeof(struct page *), 44}; 45 46struct page_collect { 47 struct exofs_sb_info *sbi; 48 struct inode *inode; 49 unsigned expected_pages; 50 struct exofs_io_state *ios; 51 52 struct page **pages; 53 unsigned alloc_pages; 54 unsigned nr_pages; 55 unsigned long length; 56 loff_t pg_first; /* keep 64bit also in 32-arches */ 57 bool read_4_write; /* This means two things: that the read is sync 58 * And the pages should not be unlocked. 59 */ 60}; 61 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 63 struct inode *inode) 64{ 65 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 66 67 pcol->sbi = sbi; 68 pcol->inode = inode; 69 pcol->expected_pages = expected_pages; 70 71 pcol->ios = NULL; 72 pcol->pages = NULL; 73 pcol->alloc_pages = 0; 74 pcol->nr_pages = 0; 75 pcol->length = 0; 76 pcol->pg_first = -1; 77 pcol->read_4_write = false; 78} 79 80static void _pcol_reset(struct page_collect *pcol) 81{ 82 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); 83 84 pcol->pages = NULL; 85 pcol->alloc_pages = 0; 86 pcol->nr_pages = 0; 87 pcol->length = 0; 88 pcol->pg_first = -1; 89 pcol->ios = NULL; 90 91 /* this is probably the end of the loop but in writes 92 * it might not end here. don't be left with nothing 93 */ 94 if (!pcol->expected_pages) 95 pcol->expected_pages = MAX_PAGES_KMALLOC; 96} 97 98static int pcol_try_alloc(struct page_collect *pcol) 99{ 100 unsigned pages = min_t(unsigned, pcol->expected_pages, 101 MAX_PAGES_KMALLOC); 102 103 if (!pcol->ios) { /* First time allocate io_state */ 104 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); 105 106 if (ret) 107 return ret; 108 } 109 110 /* TODO: easily support bio chaining */ 111 pages = min_t(unsigned, pages, 112 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC); 113 114 for (; pages; pages >>= 1) { 115 pcol->pages = kmalloc(pages * sizeof(struct page *), 116 GFP_KERNEL); 117 if (likely(pcol->pages)) { 118 pcol->alloc_pages = pages; 119 return 0; 120 } 121 } 122 123 EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", 124 pcol->expected_pages); 125 return -ENOMEM; 126} 127 128static void pcol_free(struct page_collect *pcol) 129{ 130 kfree(pcol->pages); 131 pcol->pages = NULL; 132 133 if (pcol->ios) { 134 exofs_put_io_state(pcol->ios); 135 pcol->ios = NULL; 136 } 137} 138 139static int pcol_add_page(struct page_collect *pcol, struct page *page, 140 unsigned len) 141{ 142 if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) 143 return -ENOMEM; 144 145 pcol->pages[pcol->nr_pages++] = page; 146 pcol->length += len; 147 return 0; 148} 149 150static int update_read_page(struct page *page, int ret) 151{ 152 if (ret == 0) { 153 /* Everything is OK */ 154 SetPageUptodate(page); 155 if (PageError(page)) 156 ClearPageError(page); 157 } else if (ret == -EFAULT) { 158 /* In this case we were trying to read something that wasn't on 159 * disk yet - return a page full of zeroes. This should be OK, 160 * because the object should be empty (if there was a write 161 * before this read, the read would be waiting with the page 162 * locked */ 163 clear_highpage(page); 164 165 SetPageUptodate(page); 166 if (PageError(page)) 167 ClearPageError(page); 168 ret = 0; /* recovered error */ 169 EXOFS_DBGMSG("recovered read error\n"); 170 } else /* Error */ 171 SetPageError(page); 172 173 return ret; 174} 175 176static void update_write_page(struct page *page, int ret) 177{ 178 if (ret) { 179 mapping_set_error(page->mapping, ret); 180 SetPageError(page); 181 } 182 end_page_writeback(page); 183} 184 185/* Called at the end of reads, to optionally unlock pages and update their 186 * status. 187 */ 188static int __readpages_done(struct page_collect *pcol, bool do_unlock) 189{ 190 int i; 191 u64 resid; 192 u64 good_bytes; 193 u64 length = 0; 194 int ret = exofs_check_io(pcol->ios, &resid); 195 196 if (likely(!ret)) 197 good_bytes = pcol->length; 198 else 199 good_bytes = pcol->length - resid; 200 201 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" 202 " length=0x%lx nr_pages=%u\n", 203 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 204 pcol->nr_pages); 205 206 for (i = 0; i < pcol->nr_pages; i++) { 207 struct page *page = pcol->pages[i]; 208 struct inode *inode = page->mapping->host; 209 int page_stat; 210 211 if (inode != pcol->inode) 212 continue; /* osd might add more pages at end */ 213 214 if (likely(length < good_bytes)) 215 page_stat = 0; 216 else 217 page_stat = ret; 218 219 EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", 220 inode->i_ino, page->index, 221 page_stat ? "bad_bytes" : "good_bytes"); 222 223 ret = update_read_page(page, page_stat); 224 if (do_unlock) 225 unlock_page(page); 226 length += PAGE_SIZE; 227 } 228 229 pcol_free(pcol); 230 EXOFS_DBGMSG2("readpages_done END\n"); 231 return ret; 232} 233 234/* callback of async reads */ 235static void readpages_done(struct exofs_io_state *ios, void *p) 236{ 237 struct page_collect *pcol = p; 238 239 __readpages_done(pcol, true); 240 atomic_dec(&pcol->sbi->s_curr_pending); 241 kfree(pcol); 242} 243 244static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) 245{ 246 int i; 247 248 for (i = 0; i < pcol->nr_pages; i++) { 249 struct page *page = pcol->pages[i]; 250 251 if (rw == READ) 252 update_read_page(page, ret); 253 else 254 update_write_page(page, ret); 255 256 unlock_page(page); 257 } 258} 259 260static int read_exec(struct page_collect *pcol, bool is_sync) 261{ 262 struct exofs_i_info *oi = exofs_i(pcol->inode); 263 struct exofs_io_state *ios = pcol->ios; 264 struct page_collect *pcol_copy = NULL; 265 int ret; 266 267 if (!pcol->pages) 268 return 0; 269 270 /* see comment in _readpage() about sync reads */ 271 WARN_ON(is_sync && (pcol->nr_pages != 1)); 272 273 ios->pages = pcol->pages; 274 ios->nr_pages = pcol->nr_pages; 275 ios->length = pcol->length; 276 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 277 278 if (is_sync) { 279 exofs_oi_read(oi, pcol->ios); 280 return __readpages_done(pcol, false); 281 } 282 283 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 284 if (!pcol_copy) { 285 ret = -ENOMEM; 286 goto err; 287 } 288 289 *pcol_copy = *pcol; 290 ios->done = readpages_done; 291 ios->private = pcol_copy; 292 ret = exofs_oi_read(oi, ios); 293 if (unlikely(ret)) 294 goto err; 295 296 atomic_inc(&pcol->sbi->s_curr_pending); 297 298 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 299 ios->obj.id, _LLU(ios->offset), pcol->length); 300 301 /* pages ownership was passed to pcol_copy */ 302 _pcol_reset(pcol); 303 return 0; 304 305err: 306 if (!is_sync) 307 _unlock_pcol_pages(pcol, ret, READ); 308 309 pcol_free(pcol); 310 311 kfree(pcol_copy); 312 return ret; 313} 314 315/* readpage_strip is called either directly from readpage() or by the VFS from 316 * within read_cache_pages(), to add one more page to be read. It will try to 317 * collect as many contiguous pages as posible. If a discontinuity is 318 * encountered, or it runs out of resources, it will submit the previous segment 319 * and will start a new collection. Eventually caller must submit the last 320 * segment if present. 321 */ 322static int readpage_strip(void *data, struct page *page) 323{ 324 struct page_collect *pcol = data; 325 struct inode *inode = pcol->inode; 326 struct exofs_i_info *oi = exofs_i(inode); 327 loff_t i_size = i_size_read(inode); 328 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 329 size_t len; 330 int ret; 331 332 if (PageUptodate(page)) 333 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, 334 page->index); 335 336 if (page->index < end_index) 337 len = PAGE_CACHE_SIZE; 338 else if (page->index == end_index) 339 len = i_size & ~PAGE_CACHE_MASK; 340 else 341 len = 0; 342 343 if (!len || !obj_created(oi)) { 344 /* this will be out of bounds, or doesn't exist yet. 345 * Current page is cleared and the request is split 346 */ 347 clear_highpage(page); 348 349 SetPageUptodate(page); 350 if (PageError(page)) 351 ClearPageError(page); 352 353 if (!pcol->read_4_write) 354 unlock_page(page); 355 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 356 " splitting\n", inode->i_ino, page->index); 357 358 return read_exec(pcol, false); 359 } 360 361try_again: 362 363 if (unlikely(pcol->pg_first == -1)) { 364 pcol->pg_first = page->index; 365 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 366 page->index)) { 367 /* Discontinuity detected, split the request */ 368 ret = read_exec(pcol, false); 369 if (unlikely(ret)) 370 goto fail; 371 goto try_again; 372 } 373 374 if (!pcol->pages) { 375 ret = pcol_try_alloc(pcol); 376 if (unlikely(ret)) 377 goto fail; 378 } 379 380 if (len != PAGE_CACHE_SIZE) 381 zero_user(page, len, PAGE_CACHE_SIZE - len); 382 383 EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", 384 inode->i_ino, page->index, len); 385 386 ret = pcol_add_page(pcol, page, len); 387 if (ret) { 388 EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " 389 "this_len=0x%zx nr_pages=%u length=0x%lx\n", 390 page, len, pcol->nr_pages, pcol->length); 391 392 /* split the request, and start again with current page */ 393 ret = read_exec(pcol, false); 394 if (unlikely(ret)) 395 goto fail; 396 397 goto try_again; 398 } 399 400 return 0; 401 402fail: 403 /* SetPageError(page); ??? */ 404 unlock_page(page); 405 return ret; 406} 407 408static int exofs_readpages(struct file *file, struct address_space *mapping, 409 struct list_head *pages, unsigned nr_pages) 410{ 411 struct page_collect pcol; 412 int ret; 413 414 _pcol_init(&pcol, nr_pages, mapping->host); 415 416 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); 417 if (ret) { 418 EXOFS_ERR("read_cache_pages => %d\n", ret); 419 return ret; 420 } 421 422 return read_exec(&pcol, false); 423} 424 425static int _readpage(struct page *page, bool is_sync) 426{ 427 struct page_collect pcol; 428 int ret; 429 430 _pcol_init(&pcol, 1, page->mapping->host); 431 432 /* readpage_strip might call read_exec(,is_sync==false) at several 433 * places but not if we have a single page. 434 */ 435 pcol.read_4_write = is_sync; 436 ret = readpage_strip(&pcol, page); 437 if (ret) { 438 EXOFS_ERR("_readpage => %d\n", ret); 439 return ret; 440 } 441 442 return read_exec(&pcol, is_sync); 443} 444 445/* 446 * We don't need the file 447 */ 448static int exofs_readpage(struct file *file, struct page *page) 449{ 450 return _readpage(page, false); 451} 452 453/* Callback for osd_write. All writes are asynchronous */ 454static void writepages_done(struct exofs_io_state *ios, void *p) 455{ 456 struct page_collect *pcol = p; 457 int i; 458 u64 resid; 459 u64 good_bytes; 460 u64 length = 0; 461 int ret = exofs_check_io(ios, &resid); 462 463 atomic_dec(&pcol->sbi->s_curr_pending); 464 465 if (likely(!ret)) 466 good_bytes = pcol->length; 467 else 468 good_bytes = pcol->length - resid; 469 470 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" 471 " length=0x%lx nr_pages=%u\n", 472 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 473 pcol->nr_pages); 474 475 for (i = 0; i < pcol->nr_pages; i++) { 476 struct page *page = pcol->pages[i]; 477 struct inode *inode = page->mapping->host; 478 int page_stat; 479 480 if (inode != pcol->inode) 481 continue; /* osd might add more pages to a bio */ 482 483 if (likely(length < good_bytes)) 484 page_stat = 0; 485 else 486 page_stat = ret; 487 488 update_write_page(page, page_stat); 489 unlock_page(page); 490 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", 491 inode->i_ino, page->index, page_stat); 492 493 length += PAGE_SIZE; 494 } 495 496 pcol_free(pcol); 497 kfree(pcol); 498 EXOFS_DBGMSG2("writepages_done END\n"); 499} 500 501static int write_exec(struct page_collect *pcol) 502{ 503 struct exofs_i_info *oi = exofs_i(pcol->inode); 504 struct exofs_io_state *ios = pcol->ios; 505 struct page_collect *pcol_copy = NULL; 506 int ret; 507 508 if (!pcol->pages) 509 return 0; 510 511 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 512 if (!pcol_copy) { 513 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); 514 ret = -ENOMEM; 515 goto err; 516 } 517 518 *pcol_copy = *pcol; 519 520 ios->pages = pcol_copy->pages; 521 ios->nr_pages = pcol_copy->nr_pages; 522 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; 523 ios->length = pcol_copy->length; 524 ios->done = writepages_done; 525 ios->private = pcol_copy; 526 527 ret = exofs_oi_write(oi, ios); 528 if (unlikely(ret)) { 529 EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); 530 goto err; 531 } 532 533 atomic_inc(&pcol->sbi->s_curr_pending); 534 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", 535 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), 536 pcol->length); 537 /* pages ownership was passed to pcol_copy */ 538 _pcol_reset(pcol); 539 return 0; 540 541err: 542 _unlock_pcol_pages(pcol, ret, WRITE); 543 pcol_free(pcol); 544 kfree(pcol_copy); 545 546 return ret; 547} 548 549/* writepage_strip is called either directly from writepage() or by the VFS from 550 * within write_cache_pages(), to add one more page to be written to storage. 551 * It will try to collect as many contiguous pages as possible. If a 552 * discontinuity is encountered or it runs out of resources it will submit the 553 * previous segment and will start a new collection. 554 * Eventually caller must submit the last segment if present. 555 */ 556static int writepage_strip(struct page *page, 557 struct writeback_control *wbc_unused, void *data) 558{ 559 struct page_collect *pcol = data; 560 struct inode *inode = pcol->inode; 561 struct exofs_i_info *oi = exofs_i(inode); 562 loff_t i_size = i_size_read(inode); 563 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 564 size_t len; 565 int ret; 566 567 BUG_ON(!PageLocked(page)); 568 569 ret = wait_obj_created(oi); 570 if (unlikely(ret)) 571 goto fail; 572 573 if (page->index < end_index) 574 /* in this case, the page is within the limits of the file */ 575 len = PAGE_CACHE_SIZE; 576 else { 577 len = i_size & ~PAGE_CACHE_MASK; 578 579 if (page->index > end_index || !len) { 580 /* in this case, the page is outside the limits 581 * (truncate in progress) 582 */ 583 ret = write_exec(pcol); 584 if (unlikely(ret)) 585 goto fail; 586 if (PageError(page)) 587 ClearPageError(page); 588 unlock_page(page); 589 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " 590 "outside the limits\n", 591 inode->i_ino, page->index); 592 return 0; 593 } 594 } 595 596try_again: 597 598 if (unlikely(pcol->pg_first == -1)) { 599 pcol->pg_first = page->index; 600 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 601 page->index)) { 602 /* Discontinuity detected, split the request */ 603 ret = write_exec(pcol); 604 if (unlikely(ret)) 605 goto fail; 606 607 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", 608 inode->i_ino, page->index); 609 goto try_again; 610 } 611 612 if (!pcol->pages) { 613 ret = pcol_try_alloc(pcol); 614 if (unlikely(ret)) 615 goto fail; 616 } 617 618 EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", 619 inode->i_ino, page->index, len); 620 621 ret = pcol_add_page(pcol, page, len); 622 if (unlikely(ret)) { 623 EXOFS_DBGMSG2("Failed pcol_add_page " 624 "nr_pages=%u total_length=0x%lx\n", 625 pcol->nr_pages, pcol->length); 626 627 /* split the request, next loop will start again */ 628 ret = write_exec(pcol); 629 if (unlikely(ret)) { 630 EXOFS_DBGMSG("write_exec faild => %d", ret); 631 goto fail; 632 } 633 634 goto try_again; 635 } 636 637 BUG_ON(PageWriteback(page)); 638 set_page_writeback(page); 639 640 return 0; 641 642fail: 643 EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", 644 inode->i_ino, page->index, ret); 645 set_bit(AS_EIO, &page->mapping->flags); 646 unlock_page(page); 647 return ret; 648} 649 650static int exofs_writepages(struct address_space *mapping, 651 struct writeback_control *wbc) 652{ 653 struct page_collect pcol; 654 long start, end, expected_pages; 655 int ret; 656 657 start = wbc->range_start >> PAGE_CACHE_SHIFT; 658 end = (wbc->range_end == LLONG_MAX) ? 659 start + mapping->nrpages : 660 wbc->range_end >> PAGE_CACHE_SHIFT; 661 662 if (start || end) 663 expected_pages = end - start + 1; 664 else 665 expected_pages = mapping->nrpages; 666 667 if (expected_pages < 32L) 668 expected_pages = 32L; 669 670 EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " 671 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", 672 mapping->host->i_ino, wbc->range_start, wbc->range_end, 673 mapping->nrpages, start, end, expected_pages); 674 675 _pcol_init(&pcol, expected_pages, mapping->host); 676 677 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); 678 if (ret) { 679 EXOFS_ERR("write_cache_pages => %d\n", ret); 680 return ret; 681 } 682 683 return write_exec(&pcol); 684} 685 686static int exofs_writepage(struct page *page, struct writeback_control *wbc) 687{ 688 struct page_collect pcol; 689 int ret; 690 691 _pcol_init(&pcol, 1, page->mapping->host); 692 693 ret = writepage_strip(page, NULL, &pcol); 694 if (ret) { 695 EXOFS_ERR("exofs_writepage => %d\n", ret); 696 return ret; 697 } 698 699 return write_exec(&pcol); 700} 701 702/* i_mutex held using inode->i_size directly */ 703static void _write_failed(struct inode *inode, loff_t to) 704{ 705 if (to > inode->i_size) 706 truncate_pagecache(inode, to, inode->i_size); 707} 708 709int exofs_write_begin(struct file *file, struct address_space *mapping, 710 loff_t pos, unsigned len, unsigned flags, 711 struct page **pagep, void **fsdata) 712{ 713 int ret = 0; 714 struct page *page; 715 716 page = *pagep; 717 if (page == NULL) { 718 ret = simple_write_begin(file, mapping, pos, len, flags, pagep, 719 fsdata); 720 if (ret) { 721 EXOFS_DBGMSG("simple_write_begin faild\n"); 722 goto out; 723 } 724 725 page = *pagep; 726 } 727 728 /* read modify write */ 729 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 730 ret = _readpage(page, true); 731 if (ret) { 732 /*SetPageError was done by _readpage. Is it ok?*/ 733 unlock_page(page); 734 EXOFS_DBGMSG("__readpage_filler faild\n"); 735 } 736 } 737out: 738 if (unlikely(ret)) 739 _write_failed(mapping->host, pos + len); 740 741 return ret; 742} 743 744static int exofs_write_begin_export(struct file *file, 745 struct address_space *mapping, 746 loff_t pos, unsigned len, unsigned flags, 747 struct page **pagep, void **fsdata) 748{ 749 *pagep = NULL; 750 751 return exofs_write_begin(file, mapping, pos, len, flags, pagep, 752 fsdata); 753} 754 755static int exofs_write_end(struct file *file, struct address_space *mapping, 756 loff_t pos, unsigned len, unsigned copied, 757 struct page *page, void *fsdata) 758{ 759 struct inode *inode = mapping->host; 760 /* According to comment in simple_write_end i_mutex is held */ 761 loff_t i_size = inode->i_size; 762 int ret; 763 764 ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); 765 if (unlikely(ret)) 766 _write_failed(inode, pos + len); 767 768 /* TODO: once simple_write_end marks inode dirty remove */ 769 if (i_size != inode->i_size) 770 mark_inode_dirty(inode); 771 return ret; 772} 773 774static int exofs_releasepage(struct page *page, gfp_t gfp) 775{ 776 EXOFS_DBGMSG("page 0x%lx\n", page->index); 777 WARN_ON(1); 778 return 0; 779} 780 781static void exofs_invalidatepage(struct page *page, unsigned long offset) 782{ 783 EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); 784 WARN_ON(1); 785} 786 787const struct address_space_operations exofs_aops = { 788 .readpage = exofs_readpage, 789 .readpages = exofs_readpages, 790 .writepage = exofs_writepage, 791 .writepages = exofs_writepages, 792 .write_begin = exofs_write_begin_export, 793 .write_end = exofs_write_end, 794 .releasepage = exofs_releasepage, 795 .set_page_dirty = __set_page_dirty_nobuffers, 796 .invalidatepage = exofs_invalidatepage, 797 798 /* Not implemented Yet */ 799 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ 800 .direct_IO = NULL, /* TODO: Should be trivial to do */ 801 802 /* With these NULL has special meaning or default is not exported */ 803 .sync_page = NULL, 804 .get_xip_mem = NULL, 805 .migratepage = NULL, 806 .launder_page = NULL, 807 .is_partially_uptodate = NULL, 808 .error_remove_page = NULL, 809}; 810 811/****************************************************************************** 812 * INODE OPERATIONS 813 *****************************************************************************/ 814 815/* 816 * Test whether an inode is a fast symlink. 817 */ 818static inline int exofs_inode_is_fast_symlink(struct inode *inode) 819{ 820 struct exofs_i_info *oi = exofs_i(inode); 821 822 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 823} 824 825const struct osd_attr g_attr_logical_length = ATTR_DEF( 826 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 827 828static int _do_truncate(struct inode *inode, loff_t newsize) 829{ 830 struct exofs_i_info *oi = exofs_i(inode); 831 int ret; 832 833 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 834 835 ret = exofs_oi_truncate(oi, (u64)newsize); 836 if (likely(!ret)) 837 truncate_setsize(inode, newsize); 838 839 EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", 840 inode->i_ino, newsize, ret); 841 return ret; 842} 843 844/* 845 * Set inode attributes - update size attribute on OSD if needed, 846 * otherwise just call generic functions. 847 */ 848int exofs_setattr(struct dentry *dentry, struct iattr *iattr) 849{ 850 struct inode *inode = dentry->d_inode; 851 int error; 852 853 /* if we are about to modify an object, and it hasn't been 854 * created yet, wait 855 */ 856 error = wait_obj_created(exofs_i(inode)); 857 if (unlikely(error)) 858 return error; 859 860 error = inode_change_ok(inode, iattr); 861 if (unlikely(error)) 862 return error; 863 864 if ((iattr->ia_valid & ATTR_SIZE) && 865 iattr->ia_size != i_size_read(inode)) { 866 error = _do_truncate(inode, iattr->ia_size); 867 if (unlikely(error)) 868 return error; 869 } 870 871 setattr_copy(inode, iattr); 872 mark_inode_dirty(inode); 873 return 0; 874} 875 876static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( 877 EXOFS_APAGE_FS_DATA, 878 EXOFS_ATTR_INODE_FILE_LAYOUT, 879 0); 880static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( 881 EXOFS_APAGE_FS_DATA, 882 EXOFS_ATTR_INODE_DIR_LAYOUT, 883 0); 884 885/* 886 * Read the Linux inode info from the OSD, and return it as is. In exofs the 887 * inode info is in an application specific page/attribute of the osd-object. 888 */ 889static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 890 struct exofs_fcb *inode) 891{ 892 struct exofs_sb_info *sbi = sb->s_fs_info; 893 struct osd_attr attrs[] = { 894 [0] = g_attr_inode_data, 895 [1] = g_attr_inode_file_layout, 896 [2] = g_attr_inode_dir_layout, 897 }; 898 struct exofs_io_state *ios; 899 struct exofs_on_disk_inode_layout *layout; 900 int ret; 901 902 ret = exofs_get_io_state(&sbi->layout, &ios); 903 if (unlikely(ret)) { 904 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 905 return ret; 906 } 907 908 ios->obj.id = exofs_oi_objno(oi); 909 exofs_make_credential(oi->i_cred, &ios->obj); 910 ios->cred = oi->i_cred; 911 912 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); 913 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); 914 915 ios->in_attr = attrs; 916 ios->in_attr_len = ARRAY_SIZE(attrs); 917 918 ret = exofs_sbi_read(ios); 919 if (unlikely(ret)) { 920 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", 921 _LLU(ios->obj.id), ret); 922 memset(inode, 0, sizeof(*inode)); 923 inode->i_mode = 0040000 | (0777 & ~022); 924 /* If object is lost on target we might as well enable it's 925 * delete. 926 */ 927 if ((ret == -ENOENT) || (ret == -EINVAL)) 928 ret = 0; 929 goto out; 930 } 931 932 ret = extract_attr_from_ios(ios, &attrs[0]); 933 if (ret) { 934 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 935 goto out; 936 } 937 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); 938 memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); 939 940 ret = extract_attr_from_ios(ios, &attrs[1]); 941 if (ret) { 942 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 943 goto out; 944 } 945 if (attrs[1].len) { 946 layout = attrs[1].val_ptr; 947 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { 948 EXOFS_ERR("%s: unsupported files layout %d\n", 949 __func__, layout->gen_func); 950 ret = -ENOTSUPP; 951 goto out; 952 } 953 } 954 955 ret = extract_attr_from_ios(ios, &attrs[2]); 956 if (ret) { 957 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 958 goto out; 959 } 960 if (attrs[2].len) { 961 layout = attrs[2].val_ptr; 962 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { 963 EXOFS_ERR("%s: unsupported meta-data layout %d\n", 964 __func__, layout->gen_func); 965 ret = -ENOTSUPP; 966 goto out; 967 } 968 } 969 970out: 971 exofs_put_io_state(ios); 972 return ret; 973} 974 975static void __oi_init(struct exofs_i_info *oi) 976{ 977 init_waitqueue_head(&oi->i_wq); 978 oi->i_flags = 0; 979} 980/* 981 * Fill in an inode read from the OSD and set it up for use 982 */ 983struct inode *exofs_iget(struct super_block *sb, unsigned long ino) 984{ 985 struct exofs_i_info *oi; 986 struct exofs_fcb fcb; 987 struct inode *inode; 988 int ret; 989 990 inode = iget_locked(sb, ino); 991 if (!inode) 992 return ERR_PTR(-ENOMEM); 993 if (!(inode->i_state & I_NEW)) 994 return inode; 995 oi = exofs_i(inode); 996 __oi_init(oi); 997 998 /* read the inode from the osd */ 999 ret = exofs_get_inode(sb, oi, &fcb); 1000 if (ret) 1001 goto bad_inode; 1002 1003 set_obj_created(oi); 1004 1005 /* copy stuff from on-disk struct to in-memory struct */ 1006 inode->i_mode = le16_to_cpu(fcb.i_mode); 1007 inode->i_uid = le32_to_cpu(fcb.i_uid); 1008 inode->i_gid = le32_to_cpu(fcb.i_gid); 1009 inode->i_nlink = le16_to_cpu(fcb.i_links_count); 1010 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); 1011 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); 1012 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); 1013 inode->i_ctime.tv_nsec = 1014 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; 1015 oi->i_commit_size = le64_to_cpu(fcb.i_size); 1016 i_size_write(inode, oi->i_commit_size); 1017 inode->i_blkbits = EXOFS_BLKSHIFT; 1018 inode->i_generation = le32_to_cpu(fcb.i_generation); 1019 1020 oi->i_dir_start_lookup = 0; 1021 1022 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { 1023 ret = -ESTALE; 1024 goto bad_inode; 1025 } 1026 1027 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1028 if (fcb.i_data[0]) 1029 inode->i_rdev = 1030 old_decode_dev(le32_to_cpu(fcb.i_data[0])); 1031 else 1032 inode->i_rdev = 1033 new_decode_dev(le32_to_cpu(fcb.i_data[1])); 1034 } else { 1035 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); 1036 } 1037 1038 if (S_ISREG(inode->i_mode)) { 1039 inode->i_op = &exofs_file_inode_operations; 1040 inode->i_fop = &exofs_file_operations; 1041 inode->i_mapping->a_ops = &exofs_aops; 1042 } else if (S_ISDIR(inode->i_mode)) { 1043 inode->i_op = &exofs_dir_inode_operations; 1044 inode->i_fop = &exofs_dir_operations; 1045 inode->i_mapping->a_ops = &exofs_aops; 1046 } else if (S_ISLNK(inode->i_mode)) { 1047 if (exofs_inode_is_fast_symlink(inode)) 1048 inode->i_op = &exofs_fast_symlink_inode_operations; 1049 else { 1050 inode->i_op = &exofs_symlink_inode_operations; 1051 inode->i_mapping->a_ops = &exofs_aops; 1052 } 1053 } else { 1054 inode->i_op = &exofs_special_inode_operations; 1055 if (fcb.i_data[0]) 1056 init_special_inode(inode, inode->i_mode, 1057 old_decode_dev(le32_to_cpu(fcb.i_data[0]))); 1058 else 1059 init_special_inode(inode, inode->i_mode, 1060 new_decode_dev(le32_to_cpu(fcb.i_data[1]))); 1061 } 1062 1063 unlock_new_inode(inode); 1064 return inode; 1065 1066bad_inode: 1067 iget_failed(inode); 1068 return ERR_PTR(ret); 1069} 1070 1071int __exofs_wait_obj_created(struct exofs_i_info *oi) 1072{ 1073 if (!obj_created(oi)) { 1074 BUG_ON(!obj_2bcreated(oi)); 1075 wait_event(oi->i_wq, obj_created(oi)); 1076 } 1077 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1078} 1079/* 1080 * Callback function from exofs_new_inode(). The important thing is that we 1081 * set the obj_created flag so that other methods know that the object exists on 1082 * the OSD. 1083 */ 1084static void create_done(struct exofs_io_state *ios, void *p) 1085{ 1086 struct inode *inode = p; 1087 struct exofs_i_info *oi = exofs_i(inode); 1088 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 1089 int ret; 1090 1091 ret = exofs_check_io(ios, NULL); 1092 exofs_put_io_state(ios); 1093 1094 atomic_dec(&sbi->s_curr_pending); 1095 1096 if (unlikely(ret)) { 1097 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1098 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1099 /*TODO: When FS is corrupted creation can fail, object already 1100 * exist. Get rid of this asynchronous creation, if exist 1101 * increment the obj counter and try the next object. Until we 1102 * succeed. All these dangling objects will be made into lost 1103 * files by chkfs.exofs 1104 */ 1105 } 1106 1107 set_obj_created(oi); 1108 1109 atomic_dec(&inode->i_count); 1110 wake_up(&oi->i_wq); 1111} 1112 1113/* 1114 * Set up a new inode and create an object for it on the OSD 1115 */ 1116struct inode *exofs_new_inode(struct inode *dir, int mode) 1117{ 1118 struct super_block *sb; 1119 struct inode *inode; 1120 struct exofs_i_info *oi; 1121 struct exofs_sb_info *sbi; 1122 struct exofs_io_state *ios; 1123 int ret; 1124 1125 sb = dir->i_sb; 1126 inode = new_inode(sb); 1127 if (!inode) 1128 return ERR_PTR(-ENOMEM); 1129 1130 oi = exofs_i(inode); 1131 __oi_init(oi); 1132 1133 set_obj_2bcreated(oi); 1134 1135 sbi = sb->s_fs_info; 1136 1137 sb->s_dirt = 1; 1138 inode_init_owner(inode, dir, mode); 1139 inode->i_ino = sbi->s_nextid++; 1140 inode->i_blkbits = EXOFS_BLKSHIFT; 1141 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1142 oi->i_commit_size = inode->i_size = 0; 1143 spin_lock(&sbi->s_next_gen_lock); 1144 inode->i_generation = sbi->s_next_generation++; 1145 spin_unlock(&sbi->s_next_gen_lock); 1146 insert_inode_hash(inode); 1147 1148 mark_inode_dirty(inode); 1149 1150 ret = exofs_get_io_state(&sbi->layout, &ios); 1151 if (unlikely(ret)) { 1152 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); 1153 return ERR_PTR(ret); 1154 } 1155 1156 ios->obj.id = exofs_oi_objno(oi); 1157 exofs_make_credential(oi->i_cred, &ios->obj); 1158 1159 /* increment the refcount so that the inode will still be around when we 1160 * reach the callback 1161 */ 1162 atomic_inc(&inode->i_count); 1163 1164 ios->done = create_done; 1165 ios->private = inode; 1166 ios->cred = oi->i_cred; 1167 ret = exofs_sbi_create(ios); 1168 if (ret) { 1169 atomic_dec(&inode->i_count); 1170 exofs_put_io_state(ios); 1171 return ERR_PTR(ret); 1172 } 1173 atomic_inc(&sbi->s_curr_pending); 1174 1175 return inode; 1176} 1177 1178/* 1179 * struct to pass two arguments to update_inode's callback 1180 */ 1181struct updatei_args { 1182 struct exofs_sb_info *sbi; 1183 struct exofs_fcb fcb; 1184}; 1185 1186/* 1187 * Callback function from exofs_update_inode(). 1188 */ 1189static void updatei_done(struct exofs_io_state *ios, void *p) 1190{ 1191 struct updatei_args *args = p; 1192 1193 exofs_put_io_state(ios); 1194 1195 atomic_dec(&args->sbi->s_curr_pending); 1196 1197 kfree(args); 1198} 1199 1200/* 1201 * Write the inode to the OSD. Just fill up the struct, and set the attribute 1202 * synchronously or asynchronously depending on the do_sync flag. 1203 */ 1204static int exofs_update_inode(struct inode *inode, int do_sync) 1205{ 1206 struct exofs_i_info *oi = exofs_i(inode); 1207 struct super_block *sb = inode->i_sb; 1208 struct exofs_sb_info *sbi = sb->s_fs_info; 1209 struct exofs_io_state *ios; 1210 struct osd_attr attr; 1211 struct exofs_fcb *fcb; 1212 struct updatei_args *args; 1213 int ret; 1214 1215 args = kzalloc(sizeof(*args), GFP_KERNEL); 1216 if (!args) { 1217 EXOFS_DBGMSG("Faild kzalloc of args\n"); 1218 return -ENOMEM; 1219 } 1220 1221 fcb = &args->fcb; 1222 1223 fcb->i_mode = cpu_to_le16(inode->i_mode); 1224 fcb->i_uid = cpu_to_le32(inode->i_uid); 1225 fcb->i_gid = cpu_to_le32(inode->i_gid); 1226 fcb->i_links_count = cpu_to_le16(inode->i_nlink); 1227 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 1228 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 1229 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 1230 oi->i_commit_size = i_size_read(inode); 1231 fcb->i_size = cpu_to_le64(oi->i_commit_size); 1232 fcb->i_generation = cpu_to_le32(inode->i_generation); 1233 1234 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1235 if (old_valid_dev(inode->i_rdev)) { 1236 fcb->i_data[0] = 1237 cpu_to_le32(old_encode_dev(inode->i_rdev)); 1238 fcb->i_data[1] = 0; 1239 } else { 1240 fcb->i_data[0] = 0; 1241 fcb->i_data[1] = 1242 cpu_to_le32(new_encode_dev(inode->i_rdev)); 1243 fcb->i_data[2] = 0; 1244 } 1245 } else 1246 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1247 1248 ret = exofs_get_io_state(&sbi->layout, &ios); 1249 if (unlikely(ret)) { 1250 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 1251 goto free_args; 1252 } 1253 1254 attr = g_attr_inode_data; 1255 attr.val_ptr = fcb; 1256 ios->out_attr_len = 1; 1257 ios->out_attr = &attr; 1258 1259 if (!obj_created(oi)) { 1260 EXOFS_DBGMSG("!obj_created\n"); 1261 BUG_ON(!obj_2bcreated(oi)); 1262 wait_event(oi->i_wq, obj_created(oi)); 1263 EXOFS_DBGMSG("wait_event done\n"); 1264 } 1265 1266 if (!do_sync) { 1267 args->sbi = sbi; 1268 ios->done = updatei_done; 1269 ios->private = args; 1270 } 1271 1272 ret = exofs_oi_write(oi, ios); 1273 if (!do_sync && !ret) { 1274 atomic_inc(&sbi->s_curr_pending); 1275 goto out; /* deallocation in updatei_done */ 1276 } 1277 1278 exofs_put_io_state(ios); 1279free_args: 1280 kfree(args); 1281out: 1282 EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", 1283 inode->i_ino, do_sync, ret); 1284 return ret; 1285} 1286 1287int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) 1288{ 1289 return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1290} 1291 1292/* 1293 * Callback function from exofs_delete_inode() - don't have much cleaning up to 1294 * do. 1295 */ 1296static void delete_done(struct exofs_io_state *ios, void *p) 1297{ 1298 struct exofs_sb_info *sbi = p; 1299 1300 exofs_put_io_state(ios); 1301 1302 atomic_dec(&sbi->s_curr_pending); 1303} 1304 1305/* 1306 * Called when the refcount of an inode reaches zero. We remove the object 1307 * from the OSD here. We make sure the object was created before we try and 1308 * delete it. 1309 */ 1310void exofs_evict_inode(struct inode *inode) 1311{ 1312 struct exofs_i_info *oi = exofs_i(inode); 1313 struct super_block *sb = inode->i_sb; 1314 struct exofs_sb_info *sbi = sb->s_fs_info; 1315 struct exofs_io_state *ios; 1316 int ret; 1317 1318 truncate_inode_pages(&inode->i_data, 0); 1319 1320 /* TODO: should do better here */ 1321 if (inode->i_nlink || is_bad_inode(inode)) 1322 goto no_delete; 1323 1324 inode->i_size = 0; 1325 end_writeback(inode); 1326 1327 /* if we are deleting an obj that hasn't been created yet, wait */ 1328 if (!obj_created(oi)) { 1329 BUG_ON(!obj_2bcreated(oi)); 1330 wait_event(oi->i_wq, obj_created(oi)); 1331 /* ignore the error attempt a remove anyway */ 1332 } 1333 1334 /* Now Remove the OSD objects */ 1335 ret = exofs_get_io_state(&sbi->layout, &ios); 1336 if (unlikely(ret)) { 1337 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1338 return; 1339 } 1340 1341 ios->obj.id = exofs_oi_objno(oi); 1342 ios->done = delete_done; 1343 ios->private = sbi; 1344 ios->cred = oi->i_cred; 1345 ret = exofs_sbi_remove(ios); 1346 if (ret) { 1347 EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); 1348 exofs_put_io_state(ios); 1349 return; 1350 } 1351 atomic_inc(&sbi->s_curr_pending); 1352 1353 return; 1354 1355no_delete: 1356 end_writeback(inode); 1357} 1358