/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_vnodeops.h"
#include "xfs_da_btree.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"

#include <linux/dcache.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * xfs_iozero
 *
 * xfs_iozero clears the specified range of the buffer supplied,
 * and marks all the affected blocks as valid and modified.  If
 * an affected block is not allocated, it will be allocated.  If
 * an affected block is not completely overwritten, and is not
 * valid before the operation, it will be read from disk before
 * being partially zeroed.
 */
STATIC int
xfs_iozero(
	struct xfs_inode	*ip,	/* inode			*/
	loff_t			pos,	/* offset in file		*/
	size_t			count)	/* size of data to zero		*/
{
	struct page		*page;
	struct address_space	*mapping;
	int			status;

	mapping = VFS_I(ip)->i_mapping;
	do {
		unsigned offset, bytes;
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1)); /* Within page */
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		status = pagecache_write_begin(NULL, mapping, pos, bytes,
					AOP_FLAG_UNINTERRUPTIBLE,
					&page, &fsdata);
		if (status)
			break;

		zero_user(page, offset, bytes);

		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
					page, fsdata);
		WARN_ON(status <= 0); /* can't return less than zero! */
		pos += bytes;
		count -= bytes;
		status = 0;
	} while (count);

	return (-status);
}

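/*
 * Worked example for xfs_iozero() (illustrative numbers only): zeroing
 * count = 6000 bytes at pos = 2000 with 4096-byte pages takes two
 * passes.  First pass: offset = 2000 & 4095 = 2000, bytes = 4096 -
 * 2000 = 2096.  Second pass: pos is now page aligned at 4096, so
 * offset = 0 and bytes = min(4096, 3904) = 3904.  Each pass goes
 * through pagecache_write_begin()/pagecache_write_end(), so block
 * allocation and read-modify-write of partially valid blocks happen
 * exactly as in the normal buffered write path.
 */
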
STATIC int
xfs_file_fsync(
	struct file		*file,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_trans	*tp;
	int			error = 0;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -XFS_ERROR(EIO);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	xfs_ioend_wait(ip);

	/*
	 * We always need to make sure that the required inode state is safe on
	 * disk.  The inode might be clean but we still might need to force the
	 * log because of committed transactions that haven't hit the disk yet.
	 * Likewise, there could be unflushed non-transactional changes to the
	 * inode core that have to go to disk and this requires us to issue
	 * a synchronous transaction to capture these changes correctly.
	 *
	 * This code relies on the assumption that if the i_update_core field
	 * of the inode is clear and the inode is unpinned then it is clean
	 * and no action is required.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);

	/*
	 * First check if the VFS inode is marked dirty.  All the dirtying
	 * of non-transactional updates now goes through mark_inode_dirty*,
	 * which allows us to distinguish between pure timestamp updates
	 * and i_size updates which need to be caught for fdatasync.
	 * After that also check for the dirty state in the XFS inode, which
	 * might get cleared when the inode gets written out via the AIL
	 * or xfs_iflush_cluster.
	 */
	if (((inode->i_state & I_DIRTY_DATASYNC) ||
	    ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
	    ip->i_update_core) {
		/*
		 * Kick off a transaction to log the inode core to get the
		 * updates.  The sync transaction will also force the log.
		 */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
		error = xfs_trans_reserve(tp, 0,
				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
		if (error) {
			xfs_trans_cancel(tp, 0);
			return -error;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);

		/*
		 * Note - it's possible that we might have pushed ourselves out
		 * of the way during trans_reserve which would flush the inode.
		 * But there's no guarantee that the inode buffer has actually
		 * gone out yet (it's delwri).  Plus the buffer could be pinned
		 * anyway if it's part of an inode in another recent
		 * transaction.  So we play it safe and fire off the
		 * transaction anyway.
		 */
		xfs_trans_ijoin(tp, ip);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_set_sync(tp);
		error = _xfs_trans_commit(tp, 0, &log_flushed);

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	} else {
		/*
		 * Timestamps/size haven't changed since last inode flush or
		 * inode transaction commit.  That means either nothing got
		 * written or a transaction committed which caught the updates.
		 * If the latter happened and the transaction hasn't hit the
		 * disk yet, the inode will still be pinned.  If it is,
		 * force the log.
		 */
		if (xfs_ipincount(ip)) {
			error = _xfs_log_force_lsn(ip->i_mount,
					ip->i_itemp->ili_last_lsn,
					XFS_LOG_SYNC, &log_flushed);
		}
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
	}

	if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
		/*
		 * If the log write didn't issue an ordered tag we need
		 * to flush the disk cache for the data device now.
		 */
		if (!log_flushed)
			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);

		/*
		 * If this inode is on the RT dev we need to flush that
		 * cache as well.
		 */
		if (XFS_IS_REALTIME_INODE(ip))
			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
	}

	return -error;
}

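/*
 * Summary of the paths through xfs_file_fsync() above: a dirty inode
 * core is captured by a synchronous XFS_TRANS_FSYNC_TS transaction,
 * which also forces the log; a clean but still pinned inode only needs
 * a log force up to ili_last_lsn; and on a barrier-enabled mount any
 * path that did not end up flushing the log falls back to flushing the
 * data device cache by hand (plus the realtime device cache, which the
 * log write never covers).
 */
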
STATIC ssize_t
xfs_file_aio_read(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = 0;
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;
	unsigned long		seg;

	XFS_STATS_INC(xs_read_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	/* START copy & waste from filemap.c */
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		size += iv->iov_len;
		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
			return -XFS_ERROR(EINVAL);
	}
	/* END copy & waste from filemap.c */

	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
		if ((iocb->ki_pos & target->bt_smask) ||
		    (size & target->bt_smask)) {
			if (iocb->ki_pos == ip->i_size)
				return 0;
			return -XFS_ERROR(EINVAL);
		}
	}

	n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (unlikely(ioflags & IO_ISDIRECT))
		mutex_lock(&inode->i_mutex);
	xfs_ilock(ip, XFS_IOLOCK_SHARED);

	if (unlikely(ioflags & IO_ISDIRECT)) {
		if (inode->i_mapping->nrpages) {
			ret = -xfs_flushinval_pages(ip,
					(iocb->ki_pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
		}
		mutex_unlock(&inode->i_mutex);
		if (ret) {
			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
			return ret;
		}
	}

	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);

	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);

	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*infilp,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			count,
	unsigned int		flags)
{
	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
	int			ioflags = 0;
	ssize_t			ret;

	XFS_STATS_INC(xs_read_calls);

	if (infilp->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	xfs_ilock(ip, XFS_IOLOCK_SHARED);

	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);

	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);

	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}

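/*
 * xfs_file_splice_write() mirrors the buffered write path: the pending
 * size extension is published in ip->i_new_size before the generic
 * splice code runs (so in-flight writeback can see how far EOF may
 * move), and ip->i_size/di_size are fixed back up afterwards whether
 * the splice succeeded or failed.
 */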
STATIC ssize_t
xfs_file_splice_write(
	struct pipe_inode_info	*pipe,
	struct file		*outfilp,
	loff_t			*ppos,
	size_t			count,
	unsigned int		flags)
{
	struct inode		*inode = outfilp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fsize_t		isize, new_size;
	int			ioflags = 0;
	ssize_t			ret;

	XFS_STATS_INC(xs_write_calls);

	if (outfilp->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	xfs_ilock(ip, XFS_IOLOCK_EXCL);

	new_size = *ppos + count;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);

	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
	if (ret > 0)
		XFS_STATS_ADD(xs_write_bytes, ret);

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
		*ppos = isize;

	if (*ppos > ip->i_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		if (*ppos > ip->i_size)
			ip->i_size = *ppos;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if (ip->i_new_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ip->i_new_size = 0;
		if (ip->i_d.di_size > ip->i_size)
			ip->i_d.di_size = ip->i_size;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return ret;
}

/*
 * This routine is called to handle zeroing any space in the last
 * block of the file that is beyond the EOF.  We do this since the
 * size is being increased without writing anything to that block
 * and we don't want anyone to read the garbage on the disk.
 */
STATIC int				/* error (positive) */
xfs_zero_last_block(
	xfs_inode_t	*ip,
	xfs_fsize_t	offset,
	xfs_fsize_t	isize)
{
	xfs_fileoff_t	last_fsb;
	xfs_mount_t	*mp = ip->i_mount;
	int		nimaps;
	int		zero_offset;
	int		zero_len;
	int		error = 0;
	xfs_bmbt_irec_t	imap;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	zero_offset = XFS_B_FSB_OFFSET(mp, isize);
	if (zero_offset == 0) {
		/*
		 * There are no extra bytes in the last block on disk to
		 * zero, so return.
		 */
		return 0;
	}

	last_fsb = XFS_B_TO_FSBT(mp, isize);
	nimaps = 1;
	error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
			  &nimaps, NULL);
	if (error)
		return error;
	ASSERT(nimaps > 0);
	/*
	 * If the block underlying isize is just a hole, then there
	 * is nothing to zero.
	 */
	if (imap.br_startblock == HOLESTARTBLOCK)
		return 0;
	/*
	 * Zero the part of the last block beyond the EOF, and write it
	 * out sync.  We need to drop the ilock while we do this so we
	 * don't deadlock when the buffer cache calls back to us.
	 */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	zero_len = mp->m_sb.sb_blocksize - zero_offset;
	if (isize + zero_len > offset)
		zero_len = offset - isize;
	error = xfs_iozero(ip, isize, zero_len);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(error >= 0);
	return error;
}

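/*
 * Worked example for xfs_zero_last_block() (illustrative numbers):
 * with 4096-byte blocks, isize = 10000 and a write starting at
 * offset = 50000, zero_offset = 10000 % 4096 = 1808 and zero_len =
 * 4096 - 1808 = 2288, so the bytes from isize to the end of block 2
 * are zeroed via xfs_iozero().  Had offset been 11000, inside the same
 * block, zero_len would have been capped at offset - isize = 1000.
 */
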
/*
 * Zero any on disk space between the current EOF and the new,
 * larger EOF.  This handles the normal case of zeroing the remainder
 * of the last block in the file and the unusual case of zeroing blocks
 * out beyond the size of the file.  This second case only happens
 * with fixed size extents and when the system crashes before the inode
 * size was updated but after blocks were allocated.  Holes and
 * unwritten extents in the range are skipped and left as they are.
 */

int					/* error (positive) */
xfs_zero_eof(
	xfs_inode_t	*ip,
	xfs_off_t	offset,		/* starting I/O offset */
	xfs_fsize_t	isize)		/* current inode size */
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	start_zero_fsb;
	xfs_fileoff_t	end_zero_fsb;
	xfs_fileoff_t	zero_count_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_fileoff_t	zero_off;
	xfs_fsize_t	zero_len;
	int		nimaps;
	int		error = 0;
	xfs_bmbt_irec_t	imap;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
	ASSERT(offset > isize);

	/*
	 * First handle zeroing the block on which isize resides.
	 * We only zero a part of that block so it is handled specially.
	 */
	error = xfs_zero_last_block(ip, offset, isize);
	if (error) {
		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
		return error;
	}

	/*
	 * Calculate the range between the new size and the old
	 * where blocks needing to be zeroed may exist.  To get the
	 * block where the last byte in the file currently resides,
	 * we need to subtract one from the size and truncate back
	 * to a block boundary.  We subtract 1 in case the size is
	 * exactly on a block boundary.
	 */
	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
	if (last_fsb == end_zero_fsb) {
		/*
		 * The size was only incremented on its last block.
		 * We took care of that above, so just return.
		 */
		return 0;
	}

	ASSERT(start_zero_fsb <= end_zero_fsb);
	while (start_zero_fsb <= end_zero_fsb) {
		nimaps = 1;
		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
		error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
				  0, NULL, 0, &imap, &nimaps, NULL);
		if (error) {
			ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
			return error;
		}
		ASSERT(nimaps > 0);

		if (imap.br_state == XFS_EXT_UNWRITTEN ||
		    imap.br_startblock == HOLESTARTBLOCK) {
			/*
			 * This extent is a hole or unwritten; reads from
			 * it return zeroes already, so there is nothing
			 * to do.  Skip past it.
			 */
			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
			continue;
		}

		/*
		 * There are blocks we need to zero.
		 * Drop the inode lock while we're doing the I/O.
		 * We'll still have the iolock to protect us.
		 */
		xfs_iunlock(ip, XFS_ILOCK_EXCL);

		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);

		if ((zero_off + zero_len) > offset)
			zero_len = offset - zero_off;

		error = xfs_iozero(ip, zero_off, zero_len);
		if (error)
			goto out_lock;

		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));

		xfs_ilock(ip, XFS_ILOCK_EXCL);
	}

	return 0;

out_lock:
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(error >= 0);
	return error;
}

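/*
 * Worked example for xfs_zero_eof() (illustrative numbers): with
 * 4096-byte blocks, growing a file from isize = 10000 to offset =
 * 50000 gives last_fsb = (10000 - 1) / 4096 = 2, start_zero_fsb =
 * roundup(10000 / 4096) = 3 and end_zero_fsb = (50000 - 1) / 4096 =
 * 12.  Block 2's tail was already zeroed by xfs_zero_last_block(), so
 * the loop walks blocks 3-12 and zeroes only the mapped, written
 * extents in that range.
 */
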
STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0, error = 0;
	int			ioflags = 0;
	xfs_fsize_t		isize, new_size;
	int			iolock;
	size_t			ocount = 0, count;
	int			need_i_mutex;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
	if (error)
		return error;

	count = ocount;
	if (count == 0)
		return 0;

	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

relock:
	if (ioflags & IO_ISDIRECT) {
		iolock = XFS_IOLOCK_SHARED;
		need_i_mutex = 0;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		need_i_mutex = 1;
		mutex_lock(&inode->i_mutex);
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			error = XFS_ERROR(EINVAL);
			goto out_unlock_mutex;
		}

		if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			iolock = XFS_IOLOCK_EXCL;
			need_i_mutex = 1;
			mutex_lock(&inode->i_mutex);
			xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
			goto start;
		}
	}

	new_size = pos + count;
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;

	if (likely(!(ioflags & IO_INVIS)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do.  First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */
	if (pos > ip->i_size) {
		error = xfs_zero_eof(ip, pos, ip->i_size);
		if (error) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			goto out_unlock_internal;
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */
	error = -file_remove_suid(file);
	if (unlikely(error))
		goto out_unlock_internal;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

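	/*
	 * Note on the branch below: a direct write first flushes and
	 * invalidates any cached pages over the range and then demotes
	 * the iolock to shared.  If generic_file_direct_write() stops
	 * short of count (e.g. the write lands in a hole), the rest of
	 * the request is retried through the buffered path by clearing
	 * IO_ISDIRECT and jumping back to relock.
	 */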
	if ((ioflags & IO_ISDIRECT)) {
		if (mapping->nrpages) {
			WARN_ON(need_i_mutex == 0);
			error = xfs_flushinval_pages(ip,
					(pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (error)
				goto out_unlock_internal;
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);

			iolock = XFS_IOLOCK_SHARED;
			need_i_mutex = 0;
		}

		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&nr_segs, pos, &iocb->ki_pos, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(ip, iolock);
			goto relock;
		}
	} else {
		int enospc = 0;
		ssize_t ret2 = 0;

write_retry:
		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
		ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
				pos, &iocb->ki_pos, count, ret);
		/*
		 * If we just got an ENOSPC, flush the inode now we
		 * aren't holding any page locks and retry *once*.
		 */
		if (ret2 == -ENOSPC && !enospc) {
			error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
			if (error)
				goto out_unlock_internal;
			enospc = 1;
			goto write_retry;
		}
		ret = ret2;
	}

	current->backing_dev_info = NULL;

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
		iocb->ki_pos = isize;

	if (iocb->ki_pos > ip->i_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		if (iocb->ki_pos > ip->i_size)
			ip->i_size = iocb->ki_pos;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
		loff_t end = pos + ret - 1;
		int error2;

		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);

		error2 = filemap_write_and_wait_range(mapping, pos, end);
		if (!error)
			error = error2;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);

		error2 = -xfs_file_fsync(file,
					(file->f_flags & __O_SYNC) ? 0 : 1);
		if (!error)
			error = error2;
	}

 out_unlock_internal:
	if (ip->i_new_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ip->i_new_size = 0;
		/*
		 * If this was a direct or synchronous I/O that failed (such
		 * as ENOSPC) then part of the I/O may have been written to
		 * disk before the error occurred.  In this case the on-disk
		 * file size may have been adjusted beyond the in-memory file
		 * size and now needs to be truncated back.
		 */
		if (ip->i_d.di_size > ip->i_size)
			ip->i_d.di_size = ip->i_size;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(ip, iolock);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
	return -error;
}

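/*
 * Reject opens of files larger than MAX_NON_LFS from processes that
 * did not pass O_LARGEFILE, and fail early if the filesystem has
 * already been shut down.
 */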
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
		return -EFBIG;
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
	xfs_iunlock(ip, mode);
	return 0;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return -xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*filp,
	void		*dirent,
	filldir_t	filldir)
{
	struct inode	*inode = filp->f_path.dentry->d_inode;
	xfs_inode_t	*ip = XFS_I(inode);
	int		error;
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer
	 * we read into down to the filesystem.  With the filldir concept
	 * it's not needed for correct information, but the XFS dir2 leaf
	 * code wants an estimate of the buffer size to calculate its
	 * readahead window and size the buffers used for mapping to
	 * physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);

	error = xfs_readdir(ip, dirent, bufsize,
				(xfs_off_t *)&filp->f_pos, filldir);
	if (error)
		return -error;
	return 0;
}

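/*
 * Example of the bufsize heuristic above (illustrative numbers): a
 * directory with di_size = 4096 gets a 4096-byte estimate, while any
 * directory of 32k or more is clamped to 32768 bytes, the buffer size
 * glibc's readdir() used at the time.
 */
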
STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
	vma->vm_ops = &xfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;

	file_accessed(filp);
	return 0;
}

/*
 * An mmap()'d file has taken a write protection fault and is being made
 * writable.  We can set the page state up correctly for a writable
 * page, which means we can do correct delalloc accounting (ENOSPC
 * checking!) and unwritten extent mapping.
 */
STATIC int
xfs_vm_page_mkwrite(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}

const struct file_operations xfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= xfs_file_aio_read,
	.aio_write	= xfs_file_aio_write,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= xfs_file_splice_write,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.readdir	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_file_fsync,
};

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= xfs_vm_page_mkwrite,
};