1/* 2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18#include "xfs.h" 19#include "xfs_fs.h" 20#include "xfs_bit.h" 21#include "xfs_log.h" 22#include "xfs_inum.h" 23#include "xfs_trans.h" 24#include "xfs_sb.h" 25#include "xfs_ag.h" 26#include "xfs_dir.h" 27#include "xfs_dir2.h" 28#include "xfs_alloc.h" 29#include "xfs_dmapi.h" 30#include "xfs_quota.h" 31#include "xfs_mount.h" 32#include "xfs_bmap_btree.h" 33#include "xfs_alloc_btree.h" 34#include "xfs_ialloc_btree.h" 35#include "xfs_dir_sf.h" 36#include "xfs_dir2_sf.h" 37#include "xfs_attr_sf.h" 38#include "xfs_dinode.h" 39#include "xfs_inode.h" 40#include "xfs_bmap.h" 41#include "xfs_btree.h" 42#include "xfs_ialloc.h" 43#include "xfs_rtalloc.h" 44#include "xfs_error.h" 45#include "xfs_itable.h" 46#include "xfs_rw.h" 47#include "xfs_acl.h" 48#include "xfs_cap.h" 49#include "xfs_mac.h" 50#include "xfs_attr.h" 51#include "xfs_inode_item.h" 52#include "xfs_buf_item.h" 53#include "xfs_utils.h" 54#include "xfs_iomap.h" 55 56#if defined(XFS_RW_TRACE) 57void 58xfs_rw_enter_trace( 59 int tag, 60 xfs_iocore_t *io, 61 const char *buf, 62 size_t size, 63 loff_t offset, 64 int ioflags) 65{ 66 xfs_inode_t *ip = XFS_IO_INODE(io); 67 68 if (ip->i_rwtrace == NULL) 69 return; 70 ktrace_enter(ip->i_rwtrace, 71 (void *)(unsigned long)tag, 72 (void *)ip, 73 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), 74 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), 75 (void *)(__psint_t)buf, 76 (void *)((unsigned long)size), 77 (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 78 (void *)((unsigned long)(offset & 0xffffffff)), 79 (void *)((unsigned long)ioflags), 80 (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), 81 (void *)((unsigned long)(io->io_new_size & 0xffffffff)), 82 (void *)NULL, 83 (void *)NULL, 84 (void *)NULL, 85 (void *)NULL, 86 (void *)NULL); 87} 88 89void 90xfs_inval_cached_trace( 91 xfs_iocore_t *io, 92 xfs_off_t offset, 93 xfs_off_t len, 94 xfs_off_t first, 95 xfs_off_t last) 96{ 97 xfs_inode_t *ip = XFS_IO_INODE(io); 98 99 if (ip->i_rwtrace == NULL) 100 return; 101 ktrace_enter(ip->i_rwtrace, 102 (void *)(__psint_t)XFS_INVAL_CACHED, 103 (void *)ip, 104 (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 105 (void *)((unsigned long)(offset & 0xffffffff)), 106 (void *)((unsigned long)((len >> 32) & 0xffffffff)), 107 (void *)((unsigned long)(len & 0xffffffff)), 108 (void *)((unsigned long)((first >> 32) & 0xffffffff)), 109 (void *)((unsigned long)(first & 0xffffffff)), 110 (void *)((unsigned long)((last >> 32) & 0xffffffff)), 111 (void *)((unsigned long)(last & 0xffffffff)), 112 (void *)NULL, 113 (void *)NULL, 114 (void *)NULL, 115 (void *)NULL, 116 (void *)NULL, 117 (void *)NULL); 118} 119#endif 120 121/* 122 * xfs_iozero 123 * 124 * xfs_iozero clears the specified range of buffer supplied, 125 * and marks all the affected blocks as valid and modified. If 126 * an affected block is not allocated, it will be allocated. If 127 * an affected block is not completely overwritten, and is not 128 * valid before the operation, it will be read from disk before 129 * being partially zeroed. 130 */ 131STATIC int 132xfs_iozero( 133 xfs_vnode_t *vp, /* vnode */ 134 xfs_off_t pos, /* offset in file */ 135 size_t count, /* size of data to zero */ 136 xfs_off_t end_size) /* max file size to set */ 137{ 138 int status; 139 status = 0; /* XXXKAN: */ 140#ifdef XXXKAN 141 unsigned bytes; 142 struct page *page; 143 struct address_space *mapping; 144 char *kaddr; 145 146 mapping = ip->i_mapping; 147 do { 148 unsigned long index, offset; 149 150 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 151 index = pos >> PAGE_CACHE_SHIFT; 152 bytes = PAGE_CACHE_SIZE - offset; 153 if (bytes > count) 154 bytes = count; 155 156 status = -ENOMEM; 157 page = grab_cache_page(mapping, index); 158 if (!page) 159 break; 160 161 kaddr = kmap(page); 162 status = mapping->a_ops->prepare_write(NULL, page, offset, 163 offset + bytes); 164 if (status) { 165 goto unlock; 166 } 167 168 memset((void *) (kaddr + offset), 0, bytes); 169 flush_dcache_page(page); 170 status = mapping->a_ops->commit_write(NULL, page, offset, 171 offset + bytes); 172 if (!status) { 173 pos += bytes; 174 count -= bytes; 175 if (pos > i_size_read(ip)) 176 i_size_write(ip, pos < end_size ? pos : end_size); 177 } 178 179unlock: 180 kunmap(page); 181 unlock_page(page); 182 page_cache_release(page); 183 if (status) 184 break; 185 } while (count); 186#endif 187 return (-status); 188} 189 190ssize_t /* bytes read, or (-) error */ 191xfs_read( 192 bhv_desc_t *bdp, 193 uio_t *uio, 194 int ioflags, 195 cred_t *credp) 196{ 197 ssize_t ret, size; 198 xfs_fsize_t n; 199 xfs_inode_t *ip; 200 xfs_mount_t *mp; 201 202 ip = XFS_BHVTOI(bdp); 203 mp = ip->i_mount; 204 205 XFS_STATS_INC(xs_read_calls); 206 207 if (unlikely(ioflags & IO_ISDIRECT)) { 208 if (((__psint_t)buf & BBMASK) || 209 (uio->uio_offset & mp->m_blockmask) || 210 (uio->uio_resid & mp->m_blockmask)) { 211 if (uio->uio_offset >= ip->i_d.di_size) { 212 return (0); 213 } 214 return EINVAL; 215 } 216 } 217 218 if (uio->uio_resid == 0) 219 return 0; 220 n = XFS_MAXIOFFSET(mp) - uio->uio_offset; 221 if (n <= 0) 222 return EFBIG; 223 224 size = (n < uio->uio_resid)? n : uio->uio_resid; 225 226 if (XFS_FORCED_SHUTDOWN(mp)) { 227 return EIO; 228 } 229 230 xfs_ilock(ip, XFS_IOLOCK_SHARED); 231 232#ifdef XXX 233 if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && 234 !(ioflags & IO_INVIS)) { 235 int error; 236 vrwlock_t locktype = VRWLOCK_READ; 237 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); 238 239 error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), 240 uio->uio_offset, size, dmflags, &locktype); 241 if (error) { 242 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 243 return (error); 244 } 245 } 246#endif 247 248 ret = xfs_read_file(mp, ip, uio, ioflags); 249 250 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 251 252 XFS_STATS_ADD(xs_read_bytes, ret); 253 254 if (likely((ioflags & IO_INVIS) == 0)) { 255 xfs_ichgtime(ip, XFS_ICHGTIME_ACC); 256 } 257 258 return ret; 259} 260 261/* 262 * This routine is called to handle zeroing any space in the last 263 * block of the file that is beyond the EOF. We do this since the 264 * size is being increased without writing anything to that block 265 * and we don't want anyone to read the garbage on the disk. 266 */ 267STATIC int /* error (positive) */ 268xfs_zero_last_block( 269 xfs_vnode_t *vp, 270 xfs_iocore_t *io, 271 xfs_fsize_t isize, 272 xfs_fsize_t end_size) 273{ 274 xfs_fileoff_t last_fsb; 275 xfs_mount_t *mp; 276 int nimaps; 277 int zero_offset; 278 int zero_len; 279 int error = 0; 280 xfs_bmbt_irec_t imap; 281 xfs_off_t loff; 282 283 ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); 284 285 mp = io->io_mount; 286 287 zero_offset = XFS_B_FSB_OFFSET(mp, isize); 288 if (zero_offset == 0) { 289 /* 290 * There are no extra bytes in the last block on disk to 291 * zero, so return. 292 */ 293 return 0; 294 } 295 296 last_fsb = XFS_B_TO_FSBT(mp, isize); 297 nimaps = 1; 298 error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, 299 &nimaps, NULL, NULL); 300 if (error) { 301 return error; 302 } 303 ASSERT(nimaps > 0); 304 /* 305 * If the block underlying isize is just a hole, then there 306 * is nothing to zero. 307 */ 308 if (imap.br_startblock == HOLESTARTBLOCK) { 309 return 0; 310 } 311 /* 312 * Zero the part of the last block beyond the EOF, and write it 313 * out sync. We need to drop the ilock while we do this so we 314 * don't deadlock when the buffer cache calls back to us. 315 */ 316 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); 317 loff = XFS_FSB_TO_B(mp, last_fsb); 318 319 zero_len = mp->m_sb.sb_blocksize - zero_offset; 320 321 error = xfs_iozero(vp, loff + zero_offset, zero_len, end_size); 322 323 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 324 ASSERT(error >= 0); 325 return error; 326} 327 328/* 329 * Zero any on disk space between the current EOF and the new, 330 * larger EOF. This handles the normal case of zeroing the remainder 331 * of the last block in the file and the unusual case of zeroing blocks 332 * out beyond the size of the file. This second case only happens 333 * with fixed size extents and when the system crashes before the inode 334 * size was updated but after blocks were allocated. If fill is set, 335 * then any holes in the range are filled and zeroed. If not, the holes 336 * are left alone as holes. 337 */ 338 339int /* error (positive) */ 340xfs_zero_eof( 341 xfs_vnode_t *vp, 342 xfs_iocore_t *io, 343 xfs_off_t offset, /* starting I/O offset */ 344 xfs_fsize_t isize, /* current inode size */ 345 xfs_fsize_t end_size) /* terminal inode size */ 346{ 347 xfs_fileoff_t start_zero_fsb; 348 xfs_fileoff_t end_zero_fsb; 349 xfs_fileoff_t zero_count_fsb; 350 xfs_fileoff_t last_fsb; 351 xfs_extlen_t buf_len_fsb; 352 xfs_mount_t *mp; 353 int nimaps; 354 int error = 0; 355 xfs_bmbt_irec_t imap; 356 357 ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 358 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 359 ASSERT(offset > isize); 360 361 mp = io->io_mount; 362 363 /* 364 * First handle zeroing the block on which isize resides. 365 * We only zero a part of that block so it is handled specially. 366 */ 367 error = xfs_zero_last_block(vp, io, isize, end_size); 368 if (error) { 369 ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 370 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 371 return error; 372 } 373 374 /* 375 * Calculate the range between the new size and the old 376 * where blocks needing to be zeroed may exist. To get the 377 * block where the last byte in the file currently resides, 378 * we need to subtract one from the size and truncate back 379 * to a block boundary. We subtract 1 in case the size is 380 * exactly on a block boundary. 381 */ 382 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; 383 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 384 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); 385 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); 386 if (last_fsb == end_zero_fsb) { 387 /* 388 * The size was only incremented on its last block. 389 * We took care of that above, so just return. 390 */ 391 return 0; 392 } 393 394 ASSERT(start_zero_fsb <= end_zero_fsb); 395 while (start_zero_fsb <= end_zero_fsb) { 396 nimaps = 1; 397 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 398 error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, 399 0, NULL, 0, &imap, &nimaps, NULL, NULL); 400 if (error) { 401 ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 402 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 403 return error; 404 } 405 ASSERT(nimaps > 0); 406 407 if (imap.br_state == XFS_EXT_UNWRITTEN || 408 imap.br_startblock == HOLESTARTBLOCK) { 409 /* 410 * This loop handles initializing pages that were 411 * partially initialized by the code below this 412 * loop. It basically zeroes the part of the page 413 * that sits on a hole and sets the page as P_HOLE 414 * and calls remapf if it is a mapped file. 415 */ 416 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 417 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 418 continue; 419 } 420 421 /* 422 * There are blocks in the range requested. 423 * Zero them a single write at a time. We actually 424 * don't zero the entire range returned if it is 425 * too big and simply loop around to get the rest. 426 * That is not the most efficient thing to do, but it 427 * is simple and this path should not be exercised often. 428 */ 429 buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount, 430 mp->m_writeio_blocks << 8); 431 /* 432 * Drop the inode lock while we're doing the I/O. 433 * We'll still have the iolock to protect us. 434 */ 435 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 436 437 error = xfs_iozero(vp, 438 XFS_FSB_TO_B(mp, start_zero_fsb), 439 XFS_FSB_TO_B(mp, buf_len_fsb), 440 end_size); 441 442 if (error) { 443 goto out_lock; 444 } 445 446 start_zero_fsb = imap.br_startoff + buf_len_fsb; 447 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 448 449 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 450 } 451 452 return 0; 453 454out_lock: 455 456 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 457 ASSERT(error >= 0); 458 return error; 459} 460 461ssize_t /* bytes written, or (-) error */ 462xfs_write( 463 bhv_desc_t *bdp, 464 uio_t *uio, 465 int ioflag, 466 cred_t *credp) 467{ 468 xfs_inode_t *xip; 469 xfs_mount_t *mp; 470 ssize_t ret = 0; 471 int error = 0; 472 xfs_fsize_t isize, new_size; 473 xfs_fsize_t n, limit; 474 xfs_fsize_t size; 475 xfs_iocore_t *io; 476 xfs_vnode_t *vp; 477 int iolock; 478 //int eventsent = 0; 479 vrwlock_t locktype; 480 xfs_off_t offset_c; 481 xfs_off_t *offset; 482 xfs_off_t pos; 483 484 XFS_STATS_INC(xs_write_calls); 485 486 vp = BHV_TO_VNODE(bdp); 487 xip = XFS_BHVTOI(bdp); 488 489 io = &xip->i_iocore; 490 mp = io->io_mount; 491 492 if (XFS_FORCED_SHUTDOWN(xip->i_mount)) { 493 return EIO; 494 } 495 496 size = uio->uio_resid; 497 pos = offset_c = uio->uio_offset; 498 offset = &offset_c; 499 500 if (unlikely(ioflag & IO_ISDIRECT)) { 501 if (((__psint_t)buf & BBMASK) || 502 (*offset & mp->m_blockmask) || 503 (size & mp->m_blockmask)) { 504 return EINVAL; 505 } 506 iolock = XFS_IOLOCK_SHARED; 507 locktype = VRWLOCK_WRITE_DIRECT; 508 } else { 509 if (io->io_flags & XFS_IOCORE_RT) 510 return EINVAL; 511 iolock = XFS_IOLOCK_EXCL; 512 locktype = VRWLOCK_WRITE; 513 } 514 515 iolock = XFS_IOLOCK_EXCL; 516 locktype = VRWLOCK_WRITE; 517 518 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); 519 520 isize = xip->i_d.di_size; 521 limit = XFS_MAXIOFFSET(mp); 522 523 if (ioflag & O_APPEND) 524 *offset = isize; 525 526//start: 527 n = limit - *offset; 528 if (n <= 0) { 529 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 530 return EFBIG; 531 } 532 if (n < size) 533 size = n; 534 535 new_size = *offset + size; 536 if (new_size > isize) { 537 io->io_new_size = new_size; 538 } 539 540#ifdef RMC 541 /* probably be a long time before if ever that we do dmapi */ 542 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && 543 !(ioflags & IO_INVIS) && !eventsent)) { 544 loff_t savedsize = *offset; 545 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); 546 547 xfs_iunlock(xip, XFS_ILOCK_EXCL); 548 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, 549 *offset, size, 550 dmflags, &locktype); 551 if (error) { 552 if (iolock) xfs_iunlock(xip, iolock); 553 return -error; 554 } 555 xfs_ilock(xip, XFS_ILOCK_EXCL); 556 eventsent = 1; 557 558 /* 559 * The iolock was dropped and reaquired in XFS_SEND_DATA 560 * so we have to recheck the size when appending. 561 * We will only "goto start;" once, since having sent the 562 * event prevents another call to XFS_SEND_DATA, which is 563 * what allows the size to change in the first place. 564 */ 565 if ((file->f_flags & O_APPEND) && 566 savedsize != xip->i_d.di_size) { 567 *offset = isize = xip->i_d.di_size; 568 goto start; 569 } 570 } 571#endif 572 573 /* 574 * If the offset is beyond the size of the file, we have a couple 575 * of things to do. First, if there is already space allocated 576 * we need to either create holes or zero the disk or ... 577 * 578 * If there is a page where the previous size lands, we need 579 * to zero it out up to the new size. 580 */ 581 582 if (!(ioflag & IO_ISDIRECT) && (*offset > isize && isize)) { 583 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, 584 isize, *offset + size); 585 if (error) { 586 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 587 return(-error); 588 } 589 } 590 xfs_iunlock(xip, XFS_ILOCK_EXCL); 591 592#if 0 593 /* 594 * If we're writing the file then make sure to clear the 595 * setuid and setgid bits if the process is not being run 596 * by root. This keeps people from modifying setuid and 597 * setgid binaries. 598 */ 599 600 if (((xip->i_d.di_mode & S_ISUID) || 601 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == 602 (S_ISGID | S_IXGRP))) && 603 !capable(CAP_FSETID)) { 604 error = xfs_write_clear_setuid(xip); 605 if (likely(!error)) 606 error = -remove_suid(file->f_dentry); 607 if (unlikely(error)) { 608 xfs_iunlock(xip, iolock); 609 goto out_unlock_mutex; 610 } 611 } 612#endif 613 614 615//retry: 616 if (unlikely(ioflag & IO_ISDIRECT)) { 617 618#ifdef RMC 619 xfs_off_t pos = *offset; 620 struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 621 struct inode *inode = mapping->host; 622 623 ret = precheck_file_write(file, inode, &size, &pos); 624 if (ret || size == 0) 625 goto error; 626 627 xfs_inval_cached_pages(vp, io, pos, 1, 1); 628 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 629 /* mark_inode_dirty_sync(inode); - we do this later */ 630 631 xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, buf, size, pos, ioflags); 632 ret = generic_file_direct_IO(WRITE, file, (char *)buf, size, pos); 633 xfs_inval_cached_pages(vp, io, pos, 1, 1); 634 if (ret > 0) 635 *offset += ret; 636#endif 637 } else { 638 xfs_rw_enter_trace(XFS_WRITE_ENTER, io, buf, size, *offset, ioflags); 639 ret = xfs_write_file(xip,uio,ioflag); 640 } 641 642 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 643 644 645//error: 646 if (ret <= 0) { 647 if (iolock) 648 xfs_rwunlock(bdp, locktype); 649 return ret; 650 } 651 652 XFS_STATS_ADD(xs_write_bytes, ret); 653 654 if (*offset > xip->i_d.di_size) { 655 xfs_ilock(xip, XFS_ILOCK_EXCL); 656 if (*offset > xip->i_d.di_size) { 657 printf("xfs_write look at doing more here %s:%d\n",__FILE__,__LINE__); 658#ifdef RMC 659 struct inode *inode = LINVFS_GET_IP(vp); 660 i_size_write(inode, *offset); 661 mark_inode_dirty_sync(inode); 662#endif 663 664 xip->i_d.di_size = *offset; 665 xip->i_update_core = 1; 666 xip->i_update_size = 1; 667 } 668 xfs_iunlock(xip, XFS_ILOCK_EXCL); 669 } 670 671 /* Handle various SYNC-type writes */ 672#if 0 673// if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 674#endif 675 if (ioflag & IO_SYNC) { 676 /* 677 * If we're treating this as O_DSYNC and we have not updated the 678 * size, force the log. 679 */ 680 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && 681 !(xip->i_update_size)) { 682 xfs_inode_log_item_t *iip = xip->i_itemp; 683 684 /* 685 * If an allocation transaction occurred 686 * without extending the size, then we have to force 687 * the log up the proper point to ensure that the 688 * allocation is permanent. We can't count on 689 * the fact that buffered writes lock out direct I/O 690 * writes - the direct I/O write could have extended 691 * the size nontransactionally, then finished before 692 * we started. xfs_write_file will think that the file 693 * didn't grow but the update isn't safe unless the 694 * size change is logged. 695 * 696 * Force the log if we've committed a transaction 697 * against the inode or if someone else has and 698 * the commit record hasn't gone to disk (e.g. 699 * the inode is pinned). This guarantees that 700 * all changes affecting the inode are permanent 701 * when we return. 702 */ 703 if (iip && iip->ili_last_lsn) { 704 xfs_log_force(mp, iip->ili_last_lsn, 705 XFS_LOG_FORCE | XFS_LOG_SYNC); 706 } else if (xfs_ipincount(xip) > 0) { 707 xfs_log_force(mp, (xfs_lsn_t)0, 708 XFS_LOG_FORCE | XFS_LOG_SYNC); 709 } 710 711 } else { 712 xfs_trans_t *tp; 713 714 /* 715 * O_SYNC or O_DSYNC _with_ a size update are handled 716 * the same way. 717 * 718 * If the write was synchronous then we need to make 719 * sure that the inode modification time is permanent. 720 * We'll have updated the timestamp above, so here 721 * we use a synchronous transaction to log the inode. 722 * It's not fast, but it's necessary. 723 * 724 * If this a dsync write and the size got changed 725 * non-transactionally, then we need to ensure that 726 * the size change gets logged in a synchronous 727 * transaction. 728 */ 729 730 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); 731 if ((error = xfs_trans_reserve(tp, 0, 732 XFS_SWRITE_LOG_RES(mp), 733 0, 0, 0))) { 734 /* Transaction reserve failed */ 735 xfs_trans_cancel(tp, 0); 736 } else { 737 /* Transaction reserve successful */ 738 xfs_ilock(xip, XFS_ILOCK_EXCL); 739 xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); 740 xfs_trans_ihold(tp, xip); 741 xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); 742 xfs_trans_set_sync(tp); 743 error = xfs_trans_commit(tp, 0, NULL); 744 xfs_iunlock(xip, XFS_ILOCK_EXCL); 745 } 746 if (error) 747 goto out_unlock_internal; 748 } 749 750 xfs_rwunlock(bdp, locktype); 751 return ret; 752 753 } /* (ioflags & O_SYNC) */ 754 755out_unlock_internal: 756 xfs_rwunlock(bdp, locktype); 757#if 0 758out_unlock_mutex: 759 if (need_i_mutex) 760 mutex_unlock(&inode->i_mutex); 761#endif 762 //out_nounlocks: 763 return -error; 764} 765 766/* 767 * Initiate IO on given buffer. 768 */ 769int 770xfs_buf_iorequest(struct xfs_buf *bp) 771{ 772 bp->b_flags &= ~(B_INVAL|B_DONE); 773 bp->b_ioflags &= ~BIO_ERROR; 774 775 if (bp->b_flags & B_ASYNC) 776 BUF_KERNPROC(bp); 777 778 if (bp->b_vp == NULL) { 779 if (bp->b_iocmd == BIO_WRITE) { 780 bp->b_flags &= ~(B_DELWRI | B_DEFERRED); 781 bufobj_wref(bp->b_bufobj); 782 } 783 784 bp->b_iooffset = (bp->b_blkno << BBSHIFT); 785 bstrategy(bp); 786 } else { 787 if (bp->b_iocmd == BIO_WRITE) { 788 /* Mark the buffer clean */ 789 bundirty(bp); 790 bufobj_wref(bp->b_bufobj); 791 vfs_busy_pages(bp, 1); 792 } else if (bp->b_iocmd == BIO_READ) { 793 vfs_busy_pages(bp, 0); 794 } 795 bp->b_iooffset = dbtob(bp->b_blkno); 796 bstrategy(bp); 797 } 798 return 0; 799} 800 801/* 802 * All xfs metadata buffers except log state machine buffers 803 * get this attached as their b_bdstrat callback function. 804 * This is so that we can catch a buffer 805 * after prematurely unpinning it to forcibly shutdown the filesystem. 806 */ 807int 808xfs_bdstrat_cb(struct xfs_buf *bp) 809{ 810 xfs_mount_t *mp; 811 812 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); 813 if (!XFS_FORCED_SHUTDOWN(mp)) { 814 xfs_buf_iorequest(bp); 815 return 0; 816 } else { 817 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 818 /* 819 * Metadata write that didn't get logged but 820 * written delayed anyway. These aren't associated 821 * with a transaction, and can be ignored. 822 */ 823 if (XFS_BUF_IODONE_FUNC(bp) == NULL && 824 (XFS_BUF_ISREAD(bp)) == 0) 825 return (xfs_bioerror_relse(bp)); 826 else 827 return (xfs_bioerror(bp)); 828 } 829} 830 831 832int 833xfs_bmap(bhv_desc_t *bdp, 834 xfs_off_t offset, 835 ssize_t count, 836 int flags, 837 xfs_iomap_t *iomapp, 838 int *niomaps) 839{ 840 xfs_inode_t *ip = XFS_BHVTOI(bdp); 841 xfs_iocore_t *io = &ip->i_iocore; 842 843 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 844 ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == 845 ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); 846 847 return xfs_iomap(io, offset, count, flags, iomapp, niomaps); 848} 849 850/* 851 * Wrapper around bdstrat so that we can stop data 852 * from going to disk in case we are shutting down the filesystem. 853 * Typically user data goes thru this path; one of the exceptions 854 * is the superblock. 855 */ 856int 857xfsbdstrat( 858 struct xfs_mount *mp, 859 struct xfs_buf *bp) 860{ 861 ASSERT(mp); 862 if (!XFS_FORCED_SHUTDOWN(mp)) { 863 864 xfs_buf_iorequest(bp); 865 return 0; 866 } 867 868 xfs_buftrace("XFSBDSTRAT IOERROR", bp); 869 return (xfs_bioerror_relse(bp)); 870} 871 872/* 873 * If the underlying (data/log/rt) device is readonly, there are some 874 * operations that cannot proceed. 875 */ 876int 877xfs_dev_is_read_only( 878 xfs_mount_t *mp, 879 char *message) 880{ 881 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 882 xfs_readonly_buftarg(mp->m_logdev_targp) || 883 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 884 cmn_err(CE_NOTE, 885 "XFS: %s required on read-only device.", message); 886 cmn_err(CE_NOTE, 887 "XFS: write access unavailable, cannot proceed."); 888 return EROFS; 889 } 890 return 0; 891} 892