1153323Srodrigc/* 2159451Srodrigc * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 3159451Srodrigc * All Rights Reserved. 4153323Srodrigc * 5159451Srodrigc * This program is free software; you can redistribute it and/or 6159451Srodrigc * modify it under the terms of the GNU General Public License as 7153323Srodrigc * published by the Free Software Foundation. 8153323Srodrigc * 9159451Srodrigc * This program is distributed in the hope that it would be useful, 10159451Srodrigc * but WITHOUT ANY WARRANTY; without even the implied warranty of 11159451Srodrigc * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12159451Srodrigc * GNU General Public License for more details. 13153323Srodrigc * 14159451Srodrigc * You should have received a copy of the GNU General Public License 15159451Srodrigc * along with this program; if not, write the Free Software Foundation, 16159451Srodrigc * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17153323Srodrigc */ 18153323Srodrigc#include "xfs.h" 19153323Srodrigc#include "xfs_fs.h" 20159451Srodrigc#include "xfs_bit.h" 21159451Srodrigc#include "xfs_log.h" 22153323Srodrigc#include "xfs_inum.h" 23153323Srodrigc#include "xfs_trans.h" 24153323Srodrigc#include "xfs_sb.h" 25153323Srodrigc#include "xfs_ag.h" 26153323Srodrigc#include "xfs_dir.h" 27153323Srodrigc#include "xfs_dir2.h" 28153323Srodrigc#include "xfs_alloc.h" 29153323Srodrigc#include "xfs_dmapi.h" 30153323Srodrigc#include "xfs_quota.h" 31153323Srodrigc#include "xfs_mount.h" 32159451Srodrigc#include "xfs_bmap_btree.h" 33153323Srodrigc#include "xfs_alloc_btree.h" 34153323Srodrigc#include "xfs_ialloc_btree.h" 35153323Srodrigc#include "xfs_dir_sf.h" 36153323Srodrigc#include "xfs_dir2_sf.h" 37159451Srodrigc#include "xfs_attr_sf.h" 38153323Srodrigc#include "xfs_dinode.h" 39153323Srodrigc#include "xfs_inode.h" 40153323Srodrigc#include "xfs_bmap.h" 41159451Srodrigc#include "xfs_btree.h" 42159451Srodrigc#include "xfs_ialloc.h" 43153323Srodrigc#include "xfs_rtalloc.h" 44153323Srodrigc#include "xfs_error.h" 45153323Srodrigc#include "xfs_itable.h" 46153323Srodrigc#include "xfs_rw.h" 47153323Srodrigc#include "xfs_acl.h" 48153323Srodrigc#include "xfs_cap.h" 49153323Srodrigc#include "xfs_mac.h" 50153323Srodrigc#include "xfs_attr.h" 51153323Srodrigc#include "xfs_inode_item.h" 52153323Srodrigc#include "xfs_buf_item.h" 53153323Srodrigc#include "xfs_utils.h" 54153323Srodrigc#include "xfs_iomap.h" 55153323Srodrigc 56153323Srodrigc#if defined(XFS_RW_TRACE) 57153323Srodrigcvoid 58153323Srodrigcxfs_rw_enter_trace( 59153323Srodrigc int tag, 60153323Srodrigc xfs_iocore_t *io, 61153323Srodrigc const char *buf, 62153323Srodrigc size_t size, 63153323Srodrigc loff_t offset, 64153323Srodrigc int ioflags) 65153323Srodrigc{ 66153323Srodrigc xfs_inode_t *ip = XFS_IO_INODE(io); 67153323Srodrigc 68153323Srodrigc if (ip->i_rwtrace == NULL) 69153323Srodrigc return; 70153323Srodrigc ktrace_enter(ip->i_rwtrace, 71153323Srodrigc (void *)(unsigned long)tag, 72153323Srodrigc (void *)ip, 73153323Srodrigc (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), 74153323Srodrigc (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), 75153323Srodrigc (void *)(__psint_t)buf, 76153323Srodrigc (void *)((unsigned long)size), 77153323Srodrigc (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 78153323Srodrigc (void *)((unsigned long)(offset & 0xffffffff)), 79153323Srodrigc (void *)((unsigned long)ioflags), 80153323Srodrigc (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), 81153323Srodrigc (void *)((unsigned long)(io->io_new_size & 0xffffffff)), 82153323Srodrigc (void *)NULL, 83153323Srodrigc (void *)NULL, 84153323Srodrigc (void *)NULL, 85153323Srodrigc (void *)NULL, 86153323Srodrigc (void *)NULL); 87153323Srodrigc} 88153323Srodrigc 89153323Srodrigcvoid 90153323Srodrigcxfs_inval_cached_trace( 91153323Srodrigc xfs_iocore_t *io, 92153323Srodrigc xfs_off_t offset, 93153323Srodrigc xfs_off_t len, 94153323Srodrigc xfs_off_t first, 95153323Srodrigc xfs_off_t last) 96153323Srodrigc{ 97153323Srodrigc xfs_inode_t *ip = XFS_IO_INODE(io); 98153323Srodrigc 99153323Srodrigc if (ip->i_rwtrace == NULL) 100153323Srodrigc return; 101153323Srodrigc ktrace_enter(ip->i_rwtrace, 102153323Srodrigc (void *)(__psint_t)XFS_INVAL_CACHED, 103153323Srodrigc (void *)ip, 104153323Srodrigc (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 105153323Srodrigc (void *)((unsigned long)(offset & 0xffffffff)), 106153323Srodrigc (void *)((unsigned long)((len >> 32) & 0xffffffff)), 107153323Srodrigc (void *)((unsigned long)(len & 0xffffffff)), 108153323Srodrigc (void *)((unsigned long)((first >> 32) & 0xffffffff)), 109153323Srodrigc (void *)((unsigned long)(first & 0xffffffff)), 110153323Srodrigc (void *)((unsigned long)((last >> 32) & 0xffffffff)), 111153323Srodrigc (void *)((unsigned long)(last & 0xffffffff)), 112153323Srodrigc (void *)NULL, 113153323Srodrigc (void *)NULL, 114153323Srodrigc (void *)NULL, 115153323Srodrigc (void *)NULL, 116153323Srodrigc (void *)NULL, 117153323Srodrigc (void *)NULL); 118153323Srodrigc} 119153323Srodrigc#endif 120153323Srodrigc 121153323Srodrigc/* 122153323Srodrigc * xfs_iozero 123153323Srodrigc * 124153323Srodrigc * xfs_iozero clears the specified range of buffer supplied, 125153323Srodrigc * and marks all the affected blocks as valid and modified. If 126153323Srodrigc * an affected block is not allocated, it will be allocated. If 127153323Srodrigc * an affected block is not completely overwritten, and is not 128153323Srodrigc * valid before the operation, it will be read from disk before 129153323Srodrigc * being partially zeroed. 130153323Srodrigc */ 131153323SrodrigcSTATIC int 132153323Srodrigcxfs_iozero( 133153323Srodrigc xfs_vnode_t *vp, /* vnode */ 134159451Srodrigc xfs_off_t pos, /* offset in file */ 135153323Srodrigc size_t count, /* size of data to zero */ 136159451Srodrigc xfs_off_t end_size) /* max file size to set */ 137153323Srodrigc{ 138159451Srodrigc int status; 139159451Srodrigc status = 0; /* XXXKAN: */ 140159147Simp#ifdef XXXKAN 141153323Srodrigc unsigned bytes; 142153323Srodrigc struct page *page; 143153323Srodrigc struct address_space *mapping; 144153323Srodrigc char *kaddr; 145153323Srodrigc 146153323Srodrigc mapping = ip->i_mapping; 147153323Srodrigc do { 148153323Srodrigc unsigned long index, offset; 149153323Srodrigc 150153323Srodrigc offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 151153323Srodrigc index = pos >> PAGE_CACHE_SHIFT; 152153323Srodrigc bytes = PAGE_CACHE_SIZE - offset; 153153323Srodrigc if (bytes > count) 154153323Srodrigc bytes = count; 155153323Srodrigc 156153323Srodrigc status = -ENOMEM; 157153323Srodrigc page = grab_cache_page(mapping, index); 158153323Srodrigc if (!page) 159153323Srodrigc break; 160153323Srodrigc 161153323Srodrigc kaddr = kmap(page); 162153323Srodrigc status = mapping->a_ops->prepare_write(NULL, page, offset, 163153323Srodrigc offset + bytes); 164153323Srodrigc if (status) { 165153323Srodrigc goto unlock; 166153323Srodrigc } 167153323Srodrigc 168153323Srodrigc memset((void *) (kaddr + offset), 0, bytes); 169153323Srodrigc flush_dcache_page(page); 170153323Srodrigc status = mapping->a_ops->commit_write(NULL, page, offset, 171153323Srodrigc offset + bytes); 172153323Srodrigc if (!status) { 173153323Srodrigc pos += bytes; 174153323Srodrigc count -= bytes; 175153323Srodrigc if (pos > i_size_read(ip)) 176153323Srodrigc i_size_write(ip, pos < end_size ? pos : end_size); 177153323Srodrigc } 178153323Srodrigc 179153323Srodrigcunlock: 180153323Srodrigc kunmap(page); 181153323Srodrigc unlock_page(page); 182153323Srodrigc page_cache_release(page); 183153323Srodrigc if (status) 184153323Srodrigc break; 185153323Srodrigc } while (count); 186159451Srodrigc#endif 187153323Srodrigc return (-status); 188153323Srodrigc} 189153323Srodrigc 190153323Srodrigcssize_t /* bytes read, or (-) error */ 191153323Srodrigcxfs_read( 192153323Srodrigc bhv_desc_t *bdp, 193153323Srodrigc uio_t *uio, 194153323Srodrigc int ioflags, 195153323Srodrigc cred_t *credp) 196153323Srodrigc{ 197153323Srodrigc ssize_t ret, size; 198153323Srodrigc xfs_fsize_t n; 199153323Srodrigc xfs_inode_t *ip; 200153323Srodrigc xfs_mount_t *mp; 201153323Srodrigc 202153323Srodrigc ip = XFS_BHVTOI(bdp); 203153323Srodrigc mp = ip->i_mount; 204153323Srodrigc 205153323Srodrigc XFS_STATS_INC(xs_read_calls); 206153323Srodrigc 207153323Srodrigc if (unlikely(ioflags & IO_ISDIRECT)) { 208153323Srodrigc if (((__psint_t)buf & BBMASK) || 209153323Srodrigc (uio->uio_offset & mp->m_blockmask) || 210153323Srodrigc (uio->uio_resid & mp->m_blockmask)) { 211153323Srodrigc if (uio->uio_offset >= ip->i_d.di_size) { 212153323Srodrigc return (0); 213153323Srodrigc } 214153323Srodrigc return EINVAL; 215153323Srodrigc } 216153323Srodrigc } 217153323Srodrigc 218153323Srodrigc if (uio->uio_resid == 0) 219153323Srodrigc return 0; 220153323Srodrigc n = XFS_MAXIOFFSET(mp) - uio->uio_offset; 221153323Srodrigc if (n <= 0) 222153323Srodrigc return EFBIG; 223153323Srodrigc 224153323Srodrigc size = (n < uio->uio_resid)? n : uio->uio_resid; 225153323Srodrigc 226153323Srodrigc if (XFS_FORCED_SHUTDOWN(mp)) { 227153323Srodrigc return EIO; 228153323Srodrigc } 229153323Srodrigc 230159451Srodrigc xfs_ilock(ip, XFS_IOLOCK_SHARED); 231159451Srodrigc 232159451Srodrigc#ifdef XXX 233153323Srodrigc if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && 234153323Srodrigc !(ioflags & IO_INVIS)) { 235153323Srodrigc int error; 236153323Srodrigc vrwlock_t locktype = VRWLOCK_READ; 237153323Srodrigc int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); 238153323Srodrigc 239153323Srodrigc error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), 240153323Srodrigc uio->uio_offset, size, dmflags, &locktype); 241153323Srodrigc if (error) { 242159451Srodrigc xfs_iunlock(ip, XFS_IOLOCK_SHARED); 243153323Srodrigc return (error); 244153323Srodrigc } 245153323Srodrigc } 246159451Srodrigc#endif 247153323Srodrigc 248153323Srodrigc ret = xfs_read_file(mp, ip, uio, ioflags); 249153323Srodrigc 250159451Srodrigc xfs_iunlock(ip, XFS_IOLOCK_SHARED); 251153323Srodrigc 252153323Srodrigc XFS_STATS_ADD(xs_read_bytes, ret); 253153323Srodrigc 254153323Srodrigc if (likely((ioflags & IO_INVIS) == 0)) { 255153323Srodrigc xfs_ichgtime(ip, XFS_ICHGTIME_ACC); 256153323Srodrigc } 257153323Srodrigc 258153323Srodrigc return ret; 259153323Srodrigc} 260153323Srodrigc 261153323Srodrigc/* 262153323Srodrigc * This routine is called to handle zeroing any space in the last 263153323Srodrigc * block of the file that is beyond the EOF. We do this since the 264153323Srodrigc * size is being increased without writing anything to that block 265153323Srodrigc * and we don't want anyone to read the garbage on the disk. 266153323Srodrigc */ 267153323SrodrigcSTATIC int /* error (positive) */ 268153323Srodrigcxfs_zero_last_block( 269153323Srodrigc xfs_vnode_t *vp, 270153323Srodrigc xfs_iocore_t *io, 271153323Srodrigc xfs_fsize_t isize, 272153323Srodrigc xfs_fsize_t end_size) 273153323Srodrigc{ 274153323Srodrigc xfs_fileoff_t last_fsb; 275153323Srodrigc xfs_mount_t *mp; 276153323Srodrigc int nimaps; 277153323Srodrigc int zero_offset; 278153323Srodrigc int zero_len; 279153323Srodrigc int error = 0; 280153323Srodrigc xfs_bmbt_irec_t imap; 281159451Srodrigc xfs_off_t loff; 282153323Srodrigc 283153323Srodrigc ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); 284153323Srodrigc 285153323Srodrigc mp = io->io_mount; 286153323Srodrigc 287159451Srodrigc zero_offset = XFS_B_FSB_OFFSET(mp, isize); 288159451Srodrigc if (zero_offset == 0) { 289153323Srodrigc /* 290153323Srodrigc * There are no extra bytes in the last block on disk to 291153323Srodrigc * zero, so return. 292153323Srodrigc */ 293153323Srodrigc return 0; 294153323Srodrigc } 295153323Srodrigc 296153323Srodrigc last_fsb = XFS_B_TO_FSBT(mp, isize); 297153323Srodrigc nimaps = 1; 298153323Srodrigc error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, 299159451Srodrigc &nimaps, NULL, NULL); 300153323Srodrigc if (error) { 301153323Srodrigc return error; 302153323Srodrigc } 303153323Srodrigc ASSERT(nimaps > 0); 304153323Srodrigc /* 305153323Srodrigc * If the block underlying isize is just a hole, then there 306153323Srodrigc * is nothing to zero. 307153323Srodrigc */ 308153323Srodrigc if (imap.br_startblock == HOLESTARTBLOCK) { 309153323Srodrigc return 0; 310153323Srodrigc } 311153323Srodrigc /* 312153323Srodrigc * Zero the part of the last block beyond the EOF, and write it 313153323Srodrigc * out sync. We need to drop the ilock while we do this so we 314153323Srodrigc * don't deadlock when the buffer cache calls back to us. 315153323Srodrigc */ 316153323Srodrigc XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); 317153323Srodrigc loff = XFS_FSB_TO_B(mp, last_fsb); 318153323Srodrigc 319159451Srodrigc zero_len = mp->m_sb.sb_blocksize - zero_offset; 320153323Srodrigc 321153323Srodrigc error = xfs_iozero(vp, loff + zero_offset, zero_len, end_size); 322153323Srodrigc 323153323Srodrigc XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 324153323Srodrigc ASSERT(error >= 0); 325153323Srodrigc return error; 326153323Srodrigc} 327153323Srodrigc 328153323Srodrigc/* 329153323Srodrigc * Zero any on disk space between the current EOF and the new, 330153323Srodrigc * larger EOF. This handles the normal case of zeroing the remainder 331153323Srodrigc * of the last block in the file and the unusual case of zeroing blocks 332153323Srodrigc * out beyond the size of the file. This second case only happens 333153323Srodrigc * with fixed size extents and when the system crashes before the inode 334153323Srodrigc * size was updated but after blocks were allocated. If fill is set, 335153323Srodrigc * then any holes in the range are filled and zeroed. If not, the holes 336153323Srodrigc * are left alone as holes. 337153323Srodrigc */ 338153323Srodrigc 339153323Srodrigcint /* error (positive) */ 340153323Srodrigcxfs_zero_eof( 341153323Srodrigc xfs_vnode_t *vp, 342153323Srodrigc xfs_iocore_t *io, 343153323Srodrigc xfs_off_t offset, /* starting I/O offset */ 344153323Srodrigc xfs_fsize_t isize, /* current inode size */ 345153323Srodrigc xfs_fsize_t end_size) /* terminal inode size */ 346153323Srodrigc{ 347153323Srodrigc xfs_fileoff_t start_zero_fsb; 348153323Srodrigc xfs_fileoff_t end_zero_fsb; 349153323Srodrigc xfs_fileoff_t zero_count_fsb; 350153323Srodrigc xfs_fileoff_t last_fsb; 351153323Srodrigc xfs_extlen_t buf_len_fsb; 352153323Srodrigc xfs_mount_t *mp; 353153323Srodrigc int nimaps; 354153323Srodrigc int error = 0; 355153323Srodrigc xfs_bmbt_irec_t imap; 356153323Srodrigc 357153323Srodrigc ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 358153323Srodrigc ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 359159451Srodrigc ASSERT(offset > isize); 360153323Srodrigc 361153323Srodrigc mp = io->io_mount; 362153323Srodrigc 363153323Srodrigc /* 364153323Srodrigc * First handle zeroing the block on which isize resides. 365153323Srodrigc * We only zero a part of that block so it is handled specially. 366153323Srodrigc */ 367159451Srodrigc error = xfs_zero_last_block(vp, io, isize, end_size); 368153323Srodrigc if (error) { 369153323Srodrigc ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 370153323Srodrigc ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 371153323Srodrigc return error; 372153323Srodrigc } 373153323Srodrigc 374153323Srodrigc /* 375153323Srodrigc * Calculate the range between the new size and the old 376153323Srodrigc * where blocks needing to be zeroed may exist. To get the 377153323Srodrigc * block where the last byte in the file currently resides, 378153323Srodrigc * we need to subtract one from the size and truncate back 379153323Srodrigc * to a block boundary. We subtract 1 in case the size is 380153323Srodrigc * exactly on a block boundary. 381153323Srodrigc */ 382153323Srodrigc last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; 383153323Srodrigc start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 384153323Srodrigc end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); 385153323Srodrigc ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); 386153323Srodrigc if (last_fsb == end_zero_fsb) { 387153323Srodrigc /* 388153323Srodrigc * The size was only incremented on its last block. 389153323Srodrigc * We took care of that above, so just return. 390153323Srodrigc */ 391153323Srodrigc return 0; 392153323Srodrigc } 393153323Srodrigc 394153323Srodrigc ASSERT(start_zero_fsb <= end_zero_fsb); 395153323Srodrigc while (start_zero_fsb <= end_zero_fsb) { 396153323Srodrigc nimaps = 1; 397153323Srodrigc zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 398153323Srodrigc error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, 399159451Srodrigc 0, NULL, 0, &imap, &nimaps, NULL, NULL); 400153323Srodrigc if (error) { 401153323Srodrigc ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 402153323Srodrigc ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 403153323Srodrigc return error; 404153323Srodrigc } 405153323Srodrigc ASSERT(nimaps > 0); 406153323Srodrigc 407153323Srodrigc if (imap.br_state == XFS_EXT_UNWRITTEN || 408153323Srodrigc imap.br_startblock == HOLESTARTBLOCK) { 409153323Srodrigc /* 410153323Srodrigc * This loop handles initializing pages that were 411153323Srodrigc * partially initialized by the code below this 412153323Srodrigc * loop. It basically zeroes the part of the page 413153323Srodrigc * that sits on a hole and sets the page as P_HOLE 414153323Srodrigc * and calls remapf if it is a mapped file. 415153323Srodrigc */ 416159451Srodrigc start_zero_fsb = imap.br_startoff + imap.br_blockcount; 417153323Srodrigc ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 418153323Srodrigc continue; 419153323Srodrigc } 420153323Srodrigc 421153323Srodrigc /* 422153323Srodrigc * There are blocks in the range requested. 423153323Srodrigc * Zero them a single write at a time. We actually 424153323Srodrigc * don't zero the entire range returned if it is 425153323Srodrigc * too big and simply loop around to get the rest. 426153323Srodrigc * That is not the most efficient thing to do, but it 427153323Srodrigc * is simple and this path should not be exercised often. 428153323Srodrigc */ 429153323Srodrigc buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount, 430153323Srodrigc mp->m_writeio_blocks << 8); 431153323Srodrigc /* 432153323Srodrigc * Drop the inode lock while we're doing the I/O. 433153323Srodrigc * We'll still have the iolock to protect us. 434153323Srodrigc */ 435153323Srodrigc XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 436153323Srodrigc 437159451Srodrigc error = xfs_iozero(vp, 438159451Srodrigc XFS_FSB_TO_B(mp, start_zero_fsb), 439159451Srodrigc XFS_FSB_TO_B(mp, buf_len_fsb), 440159451Srodrigc end_size); 441153323Srodrigc 442153323Srodrigc if (error) { 443153323Srodrigc goto out_lock; 444153323Srodrigc } 445153323Srodrigc 446153323Srodrigc start_zero_fsb = imap.br_startoff + buf_len_fsb; 447153323Srodrigc ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 448153323Srodrigc 449153323Srodrigc XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 450153323Srodrigc } 451153323Srodrigc 452153323Srodrigc return 0; 453153323Srodrigc 454153323Srodrigcout_lock: 455153323Srodrigc 456153323Srodrigc XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 457153323Srodrigc ASSERT(error >= 0); 458153323Srodrigc return error; 459153323Srodrigc} 460153323Srodrigc 461153323Srodrigcssize_t /* bytes written, or (-) error */ 462153323Srodrigcxfs_write( 463153323Srodrigc bhv_desc_t *bdp, 464159451Srodrigc uio_t *uio, 465159451Srodrigc int ioflag, 466153323Srodrigc cred_t *credp) 467153323Srodrigc{ 468153323Srodrigc xfs_inode_t *xip; 469153323Srodrigc xfs_mount_t *mp; 470159451Srodrigc ssize_t ret = 0; 471153323Srodrigc int error = 0; 472153323Srodrigc xfs_fsize_t isize, new_size; 473153323Srodrigc xfs_fsize_t n, limit; 474159451Srodrigc xfs_fsize_t size; 475153323Srodrigc xfs_iocore_t *io; 476153323Srodrigc xfs_vnode_t *vp; 477153323Srodrigc int iolock; 478159451Srodrigc //int eventsent = 0; 479153323Srodrigc vrwlock_t locktype; 480159451Srodrigc xfs_off_t offset_c; 481159451Srodrigc xfs_off_t *offset; 482159451Srodrigc xfs_off_t pos; 483153323Srodrigc 484153323Srodrigc XFS_STATS_INC(xs_write_calls); 485153323Srodrigc 486153323Srodrigc vp = BHV_TO_VNODE(bdp); 487153323Srodrigc xip = XFS_BHVTOI(bdp); 488153323Srodrigc 489153323Srodrigc io = &xip->i_iocore; 490153323Srodrigc mp = io->io_mount; 491153323Srodrigc 492153323Srodrigc if (XFS_FORCED_SHUTDOWN(xip->i_mount)) { 493153323Srodrigc return EIO; 494153323Srodrigc } 495153323Srodrigc 496159451Srodrigc size = uio->uio_resid; 497159451Srodrigc pos = offset_c = uio->uio_offset; 498159451Srodrigc offset = &offset_c; 499159451Srodrigc 500159451Srodrigc if (unlikely(ioflag & IO_ISDIRECT)) { 501153323Srodrigc if (((__psint_t)buf & BBMASK) || 502153323Srodrigc (*offset & mp->m_blockmask) || 503153323Srodrigc (size & mp->m_blockmask)) { 504153323Srodrigc return EINVAL; 505153323Srodrigc } 506153323Srodrigc iolock = XFS_IOLOCK_SHARED; 507153323Srodrigc locktype = VRWLOCK_WRITE_DIRECT; 508153323Srodrigc } else { 509153323Srodrigc if (io->io_flags & XFS_IOCORE_RT) 510153323Srodrigc return EINVAL; 511153323Srodrigc iolock = XFS_IOLOCK_EXCL; 512153323Srodrigc locktype = VRWLOCK_WRITE; 513153323Srodrigc } 514153323Srodrigc 515159451Srodrigc iolock = XFS_IOLOCK_EXCL; 516159451Srodrigc locktype = VRWLOCK_WRITE; 517153323Srodrigc 518153323Srodrigc xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); 519153323Srodrigc 520153323Srodrigc isize = xip->i_d.di_size; 521153323Srodrigc limit = XFS_MAXIOFFSET(mp); 522153323Srodrigc 523159451Srodrigc if (ioflag & O_APPEND) 524153323Srodrigc *offset = isize; 525153323Srodrigc 526159451Srodrigc//start: 527153323Srodrigc n = limit - *offset; 528153323Srodrigc if (n <= 0) { 529153323Srodrigc xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 530153323Srodrigc return EFBIG; 531153323Srodrigc } 532153323Srodrigc if (n < size) 533153323Srodrigc size = n; 534153323Srodrigc 535153323Srodrigc new_size = *offset + size; 536153323Srodrigc if (new_size > isize) { 537153323Srodrigc io->io_new_size = new_size; 538153323Srodrigc } 539153323Srodrigc 540159451Srodrigc#ifdef RMC 541159451Srodrigc /* probably be a long time before if ever that we do dmapi */ 542153323Srodrigc if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && 543153323Srodrigc !(ioflags & IO_INVIS) && !eventsent)) { 544153323Srodrigc loff_t savedsize = *offset; 545153323Srodrigc int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); 546153323Srodrigc 547153323Srodrigc xfs_iunlock(xip, XFS_ILOCK_EXCL); 548153323Srodrigc error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, 549153323Srodrigc *offset, size, 550153323Srodrigc dmflags, &locktype); 551153323Srodrigc if (error) { 552153323Srodrigc if (iolock) xfs_iunlock(xip, iolock); 553153323Srodrigc return -error; 554153323Srodrigc } 555153323Srodrigc xfs_ilock(xip, XFS_ILOCK_EXCL); 556153323Srodrigc eventsent = 1; 557153323Srodrigc 558153323Srodrigc /* 559153323Srodrigc * The iolock was dropped and reaquired in XFS_SEND_DATA 560153323Srodrigc * so we have to recheck the size when appending. 561153323Srodrigc * We will only "goto start;" once, since having sent the 562153323Srodrigc * event prevents another call to XFS_SEND_DATA, which is 563153323Srodrigc * what allows the size to change in the first place. 564153323Srodrigc */ 565153323Srodrigc if ((file->f_flags & O_APPEND) && 566153323Srodrigc savedsize != xip->i_d.di_size) { 567153323Srodrigc *offset = isize = xip->i_d.di_size; 568153323Srodrigc goto start; 569153323Srodrigc } 570153323Srodrigc } 571159451Srodrigc#endif 572153323Srodrigc 573153323Srodrigc /* 574153323Srodrigc * If the offset is beyond the size of the file, we have a couple 575153323Srodrigc * of things to do. First, if there is already space allocated 576153323Srodrigc * we need to either create holes or zero the disk or ... 577153323Srodrigc * 578153323Srodrigc * If there is a page where the previous size lands, we need 579153323Srodrigc * to zero it out up to the new size. 580153323Srodrigc */ 581153323Srodrigc 582159451Srodrigc if (!(ioflag & IO_ISDIRECT) && (*offset > isize && isize)) { 583153323Srodrigc error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, 584153323Srodrigc isize, *offset + size); 585153323Srodrigc if (error) { 586153323Srodrigc xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 587153323Srodrigc return(-error); 588153323Srodrigc } 589153323Srodrigc } 590153323Srodrigc xfs_iunlock(xip, XFS_ILOCK_EXCL); 591153323Srodrigc 592159451Srodrigc#if 0 593153323Srodrigc /* 594153323Srodrigc * If we're writing the file then make sure to clear the 595153323Srodrigc * setuid and setgid bits if the process is not being run 596153323Srodrigc * by root. This keeps people from modifying setuid and 597153323Srodrigc * setgid binaries. 598153323Srodrigc */ 599153323Srodrigc 600153323Srodrigc if (((xip->i_d.di_mode & S_ISUID) || 601153323Srodrigc ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == 602153323Srodrigc (S_ISGID | S_IXGRP))) && 603153323Srodrigc !capable(CAP_FSETID)) { 604153323Srodrigc error = xfs_write_clear_setuid(xip); 605159451Srodrigc if (likely(!error)) 606159451Srodrigc error = -remove_suid(file->f_dentry); 607159451Srodrigc if (unlikely(error)) { 608153323Srodrigc xfs_iunlock(xip, iolock); 609159451Srodrigc goto out_unlock_mutex; 610153323Srodrigc } 611153323Srodrigc } 612159451Srodrigc#endif 613153323Srodrigc 614153323Srodrigc 615159451Srodrigc//retry: 616159451Srodrigc if (unlikely(ioflag & IO_ISDIRECT)) { 617153323Srodrigc 618159451Srodrigc#ifdef RMC 619159451Srodrigc xfs_off_t pos = *offset; 620153323Srodrigc struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 621153323Srodrigc struct inode *inode = mapping->host; 622153323Srodrigc 623153323Srodrigc ret = precheck_file_write(file, inode, &size, &pos); 624153323Srodrigc if (ret || size == 0) 625153323Srodrigc goto error; 626153323Srodrigc 627153323Srodrigc xfs_inval_cached_pages(vp, io, pos, 1, 1); 628153323Srodrigc inode->i_ctime = inode->i_mtime = CURRENT_TIME; 629153323Srodrigc /* mark_inode_dirty_sync(inode); - we do this later */ 630153323Srodrigc 631153323Srodrigc xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, buf, size, pos, ioflags); 632153323Srodrigc ret = generic_file_direct_IO(WRITE, file, (char *)buf, size, pos); 633159451Srodrigc xfs_inval_cached_pages(vp, io, pos, 1, 1); 634153323Srodrigc if (ret > 0) 635153323Srodrigc *offset += ret; 636159451Srodrigc#endif 637153323Srodrigc } else { 638153323Srodrigc xfs_rw_enter_trace(XFS_WRITE_ENTER, io, buf, size, *offset, ioflags); 639159451Srodrigc ret = xfs_write_file(xip,uio,ioflag); 640153323Srodrigc } 641153323Srodrigc 642159451Srodrigc xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 643153323Srodrigc 644153323Srodrigc 645159451Srodrigc//error: 646153323Srodrigc if (ret <= 0) { 647153323Srodrigc if (iolock) 648153323Srodrigc xfs_rwunlock(bdp, locktype); 649153323Srodrigc return ret; 650153323Srodrigc } 651153323Srodrigc 652153323Srodrigc XFS_STATS_ADD(xs_write_bytes, ret); 653153323Srodrigc 654153323Srodrigc if (*offset > xip->i_d.di_size) { 655153323Srodrigc xfs_ilock(xip, XFS_ILOCK_EXCL); 656153323Srodrigc if (*offset > xip->i_d.di_size) { 657159451Srodrigc printf("xfs_write look at doing more here %s:%d\n",__FILE__,__LINE__); 658159451Srodrigc#ifdef RMC 659153323Srodrigc struct inode *inode = LINVFS_GET_IP(vp); 660159451Srodrigc i_size_write(inode, *offset); 661159451Srodrigc mark_inode_dirty_sync(inode); 662159451Srodrigc#endif 663153323Srodrigc 664153323Srodrigc xip->i_d.di_size = *offset; 665153323Srodrigc xip->i_update_core = 1; 666153323Srodrigc xip->i_update_size = 1; 667153323Srodrigc } 668153323Srodrigc xfs_iunlock(xip, XFS_ILOCK_EXCL); 669153323Srodrigc } 670153323Srodrigc 671153323Srodrigc /* Handle various SYNC-type writes */ 672159451Srodrigc#if 0 673159451Srodrigc// if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 674159451Srodrigc#endif 675159451Srodrigc if (ioflag & IO_SYNC) { 676153323Srodrigc /* 677153323Srodrigc * If we're treating this as O_DSYNC and we have not updated the 678153323Srodrigc * size, force the log. 679153323Srodrigc */ 680159451Srodrigc if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && 681159451Srodrigc !(xip->i_update_size)) { 682159451Srodrigc xfs_inode_log_item_t *iip = xip->i_itemp; 683153323Srodrigc 684153323Srodrigc /* 685153323Srodrigc * If an allocation transaction occurred 686153323Srodrigc * without extending the size, then we have to force 687153323Srodrigc * the log up the proper point to ensure that the 688153323Srodrigc * allocation is permanent. We can't count on 689153323Srodrigc * the fact that buffered writes lock out direct I/O 690153323Srodrigc * writes - the direct I/O write could have extended 691153323Srodrigc * the size nontransactionally, then finished before 692153323Srodrigc * we started. xfs_write_file will think that the file 693153323Srodrigc * didn't grow but the update isn't safe unless the 694153323Srodrigc * size change is logged. 695153323Srodrigc * 696153323Srodrigc * Force the log if we've committed a transaction 697153323Srodrigc * against the inode or if someone else has and 698153323Srodrigc * the commit record hasn't gone to disk (e.g. 699153323Srodrigc * the inode is pinned). This guarantees that 700153323Srodrigc * all changes affecting the inode are permanent 701153323Srodrigc * when we return. 702153323Srodrigc */ 703153323Srodrigc if (iip && iip->ili_last_lsn) { 704159451Srodrigc xfs_log_force(mp, iip->ili_last_lsn, 705153323Srodrigc XFS_LOG_FORCE | XFS_LOG_SYNC); 706153323Srodrigc } else if (xfs_ipincount(xip) > 0) { 707153323Srodrigc xfs_log_force(mp, (xfs_lsn_t)0, 708153323Srodrigc XFS_LOG_FORCE | XFS_LOG_SYNC); 709153323Srodrigc } 710153323Srodrigc 711153323Srodrigc } else { 712153323Srodrigc xfs_trans_t *tp; 713153323Srodrigc 714153323Srodrigc /* 715153323Srodrigc * O_SYNC or O_DSYNC _with_ a size update are handled 716153323Srodrigc * the same way. 717153323Srodrigc * 718153323Srodrigc * If the write was synchronous then we need to make 719153323Srodrigc * sure that the inode modification time is permanent. 720153323Srodrigc * We'll have updated the timestamp above, so here 721153323Srodrigc * we use a synchronous transaction to log the inode. 722153323Srodrigc * It's not fast, but it's necessary. 723153323Srodrigc * 724153323Srodrigc * If this a dsync write and the size got changed 725153323Srodrigc * non-transactionally, then we need to ensure that 726153323Srodrigc * the size change gets logged in a synchronous 727153323Srodrigc * transaction. 728153323Srodrigc */ 729153323Srodrigc 730153323Srodrigc tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); 731153323Srodrigc if ((error = xfs_trans_reserve(tp, 0, 732153323Srodrigc XFS_SWRITE_LOG_RES(mp), 733153323Srodrigc 0, 0, 0))) { 734153323Srodrigc /* Transaction reserve failed */ 735153323Srodrigc xfs_trans_cancel(tp, 0); 736153323Srodrigc } else { 737153323Srodrigc /* Transaction reserve successful */ 738153323Srodrigc xfs_ilock(xip, XFS_ILOCK_EXCL); 739153323Srodrigc xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); 740153323Srodrigc xfs_trans_ihold(tp, xip); 741153323Srodrigc xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); 742153323Srodrigc xfs_trans_set_sync(tp); 743159451Srodrigc error = xfs_trans_commit(tp, 0, NULL); 744153323Srodrigc xfs_iunlock(xip, XFS_ILOCK_EXCL); 745153323Srodrigc } 746159451Srodrigc if (error) 747159451Srodrigc goto out_unlock_internal; 748153323Srodrigc } 749153323Srodrigc 750159451Srodrigc xfs_rwunlock(bdp, locktype); 751159451Srodrigc return ret; 752153323Srodrigc 753159451Srodrigc } /* (ioflags & O_SYNC) */ 754153323Srodrigc 755159451Srodrigcout_unlock_internal: 756159451Srodrigc xfs_rwunlock(bdp, locktype); 757159451Srodrigc#if 0 758159451Srodrigcout_unlock_mutex: 759159451Srodrigc if (need_i_mutex) 760159451Srodrigc mutex_unlock(&inode->i_mutex); 761159451Srodrigc#endif 762159451Srodrigc //out_nounlocks: 763159451Srodrigc return -error; 764153323Srodrigc} 765153323Srodrigc 766153323Srodrigc/* 767153323Srodrigc * Initiate IO on given buffer. 768153323Srodrigc */ 769153323Srodrigcint 770153323Srodrigcxfs_buf_iorequest(struct xfs_buf *bp) 771153323Srodrigc{ 772153323Srodrigc bp->b_flags &= ~(B_INVAL|B_DONE); 773153323Srodrigc bp->b_ioflags &= ~BIO_ERROR; 774153323Srodrigc 775153323Srodrigc if (bp->b_flags & B_ASYNC) 776153323Srodrigc BUF_KERNPROC(bp); 777153323Srodrigc 778153323Srodrigc if (bp->b_vp == NULL) { 779153323Srodrigc if (bp->b_iocmd == BIO_WRITE) { 780153323Srodrigc bp->b_flags &= ~(B_DELWRI | B_DEFERRED); 781153323Srodrigc bufobj_wref(bp->b_bufobj); 782153323Srodrigc } 783153323Srodrigc 784153323Srodrigc bp->b_iooffset = (bp->b_blkno << BBSHIFT); 785153323Srodrigc bstrategy(bp); 786153323Srodrigc } else { 787153323Srodrigc if (bp->b_iocmd == BIO_WRITE) { 788153323Srodrigc /* Mark the buffer clean */ 789153323Srodrigc bundirty(bp); 790153323Srodrigc bufobj_wref(bp->b_bufobj); 791153323Srodrigc vfs_busy_pages(bp, 1); 792153323Srodrigc } else if (bp->b_iocmd == BIO_READ) { 793153323Srodrigc vfs_busy_pages(bp, 0); 794153323Srodrigc } 795153323Srodrigc bp->b_iooffset = dbtob(bp->b_blkno); 796153323Srodrigc bstrategy(bp); 797153323Srodrigc } 798153323Srodrigc return 0; 799153323Srodrigc} 800153323Srodrigc 801153323Srodrigc/* 802153323Srodrigc * All xfs metadata buffers except log state machine buffers 803153323Srodrigc * get this attached as their b_bdstrat callback function. 804153323Srodrigc * This is so that we can catch a buffer 805153323Srodrigc * after prematurely unpinning it to forcibly shutdown the filesystem. 806153323Srodrigc */ 807153323Srodrigcint 808153323Srodrigcxfs_bdstrat_cb(struct xfs_buf *bp) 809153323Srodrigc{ 810153323Srodrigc xfs_mount_t *mp; 811153323Srodrigc 812153323Srodrigc mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); 813153323Srodrigc if (!XFS_FORCED_SHUTDOWN(mp)) { 814153323Srodrigc xfs_buf_iorequest(bp); 815153323Srodrigc return 0; 816153323Srodrigc } else { 817153323Srodrigc xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 818153323Srodrigc /* 819153323Srodrigc * Metadata write that didn't get logged but 820153323Srodrigc * written delayed anyway. These aren't associated 821153323Srodrigc * with a transaction, and can be ignored. 822153323Srodrigc */ 823153323Srodrigc if (XFS_BUF_IODONE_FUNC(bp) == NULL && 824153323Srodrigc (XFS_BUF_ISREAD(bp)) == 0) 825153323Srodrigc return (xfs_bioerror_relse(bp)); 826153323Srodrigc else 827153323Srodrigc return (xfs_bioerror(bp)); 828153323Srodrigc } 829153323Srodrigc} 830153323Srodrigc 831153323Srodrigc 832153323Srodrigcint 833153323Srodrigcxfs_bmap(bhv_desc_t *bdp, 834153323Srodrigc xfs_off_t offset, 835153323Srodrigc ssize_t count, 836153323Srodrigc int flags, 837153323Srodrigc xfs_iomap_t *iomapp, 838153323Srodrigc int *niomaps) 839153323Srodrigc{ 840153323Srodrigc xfs_inode_t *ip = XFS_BHVTOI(bdp); 841153323Srodrigc xfs_iocore_t *io = &ip->i_iocore; 842153323Srodrigc 843153323Srodrigc ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 844153323Srodrigc ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == 845153323Srodrigc ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); 846153323Srodrigc 847153323Srodrigc return xfs_iomap(io, offset, count, flags, iomapp, niomaps); 848153323Srodrigc} 849153323Srodrigc 850153323Srodrigc/* 851153323Srodrigc * Wrapper around bdstrat so that we can stop data 852153323Srodrigc * from going to disk in case we are shutting down the filesystem. 853153323Srodrigc * Typically user data goes thru this path; one of the exceptions 854153323Srodrigc * is the superblock. 855153323Srodrigc */ 856153323Srodrigcint 857153323Srodrigcxfsbdstrat( 858153323Srodrigc struct xfs_mount *mp, 859153323Srodrigc struct xfs_buf *bp) 860153323Srodrigc{ 861153323Srodrigc ASSERT(mp); 862153323Srodrigc if (!XFS_FORCED_SHUTDOWN(mp)) { 863153323Srodrigc 864159451Srodrigc xfs_buf_iorequest(bp); 865153323Srodrigc return 0; 866153323Srodrigc } 867153323Srodrigc 868153323Srodrigc xfs_buftrace("XFSBDSTRAT IOERROR", bp); 869153323Srodrigc return (xfs_bioerror_relse(bp)); 870153323Srodrigc} 871153323Srodrigc 872153323Srodrigc/* 873153323Srodrigc * If the underlying (data/log/rt) device is readonly, there are some 874153323Srodrigc * operations that cannot proceed. 875153323Srodrigc */ 876153323Srodrigcint 877153323Srodrigcxfs_dev_is_read_only( 878153323Srodrigc xfs_mount_t *mp, 879153323Srodrigc char *message) 880153323Srodrigc{ 881153323Srodrigc if (xfs_readonly_buftarg(mp->m_ddev_targp) || 882153323Srodrigc xfs_readonly_buftarg(mp->m_logdev_targp) || 883153323Srodrigc (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 884153323Srodrigc cmn_err(CE_NOTE, 885153323Srodrigc "XFS: %s required on read-only device.", message); 886153323Srodrigc cmn_err(CE_NOTE, 887153323Srodrigc "XFS: write access unavailable, cannot proceed."); 888153323Srodrigc return EROFS; 889153323Srodrigc } 890153323Srodrigc return 0; 891153323Srodrigc} 892