xfs_inode.c revision 153323
1/* 2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify it 5 * under the terms of version 2 of the GNU General Public License as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it would be useful, but 9 * WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 * 12 * Further, this software is distributed without any warranty that it is 13 * free of the rightful claim of any third person regarding infringement 14 * or the like. Any license provided herein, whether implied or 15 * otherwise, applies only to this software file. Patent licenses, if 16 * any, provided herein do not apply to combinations of this program with 17 * other software, or any other product whatsoever. 18 * 19 * You should have received a copy of the GNU General Public License along 20 * with this program; if not, write the Free Software Foundation, Inc., 59 21 * Temple Place - Suite 330, Boston MA 02111-1307, USA. 22 * 23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, 24 * Mountain View, CA 94043, or: 25 * 26 * http://www.sgi.com 27 * 28 * For further information regarding this notice, see: 29 * 30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ 31 */ 32 33#include "xfs.h" 34#include "xfs_macros.h" 35#include "xfs_types.h" 36#include "xfs_inum.h" 37#include "xfs_log.h" 38#include "xfs_trans.h" 39#include "xfs_trans_priv.h" 40#include "xfs_sb.h" 41#include "xfs_ag.h" 42#include "xfs_dir.h" 43#include "xfs_dir2.h" 44#include "xfs_dmapi.h" 45#include "xfs_mount.h" 46#include "xfs_alloc_btree.h" 47#include "xfs_bmap_btree.h" 48#include "xfs_ialloc_btree.h" 49#include "xfs_btree.h" 50#include "xfs_imap.h" 51#include "xfs_alloc.h" 52#include "xfs_ialloc.h" 53#include "xfs_attr_sf.h" 54#include "xfs_dir_sf.h" 55#include "xfs_dir2_sf.h" 56#include "xfs_dinode.h" 57#include "xfs_inode_item.h" 58#include "xfs_inode.h" 59#include "xfs_bmap.h" 60#include "xfs_buf_item.h" 61#include "xfs_rw.h" 62#include "xfs_error.h" 63#include "xfs_bit.h" 64#include "xfs_utils.h" 65#include "xfs_dir2_trace.h" 66#include "xfs_quota.h" 67#include "xfs_mac.h" 68#include "xfs_acl.h" 69 70 71kmem_zone_t *xfs_ifork_zone; 72kmem_zone_t *xfs_inode_zone; 73kmem_zone_t *xfs_chashlist_zone; 74 75/* 76 * Used in xfs_itruncate(). This is the maximum number of extents 77 * freed from a file in a single transaction. 78 */ 79#define XFS_ITRUNC_MAX_EXTENTS 2 80 81STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 82STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); 83STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); 84STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); 85 86 87#ifdef DEBUG 88/* 89 * Make sure that the extents in the given memory buffer 90 * are valid. 91 */ 92STATIC void 93xfs_validate_extents( 94 xfs_bmbt_rec_t *ep, 95 int nrecs, 96 int disk, 97 xfs_exntfmt_t fmt) 98{ 99 xfs_bmbt_irec_t irec; 100 xfs_bmbt_rec_t rec; 101 int i; 102 103 for (i = 0; i < nrecs; i++) { 104 rec.l0 = get_unaligned((__uint64_t*)&ep->l0); 105 rec.l1 = get_unaligned((__uint64_t*)&ep->l1); 106 if (disk) 107 xfs_bmbt_disk_get_all(&rec, &irec); 108 else 109 xfs_bmbt_get_all(&rec, &irec); 110 if (fmt == XFS_EXTFMT_NOSTATE) 111 ASSERT(irec.br_state == XFS_EXT_NORM); 112 ep++; 113 } 114} 115#else /* DEBUG */ 116#define xfs_validate_extents(ep, nrecs, disk, fmt) 117#endif /* DEBUG */ 118 119/* 120 * Check that none of the inode's in the buffer have a next 121 * unlinked field of 0. 122 */ 123#if defined(DEBUG) 124void 125xfs_inobp_check( 126 xfs_mount_t *mp, 127 xfs_buf_t *bp) 128{ 129 int i; 130 int j; 131 xfs_dinode_t *dip; 132 133 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 134 135 for (i = 0; i < j; i++) { 136 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 137 i * mp->m_sb.sb_inodesize); 138 if (INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT)) { 139 xfs_fs_cmn_err(CE_ALERT, mp, 140 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 141 bp); 142 ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT)); 143 } 144 } 145} 146#endif 147 148/* 149 * This routine is called to map an inode number within a file 150 * system to the buffer containing the on-disk version of the 151 * inode. It returns a pointer to the buffer containing the 152 * on-disk inode in the bpp parameter, and in the dip parameter 153 * it returns a pointer to the on-disk inode within that buffer. 154 * 155 * If a non-zero error is returned, then the contents of bpp and 156 * dipp are undefined. 157 * 158 * Use xfs_imap() to determine the size and location of the 159 * buffer to read from disk. 160 */ 161int 162xfs_inotobp( 163 xfs_mount_t *mp, 164 xfs_trans_t *tp, 165 xfs_ino_t ino, 166 xfs_dinode_t **dipp, 167 xfs_buf_t **bpp, 168 int *offset) 169{ 170 int di_ok; 171 xfs_imap_t imap; 172 xfs_buf_t *bp; 173 int error; 174 xfs_dinode_t *dip; 175 176 /* 177 * Call the space managment code to find the location of the 178 * inode on disk. 179 */ 180 imap.im_blkno = 0; 181 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); 182 if (error != 0) { 183 cmn_err(CE_WARN, 184 "xfs_inotobp: xfs_imap() returned an " 185 "error %d on %s. Returning error.", error, mp->m_fsname); 186 return error; 187 } 188 189 /* 190 * If the inode number maps to a block outside the bounds of the 191 * file system then return NULL rather than calling read_buf 192 * and panicing when we get an error from the driver. 193 */ 194 if ((imap.im_blkno + imap.im_len) > 195 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 196 cmn_err(CE_WARN, 197 "xfs_inotobp: inode number (%d + %d) maps to a block outside the bounds " 198 "of the file system %s. Returning EINVAL.", 199 imap.im_blkno, imap.im_len,mp->m_fsname); 200 return XFS_ERROR(EINVAL); 201 } 202 203 /* 204 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will 205 * default to just a read_buf() call. 206 */ 207 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, 208 (int)imap.im_len, XFS_BUF_LOCK, &bp); 209 210 if (error) { 211 cmn_err(CE_WARN, 212 "xfs_inotobp: xfs_trans_read_buf() returned an " 213 "error %d on %s. Returning error.", error, mp->m_fsname); 214 return error; 215 } 216 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); 217 di_ok = 218 INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && 219 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); 220 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, 221 XFS_RANDOM_ITOBP_INOTOBP))) { 222 XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); 223 xfs_trans_brelse(tp, bp); 224 cmn_err(CE_WARN, 225 "xfs_inotobp: XFS_TEST_ERROR() returned an " 226 "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); 227 return XFS_ERROR(EFSCORRUPTED); 228 } 229 230 xfs_inobp_check(mp, bp); 231 232 /* 233 * Set *dipp to point to the on-disk inode in the buffer. 234 */ 235 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 236 *bpp = bp; 237 *offset = imap.im_boffset; 238 return 0; 239} 240 241 242/* 243 * This routine is called to map an inode to the buffer containing 244 * the on-disk version of the inode. It returns a pointer to the 245 * buffer containing the on-disk inode in the bpp parameter, and in 246 * the dip parameter it returns a pointer to the on-disk inode within 247 * that buffer. 248 * 249 * If a non-zero error is returned, then the contents of bpp and 250 * dipp are undefined. 251 * 252 * If the inode is new and has not yet been initialized, use xfs_imap() 253 * to determine the size and location of the buffer to read from disk. 254 * If the inode has already been mapped to its buffer and read in once, 255 * then use the mapping information stored in the inode rather than 256 * calling xfs_imap(). This allows us to avoid the overhead of looking 257 * at the inode btree for small block file systems (see xfs_dilocate()). 258 * We can tell whether the inode has been mapped in before by comparing 259 * its disk block address to 0. Only uninitialized inodes will have 260 * 0 for the disk block address. 261 */ 262int 263xfs_itobp( 264 xfs_mount_t *mp, 265 xfs_trans_t *tp, 266 xfs_inode_t *ip, 267 xfs_dinode_t **dipp, 268 xfs_buf_t **bpp, 269 xfs_daddr_t bno) 270{ 271 xfs_buf_t *bp; 272 int error; 273 xfs_imap_t imap; 274#ifdef __KERNEL__ 275 int i; 276 int ni; 277#endif 278 279 if (ip->i_blkno == (xfs_daddr_t)0) { 280 /* 281 * Call the space management code to find the location of the 282 * inode on disk. 283 */ 284 imap.im_blkno = bno; 285 error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP); 286 if (error != 0) { 287 return error; 288 } 289 290 /* 291 * If the inode number maps to a block outside the bounds 292 * of the file system then return NULL rather than calling 293 * read_buf and panicing when we get an error from the 294 * driver. 295 */ 296 if ((imap.im_blkno + imap.im_len) > 297 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 298#ifdef DEBUG 299 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " 300 "(imap.im_blkno (0x%llx) " 301 "+ imap.im_len (0x%llx)) > " 302 " XFS_FSB_TO_BB(mp, " 303 "mp->m_sb.sb_dblocks) (0x%llx)", 304 (unsigned long long) imap.im_blkno, 305 (unsigned long long) imap.im_len, 306 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 307#endif /* DEBUG */ 308 return XFS_ERROR(EINVAL); 309 } 310 311 /* 312 * Fill in the fields in the inode that will be used to 313 * map the inode to its buffer from now on. 314 */ 315 ip->i_blkno = imap.im_blkno; 316 ip->i_len = imap.im_len; 317 ip->i_boffset = imap.im_boffset; 318 } else { 319 /* 320 * We've already mapped the inode once, so just use the 321 * mapping that we saved the first time. 322 */ 323 imap.im_blkno = ip->i_blkno; 324 imap.im_len = ip->i_len; 325 imap.im_boffset = ip->i_boffset; 326 } 327 ASSERT(bno == 0 || bno == imap.im_blkno); 328 329 /* 330 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will 331 * default to just a read_buf() call. 332 */ 333 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, 334 (int)imap.im_len, XFS_BUF_LOCK, &bp); 335 336 if (error) { 337#ifdef DEBUG 338 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " 339 "xfs_trans_read_buf() returned error %d, " 340 "imap.im_blkno 0x%llx, imap.im_len 0x%llx", 341 error, (unsigned long long) imap.im_blkno, 342 (unsigned long long) imap.im_len); 343#endif /* DEBUG */ 344 return error; 345 } 346#ifdef __KERNEL__ 347 /* 348 * Validate the magic number and version of every inode in the buffer 349 * (if DEBUG kernel) or the first inode in the buffer, otherwise. 350 */ 351#ifdef DEBUG 352 ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; 353#else 354 ni = 1; 355#endif 356 for (i = 0; i < ni; i++) { 357 int di_ok; 358 xfs_dinode_t *dip; 359 360 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 361 (i << mp->m_sb.sb_inodelog)); 362 di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && 363 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); 364 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, 365 XFS_RANDOM_ITOBP_INOTOBP))) { 366#ifdef DEBUG 367 prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)", 368 mp->m_ddev_targp, 369 (unsigned long long)imap.im_blkno, i, 370 INT_GET(dip->di_core.di_magic, ARCH_CONVERT)); 371#endif 372 XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, 373 mp, dip); 374 xfs_trans_brelse(tp, bp); 375 return XFS_ERROR(EFSCORRUPTED); 376 } 377 } 378#endif /* __KERNEL__ */ 379 380 xfs_inobp_check(mp, bp); 381 382 /* 383 * Mark the buffer as an inode buffer now that it looks good 384 */ 385 XFS_BUF_SET_VTYPE(bp, B_FS_INO); 386 387 /* 388 * Set *dipp to point to the on-disk inode in the buffer. 389 */ 390 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 391 *bpp = bp; 392 return 0; 393} 394 395/* 396 * Move inode type and inode format specific information from the 397 * on-disk inode to the in-core inode. For fifos, devs, and sockets 398 * this means set if_rdev to the proper value. For files, directories, 399 * and symlinks this means to bring in the in-line data or extent 400 * pointers. For a file in B-tree format, only the root is immediately 401 * brought in-core. The rest will be in-lined in if_extents when it 402 * is first referenced (see xfs_iread_extents()). 403 */ 404STATIC int 405xfs_iformat( 406 xfs_inode_t *ip, 407 xfs_dinode_t *dip) 408{ 409 xfs_attr_shortform_t *atp; 410 int size; 411 int error; 412 xfs_fsize_t di_size; 413 ip->i_df.if_ext_max = 414 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 415 error = 0; 416 417 if (unlikely( 418 INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + 419 INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > 420 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { 421 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 422 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu." 423 " Unmount and run xfs_repair.", 424 (unsigned long long)ip->i_ino, 425 (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) 426 + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), 427 (unsigned long long) 428 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT)); 429 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, 430 ip->i_mount, dip); 431 return XFS_ERROR(EFSCORRUPTED); 432 } 433 434 if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { 435 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 436 "corrupt dinode %Lu, forkoff = 0x%x." 437 " Unmount and run xfs_repair.", 438 (unsigned long long)ip->i_ino, 439 (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); 440 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 441 ip->i_mount, dip); 442 return XFS_ERROR(EFSCORRUPTED); 443 } 444 445 switch (ip->i_d.di_mode & S_IFMT) { 446 case S_IFIFO: 447 case S_IFCHR: 448 case S_IFBLK: 449 case S_IFSOCK: 450 if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) { 451 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, 452 ip->i_mount, dip); 453 return XFS_ERROR(EFSCORRUPTED); 454 } 455 ip->i_d.di_size = 0; 456 ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); 457 break; 458 459 case S_IFREG: 460 case S_IFLNK: 461 case S_IFDIR: 462 switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) { 463 case XFS_DINODE_FMT_LOCAL: 464 /* 465 * no local regular files yet 466 */ 467 if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { 468 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 469 "corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.", 470 (unsigned long long) ip->i_ino); 471 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 472 XFS_ERRLEVEL_LOW, 473 ip->i_mount, dip); 474 return XFS_ERROR(EFSCORRUPTED); 475 } 476 477 di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); 478 if (unlikely(di_size > 479 XFS_DFORK_DSIZE_ARCH(dip, ip->i_mount, ARCH_CONVERT))) { 480 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 481 "corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.", 482 (unsigned long long) ip->i_ino, 483 (long long) di_size); 484 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 485 XFS_ERRLEVEL_LOW, 486 ip->i_mount, dip); 487 return XFS_ERROR(EFSCORRUPTED); 488 } 489 490 size = (int)di_size; 491 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); 492 break; 493 case XFS_DINODE_FMT_EXTENTS: 494 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); 495 break; 496 case XFS_DINODE_FMT_BTREE: 497 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); 498 break; 499 default: 500 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, 501 ip->i_mount); 502 return XFS_ERROR(EFSCORRUPTED); 503 } 504 break; 505 506 default: 507 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); 508 return XFS_ERROR(EFSCORRUPTED); 509 } 510 if (error) { 511 return error; 512 } 513 if (!XFS_DFORK_Q_ARCH(dip, ARCH_CONVERT)) 514 return 0; 515 ASSERT(ip->i_afp == NULL); 516 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 517 ip->i_afp->if_ext_max = 518 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 519 switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) { 520 case XFS_DINODE_FMT_LOCAL: 521 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR_ARCH(dip, ARCH_CONVERT); 522 size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT); 523 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 524 break; 525 case XFS_DINODE_FMT_EXTENTS: 526 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); 527 break; 528 case XFS_DINODE_FMT_BTREE: 529 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); 530 break; 531 default: 532 error = XFS_ERROR(EFSCORRUPTED); 533 break; 534 } 535 if (error) { 536 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 537 ip->i_afp = NULL; 538 xfs_idestroy_fork(ip, XFS_DATA_FORK); 539 } 540 return error; 541} 542 543/* 544 * The file is in-lined in the on-disk inode. 545 * If it fits into if_inline_data, then copy 546 * it there, otherwise allocate a buffer for it 547 * and copy the data there. Either way, set 548 * if_data to point at the data. 549 * If we allocate a buffer for the data, make 550 * sure that its size is a multiple of 4 and 551 * record the real size in i_real_bytes. 552 */ 553STATIC int 554xfs_iformat_local( 555 xfs_inode_t *ip, 556 xfs_dinode_t *dip, 557 int whichfork, 558 int size) 559{ 560 xfs_ifork_t *ifp; 561 int real_size; 562 563 /* 564 * If the size is unreasonable, then something 565 * is wrong and we just bail out rather than crash in 566 * kmem_alloc() or memcpy() below. 567 */ 568 if (unlikely(size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT))) { 569 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 570 "corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.", 571 (unsigned long long) ip->i_ino, size, 572 XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)); 573 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 574 ip->i_mount, dip); 575 return XFS_ERROR(EFSCORRUPTED); 576 } 577 ifp = XFS_IFORK_PTR(ip, whichfork); 578 real_size = 0; 579 if (size == 0) 580 ifp->if_u1.if_data = NULL; 581 else if (size <= sizeof(ifp->if_u2.if_inline_data)) 582 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 583 else { 584 real_size = roundup(size, 4); 585 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 586 } 587 ifp->if_bytes = size; 588 ifp->if_real_bytes = real_size; 589 if (size) 590 memcpy(ifp->if_u1.if_data, 591 XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT), size); 592 ifp->if_flags &= ~XFS_IFEXTENTS; 593 ifp->if_flags |= XFS_IFINLINE; 594 return 0; 595} 596 597/* 598 * The file consists of a set of extents all 599 * of which fit into the on-disk inode. 600 * If there are few enough extents to fit into 601 * the if_inline_ext, then copy them there. 602 * Otherwise allocate a buffer for them and copy 603 * them into it. Either way, set if_extents 604 * to point at the extents. 605 */ 606STATIC int 607xfs_iformat_extents( 608 xfs_inode_t *ip, 609 xfs_dinode_t *dip, 610 int whichfork) 611{ 612 xfs_bmbt_rec_t *ep, *dp; 613 xfs_ifork_t *ifp; 614 int nex; 615 int real_size; 616 int size; 617 int i; 618 619 ifp = XFS_IFORK_PTR(ip, whichfork); 620 nex = XFS_DFORK_NEXTENTS_ARCH(dip, whichfork, ARCH_CONVERT); 621 size = nex * (uint)sizeof(xfs_bmbt_rec_t); 622 623 /* 624 * If the number of extents is unreasonable, then something 625 * is wrong and we just bail out rather than crash in 626 * kmem_alloc() or memcpy() below. 627 */ 628 if (unlikely(size < 0 || size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT))) { 629 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 630 "corrupt inode %Lu ((a)extents = %d). Unmount and run xfs_repair.", 631 (unsigned long long) ip->i_ino, nex); 632 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 633 ip->i_mount, dip); 634 return XFS_ERROR(EFSCORRUPTED); 635 } 636 637 real_size = 0; 638 if (nex == 0) 639 ifp->if_u1.if_extents = NULL; 640 else if (nex <= XFS_INLINE_EXTS) 641 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 642 else { 643 ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP); 644 ASSERT(ifp->if_u1.if_extents != NULL); 645 real_size = size; 646 } 647 ifp->if_bytes = size; 648 ifp->if_real_bytes = real_size; 649 if (size) { 650 dp = (xfs_bmbt_rec_t *) 651 XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT); 652 xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip)); 653 ep = ifp->if_u1.if_extents; 654 for (i = 0; i < nex; i++, ep++, dp++) { 655 ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0), 656 ARCH_CONVERT); 657 ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1), 658 ARCH_CONVERT); 659 } 660 xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex, 661 whichfork); 662 if (whichfork != XFS_DATA_FORK || 663 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) 664 if (unlikely(xfs_check_nostate_extents( 665 ifp->if_u1.if_extents, nex))) { 666 XFS_ERROR_REPORT("xfs_iformat_extents(2)", 667 XFS_ERRLEVEL_LOW, 668 ip->i_mount); 669 return XFS_ERROR(EFSCORRUPTED); 670 } 671 } 672 ifp->if_flags |= XFS_IFEXTENTS; 673 return 0; 674} 675 676/* 677 * The file has too many extents to fit into 678 * the inode, so they are in B-tree format. 679 * Allocate a buffer for the root of the B-tree 680 * and copy the root into it. The i_extents 681 * field will remain NULL until all of the 682 * extents are read in (when they are needed). 683 */ 684STATIC int 685xfs_iformat_btree( 686 xfs_inode_t *ip, 687 xfs_dinode_t *dip, 688 int whichfork) 689{ 690 xfs_bmdr_block_t *dfp; 691 xfs_ifork_t *ifp; 692 /* REFERENCED */ 693 int nrecs; 694 int size; 695 696 ifp = XFS_IFORK_PTR(ip, whichfork); 697 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT); 698 size = XFS_BMAP_BROOT_SPACE(dfp); 699 nrecs = XFS_BMAP_BROOT_NUMRECS(dfp); 700 701 /* 702 * blow out if -- fork has less extents than can fit in 703 * fork (fork shouldn't be a btree format), root btree 704 * block has more records than can fit into the fork, 705 * or the number of extents is greater than the number of 706 * blocks. 707 */ 708 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max 709 || XFS_BMDR_SPACE_CALC(nrecs) > 710 XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT) 711 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 712 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 713 "corrupt inode %Lu (btree). Unmount and run xfs_repair.", 714 (unsigned long long) ip->i_ino); 715 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 716 ip->i_mount); 717 return XFS_ERROR(EFSCORRUPTED); 718 } 719 720 ifp->if_broot_bytes = size; 721 ifp->if_broot = kmem_alloc(size, KM_SLEEP); 722 ASSERT(ifp->if_broot != NULL); 723 /* 724 * Copy and convert from the on-disk structure 725 * to the in-memory structure. 726 */ 727 xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT), 728 ifp->if_broot, size); 729 ifp->if_flags &= ~XFS_IFEXTENTS; 730 ifp->if_flags |= XFS_IFBROOT; 731 732 return 0; 733} 734 735/* 736 * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk 737 * and native format 738 * 739 * buf = on-disk representation 740 * dip = native representation 741 * dir = direction - +ve -> disk to native 742 * -ve -> native to disk 743 * arch = on-disk architecture 744 */ 745void 746xfs_xlate_dinode_core( 747 xfs_caddr_t buf, 748 xfs_dinode_core_t *dip, 749 int dir, 750 xfs_arch_t arch) 751{ 752 xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf; 753 xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip; 754 755 ASSERT(dir); 756 if (arch == ARCH_NOCONVERT) { 757 if (dir > 0) { 758 memcpy((xfs_caddr_t)mem_core, (xfs_caddr_t)buf_core, 759 sizeof(xfs_dinode_core_t)); 760 } else { 761 memcpy((xfs_caddr_t)buf_core, (xfs_caddr_t)mem_core, 762 sizeof(xfs_dinode_core_t)); 763 } 764 return; 765 } 766 767 INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch); 768 INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch); 769 INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch); 770 INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch); 771 INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch); 772 INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch); 773 INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch); 774 INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch); 775 INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch); 776 777 if (dir > 0) { 778 memcpy(mem_core->di_pad, buf_core->di_pad, 779 sizeof(buf_core->di_pad)); 780 } else { 781 memcpy(buf_core->di_pad, mem_core->di_pad, 782 sizeof(buf_core->di_pad)); 783 } 784 785 INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch); 786 787 INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec, 788 dir, arch); 789 INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec, 790 dir, arch); 791 INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec, 792 dir, arch); 793 INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec, 794 dir, arch); 795 INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec, 796 dir, arch); 797 INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec, 798 dir, arch); 799 INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch); 800 INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch); 801 INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch); 802 INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch); 803 INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch); 804 INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch); 805 INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch); 806 INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch); 807 INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch); 808 INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch); 809 INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch); 810} 811 812/* 813 * Given a mount structure and an inode number, return a pointer 814 * to a newly allocated in-core inode coresponding to the given 815 * inode number. 816 * 817 * Initialize the inode's attributes and extent pointers if it 818 * already has them (it will not if the inode has no links). 819 */ 820int 821xfs_iread( 822 xfs_mount_t *mp, 823 xfs_trans_t *tp, 824 xfs_ino_t ino, 825 xfs_inode_t **ipp, 826 xfs_daddr_t bno) 827{ 828 xfs_buf_t *bp; 829 xfs_dinode_t *dip; 830 xfs_inode_t *ip; 831 int error; 832 833 ASSERT(xfs_inode_zone != NULL); 834 835 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); 836 ip->i_ino = ino; 837 ip->i_mount = mp; 838 839 /* 840 * Get pointer's to the on-disk inode and the buffer containing it. 841 * If the inode number refers to a block outside the file system 842 * then xfs_itobp() will return NULL. In this case we should 843 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will 844 * know that this is a new incore inode. 845 */ 846 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno); 847 848 if (error != 0) { 849 kmem_zone_free(xfs_inode_zone, ip); 850 return error; 851 } 852 853 /* 854 * Initialize inode's trace buffers. 855 * Do this before xfs_iformat in case it adds entries. 856 */ 857#ifdef XFS_BMAP_TRACE 858 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); 859#endif 860#ifdef XFS_BMBT_TRACE 861 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); 862#endif 863#ifdef XFS_RW_TRACE 864 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); 865#endif 866#ifdef XFS_ILOCK_TRACE 867 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); 868#endif 869#ifdef XFS_DIR2_TRACE 870 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); 871#endif 872 873 /* 874 * If we got something that isn't an inode it means someone 875 * (nfs or dmi) has a stale handle. 876 */ 877 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) { 878 kmem_zone_free(xfs_inode_zone, ip); 879 xfs_trans_brelse(tp, bp); 880#ifdef DEBUG 881 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 882 "dip->di_core.di_magic (0x%x) != " 883 "XFS_DINODE_MAGIC (0x%x)", 884 INT_GET(dip->di_core.di_magic, ARCH_CONVERT), 885 XFS_DINODE_MAGIC); 886#endif /* DEBUG */ 887 return XFS_ERROR(EINVAL); 888 } 889 890 /* 891 * If the on-disk inode is already linked to a directory 892 * entry, copy all of the inode into the in-core inode. 893 * xfs_iformat() handles copying in the inode format 894 * specific information. 895 * Otherwise, just get the truly permanent information. 896 */ 897 if (!INT_ISZERO(dip->di_core.di_mode, ARCH_CONVERT)) { 898 xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, 899 &(ip->i_d), 1, ARCH_CONVERT); 900 error = xfs_iformat(ip, dip); 901 if (error) { 902 kmem_zone_free(xfs_inode_zone, ip); 903 xfs_trans_brelse(tp, bp); 904#ifdef DEBUG 905 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 906 "xfs_iformat() returned error %d", 907 error); 908#endif /* DEBUG */ 909 return error; 910 } 911 } else { 912 ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT); 913 ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT); 914 ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT); 915 ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT); 916 /* 917 * Make sure to pull in the mode here as well in 918 * case the inode is released without being used. 919 * This ensures that xfs_inactive() will see that 920 * the inode is already free and not try to mess 921 * with the uninitialized part of it. 922 */ 923 ip->i_d.di_mode = 0; 924 /* 925 * Initialize the per-fork minima and maxima for a new 926 * inode here. xfs_iformat will do it for old inodes. 927 */ 928 ip->i_df.if_ext_max = 929 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 930 } 931 932 /* XXXKAN: initialize i_reclaim */ 933 bzero(&ip->i_reclaim, sizeof(&ip->i_reclaim)); 934 935 /* 936 * The inode format changed when we moved the link count and 937 * made it 32 bits long. If this is an old format inode, 938 * convert it in memory to look like a new one. If it gets 939 * flushed to disk we will convert back before flushing or 940 * logging it. We zero out the new projid field and the old link 941 * count field. We'll handle clearing the pad field (the remains 942 * of the old uuid field) when we actually convert the inode to 943 * the new format. We don't change the version number so that we 944 * can distinguish this from a real new format inode. 945 */ 946 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 947 ip->i_d.di_nlink = ip->i_d.di_onlink; 948 ip->i_d.di_onlink = 0; 949 ip->i_d.di_projid = 0; 950 } 951 952 ip->i_delayed_blks = 0; 953 954 /* 955 * Mark the buffer containing the inode as something to keep 956 * around for a while. This helps to keep recently accessed 957 * meta-data in-core longer. 958 */ 959 XFS_BUF_SET_REF(bp, XFS_INO_REF); 960 961 /* 962 * Use xfs_trans_brelse() to release the buffer containing the 963 * on-disk inode, because it was acquired with xfs_trans_read_buf() 964 * in xfs_itobp() above. If tp is NULL, this is just a normal 965 * brelse(). If we're within a transaction, then xfs_trans_brelse() 966 * will only release the buffer if it is not dirty within the 967 * transaction. It will be OK to release the buffer in this case, 968 * because inodes on disk are never destroyed and we will be 969 * locking the new in-core inode before putting it in the hash 970 * table where other processes can find it. Thus we don't have 971 * to worry about the inode being changed just because we released 972 * the buffer. 973 */ 974 xfs_trans_brelse(tp, bp); 975 *ipp = ip; 976 return 0; 977} 978 979/* 980 * Read in extents from a btree-format inode. 981 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. 982 */ 983int 984xfs_iread_extents( 985 xfs_trans_t *tp, 986 xfs_inode_t *ip, 987 int whichfork) 988{ 989 int error; 990 xfs_ifork_t *ifp; 991 size_t size; 992 993 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 994 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 995 ip->i_mount); 996 return XFS_ERROR(EFSCORRUPTED); 997 } 998 size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t); 999 ifp = XFS_IFORK_PTR(ip, whichfork); 1000 /* 1001 * We know that the size is valid (it's checked in iformat_btree) 1002 */ 1003 ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP); 1004 ASSERT(ifp->if_u1.if_extents != NULL); 1005 ifp->if_lastex = NULLEXTNUM; 1006 ifp->if_bytes = ifp->if_real_bytes = (int)size; 1007 ifp->if_flags |= XFS_IFEXTENTS; 1008 error = xfs_bmap_read_extents(tp, ip, whichfork); 1009 if (error) { 1010 kmem_free(ifp->if_u1.if_extents, size); 1011 ifp->if_u1.if_extents = NULL; 1012 ifp->if_bytes = ifp->if_real_bytes = 0; 1013 ifp->if_flags &= ~XFS_IFEXTENTS; 1014 return error; 1015 } 1016 xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents, 1017 XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip)); 1018 return 0; 1019} 1020 1021/* 1022 * Allocate an inode on disk and return a copy of its in-core version. 1023 * The in-core inode is locked exclusively. Set mode, nlink, and rdev 1024 * appropriately within the inode. The uid and gid for the inode are 1025 * set according to the contents of the given cred structure. 1026 * 1027 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1028 * has a free inode available, call xfs_iget() 1029 * to obtain the in-core version of the allocated inode. Finally, 1030 * fill in the inode and log its initial contents. In this case, 1031 * ialloc_context would be set to NULL and call_again set to false. 1032 * 1033 * If xfs_dialloc() does not have an available inode, 1034 * it will replenish its supply by doing an allocation. Since we can 1035 * only do one allocation within a transaction without deadlocks, we 1036 * must commit the current transaction before returning the inode itself. 1037 * In this case, therefore, we will set call_again to true and return. 1038 * The caller should then commit the current transaction, start a new 1039 * transaction, and call xfs_ialloc() again to actually get the inode. 1040 * 1041 * To ensure that some other process does not grab the inode that 1042 * was allocated during the first call to xfs_ialloc(), this routine 1043 * also returns the [locked] bp pointing to the head of the freelist 1044 * as ialloc_context. The caller should hold this buffer across 1045 * the commit and pass it back into this routine on the second call. 1046 */ 1047int 1048xfs_ialloc( 1049 xfs_trans_t *tp, 1050 xfs_inode_t *pip, 1051 mode_t mode, 1052 nlink_t nlink, 1053 xfs_dev_t rdev, 1054 cred_t *cr, 1055 xfs_prid_t prid, 1056 int okalloc, 1057 xfs_buf_t **ialloc_context, 1058 boolean_t *call_again, 1059 xfs_inode_t **ipp) 1060{ 1061 xfs_ino_t ino; 1062 xfs_inode_t *ip; 1063 xfs_vnode_t *vp; 1064 uint flags; 1065 int error; 1066 1067 /* 1068 * Call the space management code to pick 1069 * the on-disk inode to be allocated. 1070 */ 1071 ASSERT(pip != NULL); 1072 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, 1073 ialloc_context, call_again, &ino); 1074 if (error != 0) { 1075 return error; 1076 } 1077 if (*call_again || ino == NULLFSINO) { 1078 *ipp = NULL; 1079 return 0; 1080 } 1081 ASSERT(*ialloc_context == NULL); 1082 1083 /* 1084 * Get the in-core inode with the lock held exclusively. 1085 * This is because we're setting fields here we need 1086 * to prevent others from looking at until we're done. 1087 */ 1088 error = xfs_trans_iget(tp->t_mountp, tp, ino, XFS_ILOCK_EXCL, &ip); 1089 if (error != 0) { 1090 return error; 1091 } 1092 ASSERT(ip != NULL); 1093 vp = XFS_ITOV(ip); 1094 ASSERT(vp != NULL); 1095 vp->v_type = IFTOVT(mode); 1096 ip->i_d.di_mode = (__uint16_t)mode; 1097 ip->i_d.di_onlink = 0; 1098 ip->i_d.di_nlink = nlink; 1099 ASSERT(ip->i_d.di_nlink == nlink); 1100 ip->i_d.di_uid = curthread->td_ucred->cr_uid; 1101 ip->i_d.di_gid = curthread->td_ucred->cr_groups[0]; 1102 ip->i_d.di_projid = prid; 1103 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1104 1105 /* 1106 * If the superblock version is up to where we support new format 1107 * inodes and this is currently an old format inode, then change 1108 * the inode version number now. This way we only do the conversion 1109 * here rather than here and in the flush/logging code. 1110 */ 1111 if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) && 1112 ip->i_d.di_version == XFS_DINODE_VERSION_1) { 1113 ip->i_d.di_version = XFS_DINODE_VERSION_2; 1114 /* 1115 * We've already zeroed the old link count, the projid field, 1116 * and the pad field. 1117 */ 1118 } 1119 1120 /* 1121 * Project ids won't be stored on disk if we are using a version 1 inode. 1122 */ 1123 if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1124 xfs_bump_ino_vers2(tp, ip); 1125 1126 if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { 1127 ip->i_d.di_gid = pip->i_d.di_gid; 1128 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { 1129 ip->i_d.di_mode |= S_ISGID; 1130 } 1131 } 1132 1133 /* 1134 * If the group ID of the new file does not match the effective group 1135 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared 1136 * (and only if the irix_sgid_inherit compatibility variable is set). 1137 */ 1138 if ((irix_sgid_inherit) && 1139 (ip->i_d.di_mode & S_ISGID) && 1140 (!groupmember((gid_t)ip->i_d.di_gid, curthread->td_ucred))) { 1141 ip->i_d.di_mode &= ~S_ISGID; 1142 } 1143 1144 ip->i_d.di_size = 0; 1145 ip->i_d.di_nextents = 0; 1146 ASSERT(ip->i_d.di_nblocks == 0); 1147 xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); 1148 /* 1149 * di_gen will have been taken care of in xfs_iread. 1150 */ 1151 ip->i_d.di_extsize = 0; 1152 ip->i_d.di_dmevmask = 0; 1153 ip->i_d.di_dmstate = 0; 1154 ip->i_d.di_flags = 0; 1155 flags = XFS_ILOG_CORE; 1156 switch (mode & S_IFMT) { 1157 case S_IFIFO: 1158 case S_IFCHR: 1159 case S_IFBLK: 1160 case S_IFSOCK: 1161 ip->i_d.di_format = XFS_DINODE_FMT_DEV; 1162 ip->i_df.if_u2.if_rdev = rdev; 1163 ip->i_df.if_flags = 0; 1164 flags |= XFS_ILOG_DEV; 1165 break; 1166 case S_IFREG: 1167 case S_IFDIR: 1168 if (pip->i_d.di_flags & 1169 (XFS_DIFLAG_NOATIME|XFS_DIFLAG_NODUMP|XFS_DIFLAG_SYNC)) { 1170 if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && 1171 xfs_inherit_noatime) 1172 ip->i_d.di_flags |= XFS_DIFLAG_NOATIME; 1173 if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && 1174 xfs_inherit_nodump) 1175 ip->i_d.di_flags |= XFS_DIFLAG_NODUMP; 1176 if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && 1177 xfs_inherit_sync) 1178 ip->i_d.di_flags |= XFS_DIFLAG_SYNC; 1179 } 1180 case S_IFLNK: 1181 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1182 ip->i_df.if_flags = XFS_IFEXTENTS; 1183 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; 1184 ip->i_df.if_u1.if_extents = NULL; 1185 break; 1186 default: 1187 ASSERT(0); 1188 } 1189 /* 1190 * Attribute fork settings for new inode. 1191 */ 1192 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1193 ip->i_d.di_anextents = 0; 1194 1195 /* 1196 * Log the new values stuffed into the inode. 1197 */ 1198 xfs_trans_log_inode(tp, ip, flags); 1199 1200 /* now that we have a v_type we can set Linux inode ops (& unlock) */ 1201 XVFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1); 1202 1203 *ipp = ip; 1204 return 0; 1205} 1206 1207/* 1208 * Check to make sure that there are no blocks allocated to the 1209 * file beyond the size of the file. We don't check this for 1210 * files with fixed size extents or real time extents, but we 1211 * at least do it for regular files. 1212 */ 1213#ifdef DEBUG 1214void 1215xfs_isize_check( 1216 xfs_mount_t *mp, 1217 xfs_inode_t *ip, 1218 xfs_fsize_t isize) 1219{ 1220 xfs_fileoff_t map_first; 1221 int nimaps; 1222 xfs_bmbt_irec_t imaps[2]; 1223 1224 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1225 return; 1226 1227 if ( ip->i_d.di_flags & XFS_DIFLAG_REALTIME ) 1228 return; 1229 1230 nimaps = 2; 1231 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 1232 /* 1233 * The filesystem could be shutting down, so bmapi may return 1234 * an error. 1235 */ 1236 if (xfs_bmapi(NULL, ip, map_first, 1237 (XFS_B_TO_FSB(mp, 1238 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - 1239 map_first), 1240 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, 1241 NULL)) 1242 return; 1243 ASSERT(nimaps == 1); 1244 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1245} 1246#endif /* DEBUG */ 1247 1248/* 1249 * Calculate the last possible buffered byte in a file. This must 1250 * include data that was buffered beyond the EOF by the write code. 1251 * This also needs to deal with overflowing the xfs_fsize_t type 1252 * which can happen for sizes near the limit. 1253 * 1254 * We also need to take into account any blocks beyond the EOF. It 1255 * may be the case that they were buffered by a write which failed. 1256 * In that case the pages will still be in memory, but the inode size 1257 * will never have been updated. 1258 */ 1259xfs_fsize_t 1260xfs_file_last_byte( 1261 xfs_inode_t *ip) 1262{ 1263 xfs_mount_t *mp; 1264 xfs_fsize_t last_byte; 1265 xfs_fileoff_t last_block; 1266 xfs_fileoff_t size_last_block; 1267 int error; 1268 1269 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS)); 1270 1271 mp = ip->i_mount; 1272 /* 1273 * Only check for blocks beyond the EOF if the extents have 1274 * been read in. This eliminates the need for the inode lock, 1275 * and it also saves us from looking when it really isn't 1276 * necessary. 1277 */ 1278 if (ip->i_df.if_flags & XFS_IFEXTENTS) { 1279 error = xfs_bmap_last_offset(NULL, ip, &last_block, 1280 XFS_DATA_FORK); 1281 if (error) { 1282 last_block = 0; 1283 } 1284 } else { 1285 last_block = 0; 1286 } 1287 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size); 1288 last_block = XFS_FILEOFF_MAX(last_block, size_last_block); 1289 1290 last_byte = XFS_FSB_TO_B(mp, last_block); 1291 if (last_byte < 0) { 1292 return XFS_MAXIOFFSET(mp); 1293 } 1294 last_byte += (1 << mp->m_writeio_log); 1295 if (last_byte < 0) { 1296 return XFS_MAXIOFFSET(mp); 1297 } 1298 return last_byte; 1299} 1300 1301#if defined(XFS_RW_TRACE) 1302STATIC void 1303xfs_itrunc_trace( 1304 int tag, 1305 xfs_inode_t *ip, 1306 int flag, 1307 xfs_fsize_t new_size, 1308 xfs_off_t toss_start, 1309 xfs_off_t toss_finish) 1310{ 1311 if (ip->i_rwtrace == NULL) { 1312 return; 1313 } 1314 1315 ktrace_enter(ip->i_rwtrace, 1316 (void*)((long)tag), 1317 (void*)ip, 1318 (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff), 1319 (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff), 1320 (void*)((long)flag), 1321 (void*)(unsigned long)((new_size >> 32) & 0xffffffff), 1322 (void*)(unsigned long)(new_size & 0xffffffff), 1323 (void*)(unsigned long)((toss_start >> 32) & 0xffffffff), 1324 (void*)(unsigned long)(toss_start & 0xffffffff), 1325 (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff), 1326 (void*)(unsigned long)(toss_finish & 0xffffffff), 1327 (void*)(unsigned long)current_cpu(), 1328 (void*)0, 1329 (void*)0, 1330 (void*)0, 1331 (void*)0); 1332} 1333#else 1334#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish) 1335#endif 1336 1337/* 1338 * Start the truncation of the file to new_size. The new size 1339 * must be smaller than the current size. This routine will 1340 * clear the buffer and page caches of file data in the removed 1341 * range, and xfs_itruncate_finish() will remove the underlying 1342 * disk blocks. 1343 * 1344 * The inode must have its I/O lock locked EXCLUSIVELY, and it 1345 * must NOT have the inode lock held at all. This is because we're 1346 * calling into the buffer/page cache code and we can't hold the 1347 * inode lock when we do so. 1348 * 1349 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1350 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used 1351 * in the case that the caller is locking things out of order and 1352 * may not be able to call xfs_itruncate_finish() with the inode lock 1353 * held without dropping the I/O lock. If the caller must drop the 1354 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() 1355 * must be called again with all the same restrictions as the initial 1356 * call. 1357 */ 1358void 1359xfs_itruncate_start( 1360 xfs_inode_t *ip, 1361 uint flags, 1362 xfs_fsize_t new_size) 1363{ 1364 xfs_fsize_t last_byte; 1365 xfs_off_t toss_start; 1366 xfs_mount_t *mp; 1367 xfs_vnode_t *vp; 1368 1369 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); 1370 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); 1371 ASSERT((flags == XFS_ITRUNC_DEFINITE) || 1372 (flags == XFS_ITRUNC_MAYBE)); 1373 1374 mp = ip->i_mount; 1375 vp = XFS_ITOV(ip); 1376 /* 1377 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers 1378 * overlapping the region being removed. We have to use 1379 * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the 1380 * caller may not be able to finish the truncate without 1381 * dropping the inode's I/O lock. Make sure 1382 * to catch any pages brought in by buffers overlapping 1383 * the EOF by searching out beyond the isize by our 1384 * block size. We round new_size up to a block boundary 1385 * so that we don't toss things on the same block as 1386 * new_size but before it. 1387 * 1388 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to 1389 * call remapf() over the same region if the file is mapped. 1390 * This frees up mapped file references to the pages in the 1391 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures 1392 * that we get the latest mapped changes flushed out. 1393 */ 1394 toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1395 toss_start = XFS_FSB_TO_B(mp, toss_start); 1396 if (toss_start < 0) { 1397 /* 1398 * The place to start tossing is beyond our maximum 1399 * file size, so there is no way that the data extended 1400 * out there. 1401 */ 1402 return; 1403 } 1404 last_byte = xfs_file_last_byte(ip); 1405 xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, 1406 last_byte); 1407 if (last_byte > toss_start) { 1408 if (flags & XFS_ITRUNC_DEFINITE) { 1409 XVOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED); 1410 } else { 1411 XVOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED); 1412 } 1413 } 1414 1415#ifdef DEBUG 1416 if (new_size == 0) { 1417 ASSERT(VN_CACHED(vp) == 0); 1418 } 1419#endif 1420} 1421 1422/* 1423 * Shrink the file to the given new_size. The new 1424 * size must be smaller than the current size. 1425 * This will free up the underlying blocks 1426 * in the removed range after a call to xfs_itruncate_start() 1427 * or xfs_atruncate_start(). 1428 * 1429 * The transaction passed to this routine must have made 1430 * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. 1431 * This routine may commit the given transaction and 1432 * start new ones, so make sure everything involved in 1433 * the transaction is tidy before calling here. 1434 * Some transaction will be returned to the caller to be 1435 * committed. The incoming transaction must already include 1436 * the inode, and both inode locks must be held exclusively. 1437 * The inode must also be "held" within the transaction. On 1438 * return the inode will be "held" within the returned transaction. 1439 * This routine does NOT require any disk space to be reserved 1440 * for it within the transaction. 1441 * 1442 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, 1443 * and it indicates the fork which is to be truncated. For the 1444 * attribute fork we only support truncation to size 0. 1445 * 1446 * We use the sync parameter to indicate whether or not the first 1447 * transaction we perform might have to be synchronous. For the attr fork, 1448 * it needs to be so if the unlink of the inode is not yet known to be 1449 * permanent in the log. This keeps us from freeing and reusing the 1450 * blocks of the attribute fork before the unlink of the inode becomes 1451 * permanent. 1452 * 1453 * For the data fork, we normally have to run synchronously if we're 1454 * being called out of the inactive path or we're being called 1455 * out of the create path where we're truncating an existing file. 1456 * Either way, the truncate needs to be sync so blocks don't reappear 1457 * in the file with altered data in case of a crash. wsync filesystems 1458 * can run the first case async because anything that shrinks the inode 1459 * has to run sync so by the time we're called here from inactive, the 1460 * inode size is permanently set to 0. 1461 * 1462 * Calls from the truncate path always need to be sync unless we're 1463 * in a wsync filesystem and the file has already been unlinked. 1464 * 1465 * The caller is responsible for correctly setting the sync parameter. 1466 * It gets too hard for us to guess here which path we're being called 1467 * out of just based on inode state. 1468 */ 1469int 1470xfs_itruncate_finish( 1471 xfs_trans_t **tp, 1472 xfs_inode_t *ip, 1473 xfs_fsize_t new_size, 1474 int fork, 1475 int sync) 1476{ 1477 xfs_fsblock_t first_block; 1478 xfs_fileoff_t first_unmap_block; 1479 xfs_fileoff_t last_block; 1480 xfs_filblks_t unmap_len=0; 1481 xfs_mount_t *mp; 1482 xfs_trans_t *ntp; 1483 int done; 1484 int committed; 1485 xfs_bmap_free_t free_list; 1486 int error; 1487 1488 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); 1489 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); 1490 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); 1491 ASSERT(*tp != NULL); 1492 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 1493 ASSERT(ip->i_transp == *tp); 1494 ASSERT(ip->i_itemp != NULL); 1495 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); 1496 1497 1498 ntp = *tp; 1499 mp = (ntp)->t_mountp; 1500 ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); 1501 1502 /* 1503 * We only support truncating the entire attribute fork. 1504 */ 1505 if (fork == XFS_ATTR_FORK) { 1506 new_size = 0LL; 1507 } 1508 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1509 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); 1510 /* 1511 * The first thing we do is set the size to new_size permanently 1512 * on disk. This way we don't have to worry about anyone ever 1513 * being able to look at the data being freed even in the face 1514 * of a crash. What we're getting around here is the case where 1515 * we free a block, it is allocated to another file, it is written 1516 * to, and then we crash. If the new data gets written to the 1517 * file but the log buffers containing the free and reallocation 1518 * don't, then we'd end up with garbage in the blocks being freed. 1519 * As long as we make the new_size permanent before actually 1520 * freeing any blocks it doesn't matter if they get writtten to. 1521 * 1522 * The callers must signal into us whether or not the size 1523 * setting here must be synchronous. There are a few cases 1524 * where it doesn't have to be synchronous. Those cases 1525 * occur if the file is unlinked and we know the unlink is 1526 * permanent or if the blocks being truncated are guaranteed 1527 * to be beyond the inode eof (regardless of the link count) 1528 * and the eof value is permanent. Both of these cases occur 1529 * only on wsync-mounted filesystems. In those cases, we're 1530 * guaranteed that no user will ever see the data in the blocks 1531 * that are being truncated so the truncate can run async. 1532 * In the free beyond eof case, the file may wind up with 1533 * more blocks allocated to it than it needs if we crash 1534 * and that won't get fixed until the next time the file 1535 * is re-opened and closed but that's ok as that shouldn't 1536 * be too many blocks. 1537 * 1538 * However, we can't just make all wsync xactions run async 1539 * because there's one call out of the create path that needs 1540 * to run sync where it's truncating an existing file to size 1541 * 0 whose size is > 0. 1542 * 1543 * It's probably possible to come up with a test in this 1544 * routine that would correctly distinguish all the above 1545 * cases from the values of the function parameters and the 1546 * inode state but for sanity's sake, I've decided to let the 1547 * layers above just tell us. It's simpler to correctly figure 1548 * out in the layer above exactly under what conditions we 1549 * can run async and I think it's easier for others read and 1550 * follow the logic in case something has to be changed. 1551 * cscope is your friend -- rcc. 1552 * 1553 * The attribute fork is much simpler. 1554 * 1555 * For the attribute fork we allow the caller to tell us whether 1556 * the unlink of the inode that led to this call is yet permanent 1557 * in the on disk log. If it is not and we will be freeing extents 1558 * in this inode then we make the first transaction synchronous 1559 * to make sure that the unlink is permanent by the time we free 1560 * the blocks. 1561 */ 1562 if (fork == XFS_DATA_FORK) { 1563 if (ip->i_d.di_nextents > 0) { 1564 ip->i_d.di_size = new_size; 1565 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1566 } 1567 } else if (sync) { 1568 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); 1569 if (ip->i_d.di_anextents > 0) 1570 xfs_trans_set_sync(ntp); 1571 } 1572 ASSERT(fork == XFS_DATA_FORK || 1573 (fork == XFS_ATTR_FORK && 1574 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || 1575 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); 1576 1577 /* 1578 * Since it is possible for space to become allocated beyond 1579 * the end of the file (in a crash where the space is allocated 1580 * but the inode size is not yet updated), simply remove any 1581 * blocks which show up between the new EOF and the maximum 1582 * possible file size. If the first block to be removed is 1583 * beyond the maximum file size (ie it is the same as last_block), 1584 * then there is nothing to do. 1585 */ 1586 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 1587 ASSERT(first_unmap_block <= last_block); 1588 done = 0; 1589 if (last_block == first_unmap_block) { 1590 done = 1; 1591 } else { 1592 unmap_len = last_block - first_unmap_block + 1; 1593 } 1594 while (!done) { 1595 /* 1596 * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() 1597 * will tell us whether it freed the entire range or 1598 * not. If this is a synchronous mount (wsync), 1599 * then we can tell bunmapi to keep all the 1600 * transactions asynchronous since the unlink 1601 * transaction that made this inode inactive has 1602 * already hit the disk. There's no danger of 1603 * the freed blocks being reused, there being a 1604 * crash, and the reused blocks suddenly reappearing 1605 * in this file with garbage in them once recovery 1606 * runs. 1607 */ 1608 XFS_BMAP_INIT(&free_list, &first_block); 1609 error = xfs_bunmapi(ntp, ip, first_unmap_block, 1610 unmap_len, 1611 XFS_BMAPI_AFLAG(fork) | 1612 (sync ? 0 : XFS_BMAPI_ASYNC), 1613 XFS_ITRUNC_MAX_EXTENTS, 1614 &first_block, &free_list, &done); 1615 if (error) { 1616 /* 1617 * If the bunmapi call encounters an error, 1618 * return to the caller where the transaction 1619 * can be properly aborted. We just need to 1620 * make sure we're not holding any resources 1621 * that we were not when we came in. 1622 */ 1623 xfs_bmap_cancel(&free_list); 1624 return error; 1625 } 1626 1627 /* 1628 * Duplicate the transaction that has the permanent 1629 * reservation and commit the old transaction. 1630 */ 1631 error = xfs_bmap_finish(tp, &free_list, first_block, 1632 &committed); 1633 ntp = *tp; 1634 if (error) { 1635 /* 1636 * If the bmap finish call encounters an error, 1637 * return to the caller where the transaction 1638 * can be properly aborted. We just need to 1639 * make sure we're not holding any resources 1640 * that we were not when we came in. 1641 * 1642 * Aborting from this point might lose some 1643 * blocks in the file system, but oh well. 1644 */ 1645 xfs_bmap_cancel(&free_list); 1646 if (committed) { 1647 /* 1648 * If the passed in transaction committed 1649 * in xfs_bmap_finish(), then we want to 1650 * add the inode to this one before returning. 1651 * This keeps things simple for the higher 1652 * level code, because it always knows that 1653 * the inode is locked and held in the 1654 * transaction that returns to it whether 1655 * errors occur or not. We don't mark the 1656 * inode dirty so that this transaction can 1657 * be easily aborted if possible. 1658 */ 1659 xfs_trans_ijoin(ntp, ip, 1660 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1661 xfs_trans_ihold(ntp, ip); 1662 } 1663 return error; 1664 } 1665 1666 if (committed) { 1667 /* 1668 * The first xact was committed, 1669 * so add the inode to the new one. 1670 * Mark it dirty so it will be logged 1671 * and moved forward in the log as 1672 * part of every commit. 1673 */ 1674 xfs_trans_ijoin(ntp, ip, 1675 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1676 xfs_trans_ihold(ntp, ip); 1677 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1678 } 1679 ntp = xfs_trans_dup(ntp); 1680 (void) xfs_trans_commit(*tp, 0, NULL); 1681 *tp = ntp; 1682 error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 1683 XFS_TRANS_PERM_LOG_RES, 1684 XFS_ITRUNCATE_LOG_COUNT); 1685 /* 1686 * Add the inode being truncated to the next chained 1687 * transaction. 1688 */ 1689 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1690 xfs_trans_ihold(ntp, ip); 1691 if (error) 1692 return (error); 1693 } 1694 /* 1695 * Only update the size in the case of the data fork, but 1696 * always re-log the inode so that our permanent transaction 1697 * can keep on rolling it forward in the log. 1698 */ 1699 if (fork == XFS_DATA_FORK) { 1700 xfs_isize_check(mp, ip, new_size); 1701 ip->i_d.di_size = new_size; 1702 } 1703 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1704 ASSERT((new_size != 0) || 1705 (fork == XFS_ATTR_FORK) || 1706 (ip->i_delayed_blks == 0)); 1707 ASSERT((new_size != 0) || 1708 (fork == XFS_ATTR_FORK) || 1709 (ip->i_d.di_nextents == 0)); 1710 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); 1711 return 0; 1712} 1713 1714 1715/* 1716 * xfs_igrow_start 1717 * 1718 * Do the first part of growing a file: zero any data in the last 1719 * block that is beyond the old EOF. We need to do this before 1720 * the inode is joined to the transaction to modify the i_size. 1721 * That way we can drop the inode lock and call into the buffer 1722 * cache to get the buffer mapping the EOF. 1723 */ 1724int 1725xfs_igrow_start( 1726 xfs_inode_t *ip, 1727 xfs_fsize_t new_size, 1728 cred_t *credp) 1729{ 1730 xfs_fsize_t isize; 1731 int error; 1732 1733 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); 1734 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); 1735 ASSERT(new_size > ip->i_d.di_size); 1736 1737 error = 0; 1738 isize = ip->i_d.di_size; 1739 /* 1740 * Zero any pages that may have been created by 1741 * xfs_write_file() beyond the end of the file 1742 * and any blocks between the old and new file sizes. 1743 */ 1744 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, 1745 new_size); 1746 return error; 1747} 1748 1749/* 1750 * xfs_igrow_finish 1751 * 1752 * This routine is called to extend the size of a file. 1753 * The inode must have both the iolock and the ilock locked 1754 * for update and it must be a part of the current transaction. 1755 * The xfs_igrow_start() function must have been called previously. 1756 * If the change_flag is not zero, the inode change timestamp will 1757 * be updated. 1758 */ 1759void 1760xfs_igrow_finish( 1761 xfs_trans_t *tp, 1762 xfs_inode_t *ip, 1763 xfs_fsize_t new_size, 1764 int change_flag) 1765{ 1766 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); 1767 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); 1768 ASSERT(ip->i_transp == tp); 1769 ASSERT(new_size > ip->i_d.di_size); 1770 1771 /* 1772 * Update the file size. Update the inode change timestamp 1773 * if change_flag set. 1774 */ 1775 ip->i_d.di_size = new_size; 1776 if (change_flag) 1777 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 1778 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1779 1780} 1781 1782 1783/* 1784 * This is called when the inode's link count goes to 0. 1785 * We place the on-disk inode on a list in the AGI. It 1786 * will be pulled from this list when the inode is freed. 1787 */ 1788int 1789xfs_iunlink( 1790 xfs_trans_t *tp, 1791 xfs_inode_t *ip) 1792{ 1793 xfs_mount_t *mp; 1794 xfs_agi_t *agi; 1795 xfs_dinode_t *dip; 1796 xfs_buf_t *agibp; 1797 xfs_buf_t *ibp; 1798 xfs_agnumber_t agno; 1799 xfs_daddr_t agdaddr; 1800 xfs_agino_t agino; 1801 short bucket_index; 1802 int offset; 1803 int error; 1804 int agi_ok; 1805 1806 ASSERT(ip->i_d.di_nlink == 0); 1807 ASSERT(ip->i_d.di_mode != 0); 1808 ASSERT(ip->i_transp == tp); 1809 1810 mp = tp->t_mountp; 1811 1812 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1813 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); 1814 1815 /* 1816 * Get the agi buffer first. It ensures lock ordering 1817 * on the list. 1818 */ 1819 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1820 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1821 if (error) { 1822 return error; 1823 } 1824 /* 1825 * Validate the magic number of the agi block. 1826 */ 1827 agi = XFS_BUF_TO_AGI(agibp); 1828 agi_ok = 1829 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC && 1830 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT)); 1831 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK, 1832 XFS_RANDOM_IUNLINK))) { 1833 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi); 1834 xfs_trans_brelse(tp, agibp); 1835 return XFS_ERROR(EFSCORRUPTED); 1836 } 1837 /* 1838 * Get the index into the agi hash table for the 1839 * list this inode will go on. 1840 */ 1841 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1842 ASSERT(agino != 0); 1843 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1844 ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT)); 1845 ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != agino); 1846 1847 if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO) { 1848 /* 1849 * There is already another inode in the bucket we need 1850 * to add ourselves to. Add us at the front of the list. 1851 * Here we put the head pointer into our next pointer, 1852 * and then we fall through to point the head at us. 1853 */ 1854 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); 1855 if (error) { 1856 return error; 1857 } 1858 ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO); 1859 ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT)); 1860 /* both on-disk, don't endian flip twice */ 1861 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1862 offset = ip->i_boffset + 1863 offsetof(xfs_dinode_t, di_next_unlinked); 1864 xfs_trans_inode_buf(tp, ibp); 1865 xfs_trans_log_buf(tp, ibp, offset, 1866 (offset + sizeof(xfs_agino_t) - 1)); 1867 xfs_inobp_check(mp, ibp); 1868 } 1869 1870 /* 1871 * Point the bucket head pointer at the inode being inserted. 1872 */ 1873 ASSERT(agino != 0); 1874 INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, agino); 1875 offset = offsetof(xfs_agi_t, agi_unlinked) + 1876 (sizeof(xfs_agino_t) * bucket_index); 1877 xfs_trans_log_buf(tp, agibp, offset, 1878 (offset + sizeof(xfs_agino_t) - 1)); 1879 return 0; 1880} 1881 1882/* 1883 * Pull the on-disk inode from the AGI unlinked list. 1884 */ 1885STATIC int 1886xfs_iunlink_remove( 1887 xfs_trans_t *tp, 1888 xfs_inode_t *ip) 1889{ 1890 xfs_ino_t next_ino; 1891 xfs_mount_t *mp; 1892 xfs_agi_t *agi; 1893 xfs_dinode_t *dip; 1894 xfs_buf_t *agibp; 1895 xfs_buf_t *ibp; 1896 xfs_agnumber_t agno; 1897 xfs_daddr_t agdaddr; 1898 xfs_agino_t agino; 1899 xfs_agino_t next_agino; 1900 xfs_buf_t *last_ibp; 1901 xfs_dinode_t *last_dip; 1902 short bucket_index; 1903 int offset, last_offset; 1904 int error; 1905 int agi_ok; 1906 1907 /* 1908 * First pull the on-disk inode from the AGI unlinked list. 1909 */ 1910 mp = tp->t_mountp; 1911 1912 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1913 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); 1914 1915 /* 1916 * Get the agi buffer first. It ensures lock ordering 1917 * on the list. 1918 */ 1919 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1920 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1921 if (error) { 1922 cmn_err(CE_WARN, 1923 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.", 1924 error, mp->m_fsname); 1925 return error; 1926 } 1927 /* 1928 * Validate the magic number of the agi block. 1929 */ 1930 agi = XFS_BUF_TO_AGI(agibp); 1931 agi_ok = 1932 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC && 1933 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT)); 1934 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE, 1935 XFS_RANDOM_IUNLINK_REMOVE))) { 1936 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW, 1937 mp, agi); 1938 xfs_trans_brelse(tp, agibp); 1939 cmn_err(CE_WARN, 1940 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.", 1941 mp->m_fsname); 1942 return XFS_ERROR(EFSCORRUPTED); 1943 } 1944 /* 1945 * Get the index into the agi hash table for the 1946 * list this inode will go on. 1947 */ 1948 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1949 ASSERT(agino != 0); 1950 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1951 ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO); 1952 ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT)); 1953 1954 if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) == agino) { 1955 /* 1956 * We're at the head of the list. Get the inode's 1957 * on-disk buffer to see if there is anyone after us 1958 * on the list. Only modify our next pointer if it 1959 * is not already NULLAGINO. This saves us the overhead 1960 * of dealing with the buffer when there is no need to 1961 * change it. 1962 */ 1963 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); 1964 if (error) { 1965 cmn_err(CE_WARN, 1966 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1967 error, mp->m_fsname); 1968 return error; 1969 } 1970 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); 1971 ASSERT(next_agino != 0); 1972 if (next_agino != NULLAGINO) { 1973 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); 1974 offset = ip->i_boffset + 1975 offsetof(xfs_dinode_t, di_next_unlinked); 1976 xfs_trans_inode_buf(tp, ibp); 1977 xfs_trans_log_buf(tp, ibp, offset, 1978 (offset + sizeof(xfs_agino_t) - 1)); 1979 xfs_inobp_check(mp, ibp); 1980 } else { 1981 xfs_trans_brelse(tp, ibp); 1982 } 1983 /* 1984 * Point the bucket head pointer at the next inode. 1985 */ 1986 ASSERT(next_agino != 0); 1987 ASSERT(next_agino != agino); 1988 INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, next_agino); 1989 offset = offsetof(xfs_agi_t, agi_unlinked) + 1990 (sizeof(xfs_agino_t) * bucket_index); 1991 xfs_trans_log_buf(tp, agibp, offset, 1992 (offset + sizeof(xfs_agino_t) - 1)); 1993 } else { 1994 /* 1995 * We need to search the list for the inode being freed. 1996 */ 1997 next_agino = INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT); 1998 last_ibp = NULL; 1999 while (next_agino != agino) { 2000 /* 2001 * If the last inode wasn't the one pointing to 2002 * us, then release its buffer since we're not 2003 * going to do anything with it. 2004 */ 2005 if (last_ibp != NULL) { 2006 xfs_trans_brelse(tp, last_ibp); 2007 } 2008 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 2009 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 2010 &last_ibp, &last_offset); 2011 if (error) { 2012 cmn_err(CE_WARN, 2013 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 2014 error, mp->m_fsname); 2015 return error; 2016 } 2017 next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT); 2018 ASSERT(next_agino != NULLAGINO); 2019 ASSERT(next_agino != 0); 2020 } 2021 /* 2022 * Now last_ibp points to the buffer previous to us on 2023 * the unlinked list. Pull us from the list. 2024 */ 2025 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); 2026 if (error) { 2027 cmn_err(CE_WARN, 2028 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2029 error, mp->m_fsname); 2030 return error; 2031 } 2032 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); 2033 ASSERT(next_agino != 0); 2034 ASSERT(next_agino != agino); 2035 if (next_agino != NULLAGINO) { 2036 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); 2037 offset = ip->i_boffset + 2038 offsetof(xfs_dinode_t, di_next_unlinked); 2039 xfs_trans_inode_buf(tp, ibp); 2040 xfs_trans_log_buf(tp, ibp, offset, 2041 (offset + sizeof(xfs_agino_t) - 1)); 2042 xfs_inobp_check(mp, ibp); 2043 } else { 2044 xfs_trans_brelse(tp, ibp); 2045 } 2046 /* 2047 * Point the previous inode on the list to the next inode. 2048 */ 2049 INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino); 2050 ASSERT(next_agino != 0); 2051 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); 2052 xfs_trans_inode_buf(tp, last_ibp); 2053 xfs_trans_log_buf(tp, last_ibp, offset, 2054 (offset + sizeof(xfs_agino_t) - 1)); 2055 xfs_inobp_check(mp, last_ibp); 2056 } 2057 return 0; 2058} 2059 2060static __inline__ int xfs_inode_clean(xfs_inode_t *ip) 2061{ 2062 return (((ip->i_itemp == NULL) || 2063 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && 2064 (ip->i_update_core == 0)); 2065} 2066 2067STATIC void 2068xfs_ifree_cluster( 2069 xfs_inode_t *free_ip, 2070 xfs_trans_t *tp, 2071 xfs_ino_t inum) 2072{ 2073 xfs_mount_t *mp = free_ip->i_mount; 2074 int blks_per_cluster; 2075 int nbufs; 2076 int ninodes; 2077 int i, j, found, pre_flushed; 2078 xfs_daddr_t blkno; 2079 xfs_buf_t *bp; 2080 xfs_ihash_t *ih; 2081 xfs_inode_t *ip, **ip_found; 2082 xfs_inode_log_item_t *iip; 2083 xfs_log_item_t *lip; 2084 SPLDECL(s); 2085 2086 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 2087 blks_per_cluster = 1; 2088 ninodes = mp->m_sb.sb_inopblock; 2089 nbufs = XFS_IALLOC_BLOCKS(mp); 2090 } else { 2091 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / 2092 mp->m_sb.sb_blocksize; 2093 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; 2094 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 2095 } 2096 2097 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS); 2098 2099 for (j = 0; j < nbufs; j++, inum += ninodes) { 2100 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2101 XFS_INO_TO_AGBNO(mp, inum)); 2102 2103 2104 /* 2105 * Look for each inode in memory and attempt to lock it, 2106 * we can be racing with flush and tail pushing here. 2107 * any inode we get the locks on, add to an array of 2108 * inode items to process later. 2109 * 2110 * The get the buffer lock, we could beat a flush 2111 * or tail pushing thread to the lock here, in which 2112 * case they will go looking for the inode buffer 2113 * and fail, we need some other form of interlock 2114 * here. 2115 */ 2116 found = 0; 2117 for (i = 0; i < ninodes; i++) { 2118 ih = XFS_IHASH(mp, inum + i); 2119 read_lock(&ih->ih_lock); 2120 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { 2121 if (ip->i_ino == inum + i) 2122 break; 2123 } 2124 2125 /* Inode not in memory or we found it already, 2126 * nothing to do 2127 */ 2128 if (!ip || (ip->i_flags & XFS_ISTALE)) { 2129 read_unlock(&ih->ih_lock); 2130 continue; 2131 } 2132 2133 if (xfs_inode_clean(ip)) { 2134 read_unlock(&ih->ih_lock); 2135 continue; 2136 } 2137 2138 /* If we can get the locks then add it to the 2139 * list, otherwise by the time we get the bp lock 2140 * below it will already be attached to the 2141 * inode buffer. 2142 */ 2143 2144 /* This inode will already be locked - by us, lets 2145 * keep it that way. 2146 */ 2147 2148 if (ip == free_ip) { 2149 if (xfs_iflock_nowait(ip)) { 2150 ip->i_flags |= XFS_ISTALE; 2151 2152 if (xfs_inode_clean(ip)) { 2153 xfs_ifunlock(ip); 2154 } else { 2155 ip_found[found++] = ip; 2156 } 2157 } 2158 read_unlock(&ih->ih_lock); 2159 continue; 2160 } 2161 2162 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2163 if (xfs_iflock_nowait(ip)) { 2164 ip->i_flags |= XFS_ISTALE; 2165 2166 if (xfs_inode_clean(ip)) { 2167 xfs_ifunlock(ip); 2168 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2169 } else { 2170 ip_found[found++] = ip; 2171 } 2172 } else { 2173 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2174 } 2175 } 2176 2177 read_unlock(&ih->ih_lock); 2178 } 2179 2180 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2181 mp->m_bsize * blks_per_cluster, 2182 XFS_BUF_LOCK); 2183 2184 pre_flushed = 0; 2185 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2186 while (lip) { 2187 if (lip->li_type == XFS_LI_INODE) { 2188 iip = (xfs_inode_log_item_t *)lip; 2189 ASSERT(iip->ili_logged == 1); 2190 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 2191 AIL_LOCK(mp,s); 2192 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2193 AIL_UNLOCK(mp, s); 2194 iip->ili_inode->i_flags |= XFS_ISTALE; 2195 pre_flushed++; 2196 } 2197 lip = lip->li_bio_list; 2198 } 2199 2200 for (i = 0; i < found; i++) { 2201 ip = ip_found[i]; 2202 iip = ip->i_itemp; 2203 2204 if (!iip) { 2205 ip->i_update_core = 0; 2206 xfs_ifunlock(ip); 2207 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2208 continue; 2209 } 2210 2211 iip->ili_last_fields = iip->ili_format.ilf_fields; 2212 iip->ili_format.ilf_fields = 0; 2213 iip->ili_logged = 1; 2214 AIL_LOCK(mp,s); 2215 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2216 AIL_UNLOCK(mp, s); 2217 2218 xfs_buf_attach_iodone(bp, 2219 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2220 xfs_istale_done, (xfs_log_item_t *)iip); 2221 if (ip != free_ip) { 2222 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2223 } 2224 } 2225 2226 if (found || pre_flushed) 2227 xfs_trans_stale_inode_buf(tp, bp); 2228 xfs_trans_binval(tp, bp); 2229 } 2230 2231 kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); 2232} 2233 2234/* 2235 * This is called to return an inode to the inode free list. 2236 * The inode should already be truncated to 0 length and have 2237 * no pages associated with it. This routine also assumes that 2238 * the inode is already a part of the transaction. 2239 * 2240 * The on-disk copy of the inode will have been added to the list 2241 * of unlinked inodes in the AGI. We need to remove the inode from 2242 * that list atomically with respect to freeing it here. 2243 */ 2244int 2245xfs_ifree( 2246 xfs_trans_t *tp, 2247 xfs_inode_t *ip, 2248 xfs_bmap_free_t *flist) 2249{ 2250 int error; 2251 int delete; 2252 xfs_ino_t first_ino; 2253 2254 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); 2255 ASSERT(ip->i_transp == tp); 2256 ASSERT(ip->i_d.di_nlink == 0); 2257 ASSERT(ip->i_d.di_nextents == 0); 2258 ASSERT(ip->i_d.di_anextents == 0); 2259 ASSERT((ip->i_d.di_size == 0) || 2260 ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); 2261 ASSERT(ip->i_d.di_nblocks == 0); 2262 2263 /* 2264 * Pull the on-disk inode from the AGI unlinked list. 2265 */ 2266 error = xfs_iunlink_remove(tp, ip); 2267 if (error != 0) { 2268 return error; 2269 } 2270 2271 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2272 if (error != 0) { 2273 return error; 2274 } 2275 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2276 ip->i_d.di_flags = 0; 2277 ip->i_d.di_dmevmask = 0; 2278 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 2279 ip->i_df.if_ext_max = 2280 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 2281 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 2282 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 2283 /* 2284 * Bump the generation count so no one will be confused 2285 * by reincarnations of this inode. 2286 */ 2287 ip->i_d.di_gen++; 2288 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2289 2290 if (delete) { 2291 xfs_ifree_cluster(ip, tp, first_ino); 2292 } 2293 2294 return 0; 2295} 2296 2297/* 2298 * Reallocate the space for if_broot based on the number of records 2299 * being added or deleted as indicated in rec_diff. Move the records 2300 * and pointers in if_broot to fit the new size. When shrinking this 2301 * will eliminate holes between the records and pointers created by 2302 * the caller. When growing this will create holes to be filled in 2303 * by the caller. 2304 * 2305 * The caller must not request to add more records than would fit in 2306 * the on-disk inode root. If the if_broot is currently NULL, then 2307 * if we adding records one will be allocated. The caller must also 2308 * not request that the number of records go below zero, although 2309 * it can go to zero. 2310 * 2311 * ip -- the inode whose if_broot area is changing 2312 * ext_diff -- the change in the number of records, positive or negative, 2313 * requested for the if_broot array. 2314 */ 2315void 2316xfs_iroot_realloc( 2317 xfs_inode_t *ip, 2318 int rec_diff, 2319 int whichfork) 2320{ 2321 int cur_max; 2322 xfs_ifork_t *ifp; 2323 xfs_bmbt_block_t *new_broot; 2324 int new_max; 2325 size_t new_size; 2326 char *np; 2327 char *op; 2328 2329 /* 2330 * Handle the degenerate case quietly. 2331 */ 2332 if (rec_diff == 0) { 2333 return; 2334 } 2335 2336 ifp = XFS_IFORK_PTR(ip, whichfork); 2337 if (rec_diff > 0) { 2338 /* 2339 * If there wasn't any memory allocated before, just 2340 * allocate it now and get out. 2341 */ 2342 if (ifp->if_broot_bytes == 0) { 2343 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2344 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, 2345 KM_SLEEP); 2346 ifp->if_broot_bytes = (int)new_size; 2347 return; 2348 } 2349 2350 /* 2351 * If there is already an existing if_broot, then we need 2352 * to realloc() it and shift the pointers to their new 2353 * location. The records don't change location because 2354 * they are kept butted up against the btree block header. 2355 */ 2356 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2357 new_max = cur_max + rec_diff; 2358 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2359 ifp->if_broot = (xfs_bmbt_block_t *) 2360 kmem_realloc(ifp->if_broot, 2361 new_size, 2362 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2363 KM_SLEEP); 2364 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2365 ifp->if_broot_bytes); 2366 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2367 (int)new_size); 2368 ifp->if_broot_bytes = (int)new_size; 2369 ASSERT(ifp->if_broot_bytes <= 2370 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2371 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 2372 return; 2373 } 2374 2375 /* 2376 * rec_diff is less than 0. In this case, we are shrinking the 2377 * if_broot buffer. It must already exist. If we go to zero 2378 * records, just get rid of the root and clear the status bit. 2379 */ 2380 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2381 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2382 new_max = cur_max + rec_diff; 2383 ASSERT(new_max >= 0); 2384 if (new_max > 0) 2385 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2386 else 2387 new_size = 0; 2388 if (new_size > 0) { 2389 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); 2390 /* 2391 * First copy over the btree block header. 2392 */ 2393 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); 2394 } else { 2395 new_broot = NULL; 2396 ifp->if_flags &= ~XFS_IFBROOT; 2397 } 2398 2399 /* 2400 * Only copy the records and pointers if there are any. 2401 */ 2402 if (new_max > 0) { 2403 /* 2404 * First copy the records. 2405 */ 2406 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, 2407 ifp->if_broot_bytes); 2408 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1, 2409 (int)new_size); 2410 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2411 2412 /* 2413 * Then copy the pointers. 2414 */ 2415 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2416 ifp->if_broot_bytes); 2417 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, 2418 (int)new_size); 2419 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2420 } 2421 kmem_free(ifp->if_broot, ifp->if_broot_bytes); 2422 ifp->if_broot = new_broot; 2423 ifp->if_broot_bytes = (int)new_size; 2424 ASSERT(ifp->if_broot_bytes <= 2425 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2426 return; 2427} 2428 2429 2430/* 2431 * This is called when the amount of space needed for if_extents 2432 * is increased or decreased. The change in size is indicated by 2433 * the number of extents that need to be added or deleted in the 2434 * ext_diff parameter. 2435 * 2436 * If the amount of space needed has decreased below the size of the 2437 * inline buffer, then switch to using the inline buffer. Otherwise, 2438 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2439 * to what is needed. 2440 * 2441 * ip -- the inode whose if_extents area is changing 2442 * ext_diff -- the change in the number of extents, positive or negative, 2443 * requested for the if_extents array. 2444 */ 2445void 2446xfs_iext_realloc( 2447 xfs_inode_t *ip, 2448 int ext_diff, 2449 int whichfork) 2450{ 2451 int byte_diff; 2452 xfs_ifork_t *ifp; 2453 int new_size; 2454 uint rnew_size; 2455 2456 if (ext_diff == 0) { 2457 return; 2458 } 2459 2460 ifp = XFS_IFORK_PTR(ip, whichfork); 2461 byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t); 2462 new_size = (int)ifp->if_bytes + byte_diff; 2463 ASSERT(new_size >= 0); 2464 2465 if (new_size == 0) { 2466 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) { 2467 ASSERT(ifp->if_real_bytes != 0); 2468 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); 2469 } 2470 ifp->if_u1.if_extents = NULL; 2471 rnew_size = 0; 2472 } else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) { 2473 /* 2474 * If the valid extents can fit in if_inline_ext, 2475 * copy them from the malloc'd vector and free it. 2476 */ 2477 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) { 2478 /* 2479 * For now, empty files are format EXTENTS, 2480 * so the if_extents pointer is null. 2481 */ 2482 if (ifp->if_u1.if_extents) { 2483 memcpy(ifp->if_u2.if_inline_ext, 2484 ifp->if_u1.if_extents, new_size); 2485 kmem_free(ifp->if_u1.if_extents, 2486 ifp->if_real_bytes); 2487 } 2488 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 2489 } 2490 rnew_size = 0; 2491 } else { 2492 rnew_size = new_size; 2493 if ((rnew_size & (rnew_size - 1)) != 0) 2494 rnew_size = xfs_iroundup(rnew_size); 2495 /* 2496 * Stuck with malloc/realloc. 2497 */ 2498 if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) { 2499 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) 2500 kmem_alloc(rnew_size, KM_SLEEP); 2501 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 2502 sizeof(ifp->if_u2.if_inline_ext)); 2503 } else if (rnew_size != ifp->if_real_bytes) { 2504 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) 2505 kmem_realloc(ifp->if_u1.if_extents, 2506 rnew_size, 2507 ifp->if_real_bytes, 2508 KM_NOFS); 2509 } 2510 } 2511 ifp->if_real_bytes = rnew_size; 2512 ifp->if_bytes = new_size; 2513} 2514 2515 2516/* 2517 * This is called when the amount of space needed for if_data 2518 * is increased or decreased. The change in size is indicated by 2519 * the number of bytes that need to be added or deleted in the 2520 * byte_diff parameter. 2521 * 2522 * If the amount of space needed has decreased below the size of the 2523 * inline buffer, then switch to using the inline buffer. Otherwise, 2524 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2525 * to what is needed. 2526 * 2527 * ip -- the inode whose if_data area is changing 2528 * byte_diff -- the change in the number of bytes, positive or negative, 2529 * requested for the if_data array. 2530 */ 2531void 2532xfs_idata_realloc( 2533 xfs_inode_t *ip, 2534 int byte_diff, 2535 int whichfork) 2536{ 2537 xfs_ifork_t *ifp; 2538 int new_size; 2539 int real_size; 2540 2541 if (byte_diff == 0) { 2542 return; 2543 } 2544 2545 ifp = XFS_IFORK_PTR(ip, whichfork); 2546 new_size = (int)ifp->if_bytes + byte_diff; 2547 ASSERT(new_size >= 0); 2548 2549 if (new_size == 0) { 2550 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2551 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); 2552 } 2553 ifp->if_u1.if_data = NULL; 2554 real_size = 0; 2555 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2556 /* 2557 * If the valid extents/data can fit in if_inline_ext/data, 2558 * copy them from the malloc'd vector and free it. 2559 */ 2560 if (ifp->if_u1.if_data == NULL) { 2561 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2562 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2563 ASSERT(ifp->if_real_bytes != 0); 2564 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 2565 new_size); 2566 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); 2567 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2568 } 2569 real_size = 0; 2570 } else { 2571 /* 2572 * Stuck with malloc/realloc. 2573 * For inline data, the underlying buffer must be 2574 * a multiple of 4 bytes in size so that it can be 2575 * logged and stay on word boundaries. We enforce 2576 * that here. 2577 */ 2578 real_size = roundup(new_size, 4); 2579 if (ifp->if_u1.if_data == NULL) { 2580 ASSERT(ifp->if_real_bytes == 0); 2581 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2582 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2583 /* 2584 * Only do the realloc if the underlying size 2585 * is really changing. 2586 */ 2587 if (ifp->if_real_bytes != real_size) { 2588 ifp->if_u1.if_data = 2589 kmem_realloc(ifp->if_u1.if_data, 2590 real_size, 2591 ifp->if_real_bytes, 2592 KM_SLEEP); 2593 } 2594 } else { 2595 ASSERT(ifp->if_real_bytes == 0); 2596 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2597 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2598 ifp->if_bytes); 2599 } 2600 } 2601 ifp->if_real_bytes = real_size; 2602 ifp->if_bytes = new_size; 2603 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2604} 2605 2606 2607 2608 2609/* 2610 * Map inode to disk block and offset. 2611 * 2612 * mp -- the mount point structure for the current file system 2613 * tp -- the current transaction 2614 * ino -- the inode number of the inode to be located 2615 * imap -- this structure is filled in with the information necessary 2616 * to retrieve the given inode from disk 2617 * flags -- flags to pass to xfs_dilocate indicating whether or not 2618 * lookups in the inode btree were OK or not 2619 */ 2620int 2621xfs_imap( 2622 xfs_mount_t *mp, 2623 xfs_trans_t *tp, 2624 xfs_ino_t ino, 2625 xfs_imap_t *imap, 2626 uint flags) 2627{ 2628 xfs_fsblock_t fsbno; 2629 int len; 2630 int off; 2631 int error; 2632 2633 fsbno = imap->im_blkno ? 2634 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; 2635 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); 2636 if (error != 0) { 2637 return error; 2638 } 2639 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); 2640 imap->im_len = XFS_FSB_TO_BB(mp, len); 2641 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); 2642 imap->im_ioffset = (ushort)off; 2643 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); 2644 return 0; 2645} 2646 2647void 2648xfs_idestroy_fork( 2649 xfs_inode_t *ip, 2650 int whichfork) 2651{ 2652 xfs_ifork_t *ifp; 2653 2654 ifp = XFS_IFORK_PTR(ip, whichfork); 2655 if (ifp->if_broot != NULL) { 2656 kmem_free(ifp->if_broot, ifp->if_broot_bytes); 2657 ifp->if_broot = NULL; 2658 } 2659 2660 /* 2661 * If the format is local, then we can't have an extents 2662 * array so just look for an inline data array. If we're 2663 * not local then we may or may not have an extents list, 2664 * so check and free it up if we do. 2665 */ 2666 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 2667 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && 2668 (ifp->if_u1.if_data != NULL)) { 2669 ASSERT(ifp->if_real_bytes != 0); 2670 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); 2671 ifp->if_u1.if_data = NULL; 2672 ifp->if_real_bytes = 0; 2673 } 2674 } else if ((ifp->if_flags & XFS_IFEXTENTS) && 2675 (ifp->if_u1.if_extents != NULL) && 2676 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) { 2677 ASSERT(ifp->if_real_bytes != 0); 2678 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); 2679 ifp->if_u1.if_extents = NULL; 2680 ifp->if_real_bytes = 0; 2681 } 2682 ASSERT(ifp->if_u1.if_extents == NULL || 2683 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); 2684 ASSERT(ifp->if_real_bytes == 0); 2685 if (whichfork == XFS_ATTR_FORK) { 2686 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 2687 ip->i_afp = NULL; 2688 } 2689} 2690 2691/* 2692 * This is called free all the memory associated with an inode. 2693 * It must free the inode itself and any buffers allocated for 2694 * if_extents/if_data and if_broot. It must also free the lock 2695 * associated with the inode. 2696 */ 2697void 2698xfs_idestroy( 2699 xfs_inode_t *ip) 2700{ 2701 2702 switch (ip->i_d.di_mode & S_IFMT) { 2703 case S_IFREG: 2704 case S_IFDIR: 2705 case S_IFLNK: 2706 xfs_idestroy_fork(ip, XFS_DATA_FORK); 2707 break; 2708 } 2709 if (ip->i_afp) 2710 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 2711 mrfree(&ip->i_lock); 2712 mrfree(&ip->i_iolock); 2713 freesema(&ip->i_flock); 2714#ifdef XFS_BMAP_TRACE 2715 ktrace_free(ip->i_xtrace); 2716#endif 2717#ifdef XFS_BMBT_TRACE 2718 ktrace_free(ip->i_btrace); 2719#endif 2720#ifdef XFS_RW_TRACE 2721 ktrace_free(ip->i_rwtrace); 2722#endif 2723#ifdef XFS_ILOCK_TRACE 2724 ktrace_free(ip->i_lock_trace); 2725#endif 2726#ifdef XFS_DIR2_TRACE 2727 ktrace_free(ip->i_dir_trace); 2728#endif 2729 if (ip->i_itemp) { 2730 /* XXXdpd should be able to assert this but shutdown 2731 * is leaving the AIL behind. */ 2732 ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) || 2733 XFS_FORCED_SHUTDOWN(ip->i_mount)); 2734 xfs_inode_item_destroy(ip); 2735 } 2736 kmem_zone_free(xfs_inode_zone, ip); 2737} 2738 2739 2740/* 2741 * Increment the pin count of the given buffer. 2742 * This value is protected by ipinlock spinlock in the mount structure. 2743 */ 2744void 2745xfs_ipin( 2746 xfs_inode_t *ip) 2747{ 2748 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); 2749 2750 atomic_inc(&ip->i_pincount); 2751} 2752 2753/* 2754 * Decrement the pin count of the given inode, and wake up 2755 * anyone in xfs_iwait_unpin() if the count goes to 0. The 2756 * inode must have been previoulsy pinned with a call to xfs_ipin(). 2757 */ 2758void 2759xfs_iunpin( 2760 xfs_inode_t *ip) 2761{ 2762 ASSERT(atomic_read(&ip->i_pincount) > 0); 2763 2764 if (atomic_dec_and_test(&ip->i_pincount)) { 2765#if XXXKAN 2766 /* 2767 * Should I mark FreeBSD vnode as dirty here? 2768 */ 2769 printf("%s:%d: Should I mark FreeBSD vnode as dirty here?\n", 2770 __FILE__, __LINE__); 2771 xfs_vnode_t *vp = XFS_ITOV_NULL(ip); 2772 2773 /* make sync come back and flush this inode */ 2774 if (vp) { 2775 struct inode *inode = LINVFS_GET_IP(vp); 2776 2777 if (!(inode->i_state & I_NEW)) 2778 mark_inode_dirty_sync(inode); 2779 } 2780#endif 2781 2782 wakeup(&ip->i_ipin_wait); 2783 } 2784} 2785 2786/* 2787 * This is called to wait for the given inode to be unpinned. 2788 * It will sleep until this happens. The caller must have the 2789 * inode locked in at least shared mode so that the buffer cannot 2790 * be subsequently pinned once someone is waiting for it to be 2791 * unpinned. 2792 */ 2793STATIC void 2794xfs_iunpin_wait( 2795 xfs_inode_t *ip) 2796{ 2797 xfs_inode_log_item_t *iip; 2798 xfs_lsn_t lsn; 2799 2800 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); 2801 2802 if (atomic_read(&ip->i_pincount) == 0) { 2803 return; 2804 } 2805 2806 iip = ip->i_itemp; 2807 if (iip && iip->ili_last_lsn) { 2808 lsn = iip->ili_last_lsn; 2809 } else { 2810 lsn = (xfs_lsn_t)0; 2811 } 2812 2813 /* 2814 * Give the log a push so we don't wait here too long. 2815 */ 2816 xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); 2817 2818 /* 2819 * XXXKAN: xfs_iunpin is not locking inode 2820 * at all? 2821 */ 2822 while(atomic_read(&ip->i_pincount) != 0) 2823 tsleep(&ip->i_ipin_wait, PRIBIO, "iunpin", 0); 2824} 2825 2826 2827/* 2828 * xfs_iextents_copy() 2829 * 2830 * This is called to copy the REAL extents (as opposed to the delayed 2831 * allocation extents) from the inode into the given buffer. It 2832 * returns the number of bytes copied into the buffer. 2833 * 2834 * If there are no delayed allocation extents, then we can just 2835 * memcpy() the extents into the buffer. Otherwise, we need to 2836 * examine each extent in turn and skip those which are delayed. 2837 */ 2838int 2839xfs_iextents_copy( 2840 xfs_inode_t *ip, 2841 xfs_bmbt_rec_t *buffer, 2842 int whichfork) 2843{ 2844 int copied; 2845 xfs_bmbt_rec_t *dest_ep; 2846 xfs_bmbt_rec_t *ep; 2847#ifdef XFS_BMAP_TRACE 2848 static char fname[] = "xfs_iextents_copy"; 2849#endif 2850 int i; 2851 xfs_ifork_t *ifp; 2852 int nrecs; 2853 xfs_fsblock_t start_block; 2854 2855 ifp = XFS_IFORK_PTR(ip, whichfork); 2856 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 2857 ASSERT(ifp->if_bytes > 0); 2858 2859 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2860 xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); 2861 ASSERT(nrecs > 0); 2862 2863 /* 2864 * There are some delayed allocation extents in the 2865 * inode, so copy the extents one at a time and skip 2866 * the delayed ones. There must be at least one 2867 * non-delayed extent. 2868 */ 2869 ep = ifp->if_u1.if_extents; 2870 dest_ep = buffer; 2871 copied = 0; 2872 for (i = 0; i < nrecs; i++) { 2873 start_block = xfs_bmbt_get_startblock(ep); 2874 if (ISNULLSTARTBLOCK(start_block)) { 2875 /* 2876 * It's a delayed allocation extent, so skip it. 2877 */ 2878 ep++; 2879 continue; 2880 } 2881 2882 /* Translate to on disk format */ 2883 put_unaligned(INT_GET(ep->l0, ARCH_CONVERT), 2884 (__uint64_t*)&dest_ep->l0); 2885 put_unaligned(INT_GET(ep->l1, ARCH_CONVERT), 2886 (__uint64_t*)&dest_ep->l1); 2887 dest_ep++; 2888 ep++; 2889 copied++; 2890 } 2891 ASSERT(copied != 0); 2892 xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip)); 2893 2894 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2895} 2896 2897/* 2898 * Each of the following cases stores data into the same region 2899 * of the on-disk inode, so only one of them can be valid at 2900 * any given time. While it is possible to have conflicting formats 2901 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2902 * in EXTENTS format, this can only happen when the fork has 2903 * changed formats after being modified but before being flushed. 2904 * In these cases, the format always takes precedence, because the 2905 * format indicates the current state of the fork. 2906 */ 2907/*ARGSUSED*/ 2908STATIC int 2909xfs_iflush_fork( 2910 xfs_inode_t *ip, 2911 xfs_dinode_t *dip, 2912 xfs_inode_log_item_t *iip, 2913 int whichfork, 2914 xfs_buf_t *bp) 2915{ 2916 char *cp; 2917 xfs_ifork_t *ifp; 2918 xfs_mount_t *mp; 2919#ifdef XFS_TRANS_DEBUG 2920 int first; 2921#endif 2922 static const short brootflag[2] = 2923 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2924 static const short dataflag[2] = 2925 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2926 static const short extflag[2] = 2927 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2928 2929 if (iip == NULL) 2930 return 0; 2931 ifp = XFS_IFORK_PTR(ip, whichfork); 2932 /* 2933 * This can happen if we gave up in iformat in an error path, 2934 * for the attribute fork. 2935 */ 2936 if (ifp == NULL) { 2937 ASSERT(whichfork == XFS_ATTR_FORK); 2938 return 0; 2939 } 2940 cp = XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT); 2941 mp = ip->i_mount; 2942 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2943 case XFS_DINODE_FMT_LOCAL: 2944 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && 2945 (ifp->if_bytes > 0)) { 2946 ASSERT(ifp->if_u1.if_data != NULL); 2947 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2948 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2949 } 2950 if (whichfork == XFS_DATA_FORK) { 2951 if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) { 2952 XFS_ERROR_REPORT("xfs_iflush_fork", 2953 XFS_ERRLEVEL_LOW, mp); 2954 return XFS_ERROR(EFSCORRUPTED); 2955 } 2956 } 2957 break; 2958 2959 case XFS_DINODE_FMT_EXTENTS: 2960 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2961 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2962 ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0)); 2963 ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0)); 2964 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2965 (ifp->if_bytes > 0)) { 2966 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2967 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2968 whichfork); 2969 } 2970 break; 2971 2972 case XFS_DINODE_FMT_BTREE: 2973 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && 2974 (ifp->if_broot_bytes > 0)) { 2975 ASSERT(ifp->if_broot != NULL); 2976 ASSERT(ifp->if_broot_bytes <= 2977 (XFS_IFORK_SIZE(ip, whichfork) + 2978 XFS_BROOT_SIZE_ADJ)); 2979 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, 2980 (xfs_bmdr_block_t *)cp, 2981 XFS_DFORK_SIZE_ARCH(dip, mp, whichfork, ARCH_CONVERT)); 2982 } 2983 break; 2984 2985 case XFS_DINODE_FMT_DEV: 2986 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2987 ASSERT(whichfork == XFS_DATA_FORK); 2988 INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev); 2989 } 2990 break; 2991 2992 case XFS_DINODE_FMT_UUID: 2993 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2994 ASSERT(whichfork == XFS_DATA_FORK); 2995 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, 2996 sizeof(uuid_t)); 2997 } 2998 break; 2999 3000 default: 3001 ASSERT(0); 3002 break; 3003 } 3004 3005 return 0; 3006} 3007 3008/* 3009 * xfs_iflush() will write a modified inode's changes out to the 3010 * inode's on disk home. The caller must have the inode lock held 3011 * in at least shared mode and the inode flush semaphore must be 3012 * held as well. The inode lock will still be held upon return from 3013 * the call and the caller is free to unlock it. 3014 * The inode flush lock will be unlocked when the inode reaches the disk. 3015 * The flags indicate how the inode's buffer should be written out. 3016 */ 3017int 3018xfs_iflush( 3019 xfs_inode_t *ip, 3020 uint flags) 3021{ 3022 xfs_inode_log_item_t *iip; 3023 xfs_buf_t *bp; 3024 xfs_dinode_t *dip; 3025 xfs_mount_t *mp; 3026 int error; 3027 /* REFERENCED */ 3028 xfs_chash_t *ch; 3029 xfs_inode_t *iq; 3030 int clcount; /* count of inodes clustered */ 3031 int bufwasdelwri; 3032 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; 3033 SPLDECL(s); 3034 3035 XFS_STATS_INC(xs_iflush_count); 3036 3037 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 3038 ASSERT(valusema(&ip->i_flock) <= 0); 3039 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3040 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3041 3042 iip = ip->i_itemp; 3043 mp = ip->i_mount; 3044 3045 /* 3046 * If the inode isn't dirty, then just release the inode 3047 * flush lock and do nothing. 3048 */ 3049 if ((ip->i_update_core == 0) && 3050 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3051 ASSERT((iip != NULL) ? 3052 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); 3053 xfs_ifunlock(ip); 3054 return 0; 3055 } 3056 3057 /* 3058 * We can't flush the inode until it is unpinned, so 3059 * wait for it. We know noone new can pin it, because 3060 * we are holding the inode lock shared and you need 3061 * to hold it exclusively to pin the inode. 3062 */ 3063 xfs_iunpin_wait(ip); 3064 3065 /* 3066 * This may have been unpinned because the filesystem is shutting 3067 * down forcibly. If that's the case we must not write this inode 3068 * to disk, because the log record didn't make it to disk! 3069 */ 3070 if (XFS_FORCED_SHUTDOWN(mp)) { 3071 ip->i_update_core = 0; 3072 if (iip) 3073 iip->ili_format.ilf_fields = 0; 3074 xfs_ifunlock(ip); 3075 return XFS_ERROR(EIO); 3076 } 3077 3078 /* 3079 * Get the buffer containing the on-disk inode. 3080 */ 3081 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0); 3082 if (error != 0) { 3083 xfs_ifunlock(ip); 3084 return error; 3085 } 3086 3087 /* 3088 * Decide how buffer will be flushed out. This is done before 3089 * the call to xfs_iflush_int because this field is zeroed by it. 3090 */ 3091 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3092 /* 3093 * Flush out the inode buffer according to the directions 3094 * of the caller. In the cases where the caller has given 3095 * us a choice choose the non-delwri case. This is because 3096 * the inode is in the AIL and we need to get it out soon. 3097 */ 3098 switch (flags) { 3099 case XFS_IFLUSH_SYNC: 3100 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3101 flags = 0; 3102 break; 3103 case XFS_IFLUSH_ASYNC: 3104 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3105 flags = INT_ASYNC; 3106 break; 3107 case XFS_IFLUSH_DELWRI: 3108 flags = INT_DELWRI; 3109 break; 3110 default: 3111 ASSERT(0); 3112 flags = 0; 3113 break; 3114 } 3115 } else { 3116 switch (flags) { 3117 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3118 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3119 case XFS_IFLUSH_DELWRI: 3120 flags = INT_DELWRI; 3121 break; 3122 case XFS_IFLUSH_ASYNC: 3123 flags = INT_ASYNC; 3124 break; 3125 case XFS_IFLUSH_SYNC: 3126 flags = 0; 3127 break; 3128 default: 3129 ASSERT(0); 3130 flags = 0; 3131 break; 3132 } 3133 } 3134 3135 /* 3136 * First flush out the inode that xfs_iflush was called with. 3137 */ 3138 error = xfs_iflush_int(ip, bp); 3139 if (error) { 3140 goto corrupt_out; 3141 } 3142 3143 /* 3144 * inode clustering: 3145 * see if other inodes can be gathered into this write 3146 */ 3147 3148 ip->i_chash->chl_buf = bp; 3149 3150 ch = XFS_CHASH(mp, ip->i_blkno); 3151 s = mutex_spinlock(&ch->ch_lock); 3152 3153 clcount = 0; 3154 for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) { 3155 /* 3156 * Do an un-protected check to see if the inode is dirty and 3157 * is a candidate for flushing. These checks will be repeated 3158 * later after the appropriate locks are acquired. 3159 */ 3160 iip = iq->i_itemp; 3161 if ((iq->i_update_core == 0) && 3162 ((iip == NULL) || 3163 !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && 3164 xfs_ipincount(iq) == 0) { 3165 continue; 3166 } 3167 3168 /* 3169 * Try to get locks. If any are unavailable, 3170 * then this inode cannot be flushed and is skipped. 3171 */ 3172 3173 /* get inode locks (just i_lock) */ 3174 if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { 3175 /* get inode flush lock */ 3176 if (xfs_iflock_nowait(iq)) { 3177 /* check if pinned */ 3178 if (xfs_ipincount(iq) == 0) { 3179 /* arriving here means that 3180 * this inode can be flushed. 3181 * first re-check that it's 3182 * dirty 3183 */ 3184 iip = iq->i_itemp; 3185 if ((iq->i_update_core != 0)|| 3186 ((iip != NULL) && 3187 (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3188 clcount++; 3189 error = xfs_iflush_int(iq, bp); 3190 if (error) { 3191 xfs_iunlock(iq, 3192 XFS_ILOCK_SHARED); 3193 goto cluster_corrupt_out; 3194 } 3195 } else { 3196 xfs_ifunlock(iq); 3197 } 3198 } else { 3199 xfs_ifunlock(iq); 3200 } 3201 } 3202 xfs_iunlock(iq, XFS_ILOCK_SHARED); 3203 } 3204 } 3205 mutex_spinunlock(&ch->ch_lock, s); 3206 3207 if (clcount) { 3208 XFS_STATS_INC(xs_icluster_flushcnt); 3209 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 3210 } 3211 3212 /* 3213 * If the buffer is pinned then push on the log so we won't 3214 * get stuck waiting in the write for too long. 3215 */ 3216 if (XFS_BUF_ISPINNED(bp)){ 3217 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 3218 } 3219 3220 if (flags & INT_DELWRI) { 3221 xfs_bdwrite(mp, bp); 3222 } else if (flags & INT_ASYNC) { 3223 xfs_bawrite(mp, bp); 3224 } else { 3225 error = xfs_bwrite(mp, bp); 3226 } 3227 return error; 3228 3229corrupt_out: 3230 xfs_buf_relse(bp); 3231 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); 3232 xfs_iflush_abort(ip); 3233 /* 3234 * Unlocks the flush lock 3235 */ 3236 return XFS_ERROR(EFSCORRUPTED); 3237 3238cluster_corrupt_out: 3239 /* Corruption detected in the clustering loop. Invalidate the 3240 * inode buffer and shut down the filesystem. 3241 */ 3242 mutex_spinunlock(&ch->ch_lock, s); 3243 3244 /* 3245 * Clean up the buffer. If it was B_DELWRI, just release it -- 3246 * brelse can handle it with no problems. If not, shut down the 3247 * filesystem before releasing the buffer. 3248 */ 3249 if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { 3250 xfs_buf_relse(bp); 3251 } 3252 3253 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); 3254 3255 if(!bufwasdelwri) { 3256 /* 3257 * Just like incore_relse: if we have b_iodone functions, 3258 * mark the buffer as an error and call them. Otherwise 3259 * mark it as stale and brelse. 3260 */ 3261 if (XFS_BUF_IODONE_FUNC(bp)) { 3262 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 3263 XFS_BUF_UNDONE(bp); 3264 XFS_BUF_STALE(bp); 3265 XFS_BUF_SHUT(bp); 3266 XFS_BUF_ERROR(bp,EIO); 3267 xfs_biodone(bp); 3268 } else { 3269 XFS_BUF_STALE(bp); 3270 xfs_buf_relse(bp); 3271 } 3272 } 3273 3274 xfs_iflush_abort(iq); 3275 /* 3276 * Unlocks the flush lock 3277 */ 3278 return XFS_ERROR(EFSCORRUPTED); 3279} 3280 3281 3282STATIC int 3283xfs_iflush_int( 3284 xfs_inode_t *ip, 3285 xfs_buf_t *bp) 3286{ 3287 xfs_inode_log_item_t *iip; 3288 xfs_dinode_t *dip; 3289 xfs_mount_t *mp; 3290#ifdef XFS_TRANS_DEBUG 3291 int first; 3292#endif 3293 SPLDECL(s); 3294 3295 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 3296 ASSERT(valusema(&ip->i_flock) <= 0); 3297 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3298 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3299 3300 iip = ip->i_itemp; 3301 mp = ip->i_mount; 3302 3303 3304 /* 3305 * If the inode isn't dirty, then just release the inode 3306 * flush lock and do nothing. 3307 */ 3308 if ((ip->i_update_core == 0) && 3309 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3310 xfs_ifunlock(ip); 3311 return 0; 3312 } 3313 3314 /* set *dip = inode's place in the buffer */ 3315 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3316 3317 /* 3318 * Clear i_update_core before copying out the data. 3319 * This is for coordination with our timestamp updates 3320 * that don't hold the inode lock. They will always 3321 * update the timestamps BEFORE setting i_update_core, 3322 * so if we clear i_update_core after they set it we 3323 * are guaranteed to see their updates to the timestamps. 3324 * I believe that this depends on strongly ordered memory 3325 * semantics, but we have that. We use the SYNCHRONIZE 3326 * macro to make sure that the compiler does not reorder 3327 * the i_update_core access below the data copy below. 3328 */ 3329 ip->i_update_core = 0; 3330 SYNCHRONIZE(); 3331 3332 if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, 3333 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3334 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3335 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3336 ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip); 3337 goto corrupt_out; 3338 } 3339 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3340 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 3341 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3342 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 3343 ip->i_ino, ip, ip->i_d.di_magic); 3344 goto corrupt_out; 3345 } 3346 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 3347 if (XFS_TEST_ERROR( 3348 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3349 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 3350 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 3351 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3352 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 3353 ip->i_ino, ip); 3354 goto corrupt_out; 3355 } 3356 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 3357 if (XFS_TEST_ERROR( 3358 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3359 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 3360 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 3361 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 3362 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3363 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 3364 ip->i_ino, ip); 3365 goto corrupt_out; 3366 } 3367 } 3368 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 3369 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 3370 XFS_RANDOM_IFLUSH_5)) { 3371 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3372 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 3373 ip->i_ino, 3374 ip->i_d.di_nextents + ip->i_d.di_anextents, 3375 ip->i_d.di_nblocks, 3376 ip); 3377 goto corrupt_out; 3378 } 3379 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 3380 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 3381 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3382 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 3383 ip->i_ino, ip->i_d.di_forkoff, ip); 3384 goto corrupt_out; 3385 } 3386 /* 3387 * bump the flush iteration count, used to detect flushes which 3388 * postdate a log record during recovery. 3389 */ 3390 3391 ip->i_d.di_flushiter++; 3392 3393 /* 3394 * Copy the dirty parts of the inode into the on-disk 3395 * inode. We always copy out the core of the inode, 3396 * because if the inode is dirty at all the core must 3397 * be. 3398 */ 3399 xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), 3400 -1, ARCH_CONVERT); 3401 3402 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3403 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3404 ip->i_d.di_flushiter = 0; 3405 3406 /* 3407 * If this is really an old format inode and the superblock version 3408 * has not been updated to support only new format inodes, then 3409 * convert back to the old inode format. If the superblock version 3410 * has been updated, then make the conversion permanent. 3411 */ 3412 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 3413 XFS_SB_VERSION_HASNLINK(&mp->m_sb)); 3414 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 3415 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { 3416 /* 3417 * Convert it back. 3418 */ 3419 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3420 INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink); 3421 } else { 3422 /* 3423 * The superblock version has already been bumped, 3424 * so just make the conversion to the new inode 3425 * format permanent. 3426 */ 3427 ip->i_d.di_version = XFS_DINODE_VERSION_2; 3428 INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2); 3429 ip->i_d.di_onlink = 0; 3430 INT_ZERO(dip->di_core.di_onlink, ARCH_CONVERT); 3431 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3432 memset(&(dip->di_core.di_pad[0]), 0, 3433 sizeof(dip->di_core.di_pad)); 3434 ASSERT(ip->i_d.di_projid == 0); 3435 } 3436 } 3437 3438 if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { 3439 goto corrupt_out; 3440 } 3441 3442 if (XFS_IFORK_Q(ip)) { 3443 /* 3444 * The only error from xfs_iflush_fork is on the data fork. 3445 */ 3446 (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 3447 } 3448 xfs_inobp_check(mp, bp); 3449 3450 /* 3451 * We've recorded everything logged in the inode, so we'd 3452 * like to clear the ilf_fields bits so we don't log and 3453 * flush things unnecessarily. However, we can't stop 3454 * logging all this information until the data we've copied 3455 * into the disk buffer is written to disk. If we did we might 3456 * overwrite the copy of the inode in the log with all the 3457 * data after re-logging only part of it, and in the face of 3458 * a crash we wouldn't have all the data we need to recover. 3459 * 3460 * What we do is move the bits to the ili_last_fields field. 3461 * When logging the inode, these bits are moved back to the 3462 * ilf_fields field. In the xfs_iflush_done() routine we 3463 * clear ili_last_fields, since we know that the information 3464 * those bits represent is permanently on disk. As long as 3465 * the flush completes before the inode is logged again, then 3466 * both ilf_fields and ili_last_fields will be cleared. 3467 * 3468 * We can play with the ilf_fields bits here, because the inode 3469 * lock must be held exclusively in order to set bits there 3470 * and the flush lock protects the ili_last_fields bits. 3471 * Set ili_logged so the flush done 3472 * routine can tell whether or not to look in the AIL. 3473 * Also, store the current LSN of the inode so that we can tell 3474 * whether the item has moved in the AIL from xfs_iflush_done(). 3475 * In order to read the lsn we need the AIL lock, because 3476 * it is a 64 bit value that cannot be read atomically. 3477 */ 3478 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3479 iip->ili_last_fields = iip->ili_format.ilf_fields; 3480 iip->ili_format.ilf_fields = 0; 3481 iip->ili_logged = 1; 3482 3483 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ 3484 AIL_LOCK(mp,s); 3485 iip->ili_flush_lsn = iip->ili_item.li_lsn; 3486 AIL_UNLOCK(mp, s); 3487 3488 /* 3489 * Attach the function xfs_iflush_done to the inode's 3490 * buffer. This will remove the inode from the AIL 3491 * and unlock the inode's flush lock when the inode is 3492 * completely written to disk. 3493 */ 3494 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) 3495 xfs_iflush_done, (xfs_log_item_t *)iip); 3496 3497 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3498 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3499 } else { 3500 /* 3501 * We're flushing an inode which is not in the AIL and has 3502 * not been logged but has i_update_core set. For this 3503 * case we can use a B_DELWRI flush and immediately drop 3504 * the inode flush lock because we can avoid the whole 3505 * AIL state thing. It's OK to drop the flush lock now, 3506 * because we've already locked the buffer and to do anything 3507 * you really need both. 3508 */ 3509 if (iip != NULL) { 3510 ASSERT(iip->ili_logged == 0); 3511 ASSERT(iip->ili_last_fields == 0); 3512 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 3513 } 3514 xfs_ifunlock(ip); 3515 } 3516 3517 return 0; 3518 3519corrupt_out: 3520 return XFS_ERROR(EFSCORRUPTED); 3521} 3522 3523/* 3524 * Flush all inactive inodes in mp. Return true if no user references 3525 * were found, false otherwise. 3526 */ 3527int 3528xfs_iflush_all( 3529 xfs_mount_t *mp, 3530 int flag) 3531{ 3532 int busy; 3533 int done; 3534 int purged; 3535 xfs_inode_t *ip; 3536 vmap_t vmap; 3537 xfs_vnode_t *vp; 3538 3539 busy = done = 0; 3540 while (!done) { 3541 purged = 0; 3542 XFS_MOUNT_ILOCK(mp); 3543 ip = mp->m_inodes; 3544 if (ip == NULL) { 3545 break; 3546 } 3547 do { 3548 /* Make sure we skip markers inserted by sync */ 3549 if (ip->i_mount == NULL) { 3550 ip = ip->i_mnext; 3551 continue; 3552 } 3553 3554 /* 3555 * It's up to our caller to purge the root 3556 * and quota vnodes later. 3557 */ 3558 vp = XFS_ITOV_NULL(ip); 3559 3560 if (!vp) { 3561 XFS_MOUNT_IUNLOCK(mp); 3562 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); 3563 purged = 1; 3564 break; 3565 } 3566 3567 if (vn_count(vp) != 0) { 3568 if (vn_count(vp) == 1 && 3569 (ip == mp->m_rootip || 3570 (mp->m_quotainfo && 3571 (ip->i_ino == mp->m_sb.sb_uquotino || 3572 ip->i_ino == mp->m_sb.sb_gquotino)))) { 3573 3574 ip = ip->i_mnext; 3575 continue; 3576 } 3577 if (!(flag & XFS_FLUSH_ALL)) { 3578 busy = 1; 3579 done = 1; 3580 break; 3581 } 3582 /* 3583 * Ignore busy inodes but continue flushing 3584 * others. 3585 */ 3586 ip = ip->i_mnext; 3587 continue; 3588 } 3589 /* 3590 * Sample vp mapping while holding mp locked on MP 3591 * systems, so we don't purge a reclaimed or 3592 * nonexistent vnode. We break from the loop 3593 * since we know that we modify 3594 * it by pulling ourselves from it in xfs_reclaim() 3595 * called via vn_purge() below. Set ip to the next 3596 * entry in the list anyway so we'll know below 3597 * whether we reached the end or not. 3598 */ 3599 VMAP(vp, vmap); 3600 XFS_MOUNT_IUNLOCK(mp); 3601 3602 vn_purge(vp, &vmap); 3603 3604 purged = 1; 3605 break; 3606 } while (ip != mp->m_inodes); 3607 /* 3608 * We need to distinguish between when we exit the loop 3609 * after a purge and when we simply hit the end of the 3610 * list. We can't use the (ip == mp->m_inodes) test, 3611 * because when we purge an inode at the start of the list 3612 * the next inode on the list becomes mp->m_inodes. That 3613 * would cause such a test to bail out early. The purged 3614 * variable tells us how we got out of the loop. 3615 */ 3616 if (!purged) { 3617 done = 1; 3618 } 3619 } 3620 XFS_MOUNT_IUNLOCK(mp); 3621 return !busy; 3622} 3623 3624 3625/* 3626 * xfs_iaccess: check accessibility of inode for mode. 3627 */ 3628int 3629xfs_iaccess( 3630 xfs_inode_t *ip, 3631 mode_t mode, 3632 cred_t *cr) 3633{ 3634 xfs_vnode_t *vp; 3635 int error; 3636 mode_t imode; 3637 3638 vp = XFS_ITOV(ip); 3639 imode = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type); 3640 3641 if (mode & S_IWUSR) { 3642 xfs_mount_t *mp = ip->i_mount; 3643 3644 if ((XVFSTOMNT(XFS_MTOVFS(mp))->mnt_flag & MNT_RDONLY) && 3645 (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode))) 3646 return XFS_ERROR(EROFS); 3647 3648#if XXXKAN 3649 if (IS_IMMUTABLE(inode)) 3650 return XFS_ERROR(EACCES); 3651#endif 3652 } 3653 3654 /* 3655 * If there's an Access Control List it's used instead of 3656 * the mode bits. 3657 */ 3658 if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1) 3659 return error ? XFS_ERROR(error) : 0; 3660 3661 3662 error = vaccess(vp->v_type, imode, ip->i_d.di_uid, ip->i_d.di_gid, 3663 mode, cr, NULL); 3664 3665 return (error); 3666} 3667 3668/* 3669 * xfs_iroundup: round up argument to next power of two 3670 */ 3671uint 3672xfs_iroundup( 3673 uint v) 3674{ 3675 int i; 3676 uint m; 3677 3678 if ((v & (v - 1)) == 0) 3679 return v; 3680 ASSERT((v & 0x80000000) == 0); 3681 if ((v & (v + 1)) == 0) 3682 return v + 1; 3683 for (i = 0, m = 1; i < 31; i++, m <<= 1) { 3684 if (v & m) 3685 continue; 3686 v |= m; 3687 if ((v & (v + 1)) == 0) 3688 return v + 1; 3689 } 3690 ASSERT(0); 3691 return( 0 ); 3692} 3693 3694/* 3695 * Change the requested timestamp in the given inode. 3696 * We don't lock across timestamp updates, and we don't log them but 3697 * we do record the fact that there is dirty information in core. 3698 * 3699 * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG 3700 * with XFS_ICHGTIME_ACC to be sure that access time 3701 * update will take. Calling first with XFS_ICHGTIME_ACC 3702 * and then XFS_ICHGTIME_MOD may fail to modify the access 3703 * timestamp if the filesystem is mounted noacctm. 3704 */ 3705void 3706xfs_ichgtime(xfs_inode_t *ip, 3707 int flags) 3708{ 3709 timespec_t tv; 3710 xfs_vnode_t *vp = XFS_ITOV(ip); 3711 /* 3712 * We're not supposed to change timestamps in readonly-mounted 3713 * filesystems. Throw it away if anyone asks us. 3714 */ 3715 if (unlikely(vp->v_vfsp->vfs_flag & VFS_RDONLY)) 3716 return; 3717 3718 /* 3719 * Don't update access timestamps on reads if mounted "noatime" 3720 * Throw it away if anyone asks us. 3721 */ 3722 if ((ip->i_mount->m_flags & XFS_MOUNT_NOATIME || 3723 (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)) && 3724 ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) 3725 == XFS_ICHGTIME_ACC)) 3726 return; 3727 3728 nanotime(&tv); 3729 if (flags & XFS_ICHGTIME_MOD) { 3730 VN_MTIMESET(vp, &tv); 3731 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 3732 } 3733 if (flags & XFS_ICHGTIME_ACC) { 3734 VN_ATIMESET(vp, &tv); 3735 ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; 3736 } 3737 if (flags & XFS_ICHGTIME_CHG) { 3738 VN_CTIMESET(vp, &tv); 3739 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; 3740 } 3741 3742 /* 3743 * We update the i_update_core field _after_ changing 3744 * the timestamps in order to coordinate properly with 3745 * xfs_iflush() so that we don't lose timestamp updates. 3746 * This keeps us from having to hold the inode lock 3747 * while doing this. We use the SYNCHRONIZE macro to 3748 * ensure that the compiler does not reorder the update 3749 * of i_update_core above the timestamp updates above. 3750 */ 3751 SYNCHRONIZE(); 3752 ip->i_update_core = 1; 3753#if XXXKAN 3754 if (!(inode->i_state & I_LOCK)) 3755 mark_inode_dirty_sync(inode); 3756 3757 printf("xfs_ichgtime mark vnode dirty\n"); 3758#endif 3759} 3760 3761#ifdef XFS_ILOCK_TRACE 3762ktrace_t *xfs_ilock_trace_buf; 3763 3764void 3765xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) 3766{ 3767 ktrace_enter(ip->i_lock_trace, 3768 (void *)ip, 3769 (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */ 3770 (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */ 3771 (void *)ra, /* caller of ilock */ 3772 (void *)(unsigned long)current_cpu(), 3773 (void *)(unsigned long)current_pid(), 3774 0,0,0,0,0,0,0,0,0,0); 3775} 3776#endif 3777