/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_imap.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_dir2_trace.h"
#include "xfs_quota.h"
#include "xfs_mac.h"
#include "xfs_acl.h"


/* Zone from which xfs_iformat() allocates the attribute fork (i_afp). */
kmem_zone_t *xfs_ifork_zone;
/* Zone from which xfs_iread() allocates in-core inodes. */
kmem_zone_t *xfs_inode_zone;
/* NOTE(review): not referenced in the visible part of this file. */
kmem_zone_t *xfs_chashlist_zone;

/*
 * Used in xfs_itruncate(). This is the maximum number of extents
 * freed from a file in a single transaction.
63 */ 64#define XFS_ITRUNC_MAX_EXTENTS 2 65 66STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 67STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); 68STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); 69STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); 70 71#ifdef DEBUG 72/* 73 * Make sure that the extents in the given memory buffer 74 * are valid. 75 */ 76STATIC void 77xfs_validate_extents( 78 xfs_ifork_t *ifp, 79 int nrecs, 80 int disk, 81 xfs_exntfmt_t fmt) 82{ 83 xfs_bmbt_rec_t *ep; 84 xfs_bmbt_irec_t irec; 85 xfs_bmbt_rec_t rec; 86 int i; 87 88 for (i = 0; i < nrecs; i++) { 89 ep = xfs_iext_get_ext(ifp, i); 90 rec.l0 = get_unaligned((__uint64_t*)&ep->l0); 91 rec.l1 = get_unaligned((__uint64_t*)&ep->l1); 92 if (disk) 93 xfs_bmbt_disk_get_all(&rec, &irec); 94 else 95 xfs_bmbt_get_all(&rec, &irec); 96 if (fmt == XFS_EXTFMT_NOSTATE) 97 ASSERT(irec.br_state == XFS_EXT_NORM); 98 } 99} 100#else /* DEBUG */ 101#define xfs_validate_extents(ifp, nrecs, disk, fmt) 102#endif /* DEBUG */ 103 104/* 105 * Check that none of the inode's in the buffer have a next 106 * unlinked field of 0. 107 */ 108#if defined(DEBUG) 109void 110xfs_inobp_check( 111 xfs_mount_t *mp, 112 xfs_buf_t *bp) 113{ 114 int i; 115 int j; 116 xfs_dinode_t *dip; 117 118 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 119 120 for (i = 0; i < j; i++) { 121 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 122 i * mp->m_sb.sb_inodesize); 123 if (!dip->di_next_unlinked) { 124 xfs_fs_cmn_err(CE_ALERT, mp, 125 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 126 bp); 127 ASSERT(dip->di_next_unlinked); 128 } 129 } 130} 131#endif 132 133/* 134 * This routine is called to map an inode number within a file 135 * system to the buffer containing the on-disk version of the 136 * inode. 
It returns a pointer to the buffer containing the 137 * on-disk inode in the bpp parameter, and in the dip parameter 138 * it returns a pointer to the on-disk inode within that buffer. 139 * 140 * If a non-zero error is returned, then the contents of bpp and 141 * dipp are undefined. 142 * 143 * Use xfs_imap() to determine the size and location of the 144 * buffer to read from disk. 145 */ 146STATIC int 147xfs_inotobp( 148 xfs_mount_t *mp, 149 xfs_trans_t *tp, 150 xfs_ino_t ino, 151 xfs_dinode_t **dipp, 152 xfs_buf_t **bpp, 153 int *offset) 154{ 155 int di_ok; 156 xfs_imap_t imap; 157 xfs_buf_t *bp; 158 int error; 159 xfs_dinode_t *dip; 160 161 /* 162 * Call the space management code to find the location of the 163 * inode on disk. 164 */ 165 imap.im_blkno = 0; 166 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); 167 if (error != 0) { 168 cmn_err(CE_WARN, 169 "xfs_inotobp: xfs_imap() returned an " 170 "error %d on %s. Returning error.", error, mp->m_fsname); 171 return error; 172 } 173 174 /* 175 * If the inode number maps to a block outside the bounds of the 176 * file system then return NULL rather than calling read_buf 177 * and panicing when we get an error from the driver. 178 */ 179 if ((imap.im_blkno + imap.im_len) > 180 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 181 cmn_err(CE_WARN, 182 "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds " 183 "of the file system %s. Returning EINVAL.", 184 (unsigned long long)imap.im_blkno, 185 imap.im_len, mp->m_fsname); 186 return XFS_ERROR(EINVAL); 187 } 188 189 /* 190 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will 191 * default to just a read_buf() call. 192 */ 193 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, 194 (int)imap.im_len, XFS_BUF_LOCK, &bp); 195 196 if (error) { 197 cmn_err(CE_WARN, 198 "xfs_inotobp: xfs_trans_read_buf() returned an " 199 "error %d on %s. 
Returning error.", error, mp->m_fsname); 200 return error; 201 } 202 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); 203 di_ok = 204 INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && 205 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); 206 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, 207 XFS_RANDOM_ITOBP_INOTOBP))) { 208 XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); 209 xfs_trans_brelse(tp, bp); 210 cmn_err(CE_WARN, 211 "xfs_inotobp: XFS_TEST_ERROR() returned an " 212 "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); 213 return XFS_ERROR(EFSCORRUPTED); 214 } 215 216 xfs_inobp_check(mp, bp); 217 218 /* 219 * Set *dipp to point to the on-disk inode in the buffer. 220 */ 221 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 222 *bpp = bp; 223 *offset = imap.im_boffset; 224 return 0; 225} 226 227 228/* 229 * This routine is called to map an inode to the buffer containing 230 * the on-disk version of the inode. It returns a pointer to the 231 * buffer containing the on-disk inode in the bpp parameter, and in 232 * the dip parameter it returns a pointer to the on-disk inode within 233 * that buffer. 234 * 235 * If a non-zero error is returned, then the contents of bpp and 236 * dipp are undefined. 237 * 238 * If the inode is new and has not yet been initialized, use xfs_imap() 239 * to determine the size and location of the buffer to read from disk. 240 * If the inode has already been mapped to its buffer and read in once, 241 * then use the mapping information stored in the inode rather than 242 * calling xfs_imap(). This allows us to avoid the overhead of looking 243 * at the inode btree for small block file systems (see xfs_dilocate()). 244 * We can tell whether the inode has been mapped in before by comparing 245 * its disk block address to 0. Only uninitialized inodes will have 246 * 0 for the disk block address. 
247 */ 248int 249xfs_itobp( 250 xfs_mount_t *mp, 251 xfs_trans_t *tp, 252 xfs_inode_t *ip, 253 xfs_dinode_t **dipp, 254 xfs_buf_t **bpp, 255 xfs_daddr_t bno, 256 uint imap_flags) 257{ 258 xfs_buf_t *bp; 259 int error; 260 xfs_imap_t imap; 261#ifdef __KERNEL__ 262 int i; 263 int ni; 264#endif 265 266 if (ip->i_blkno == (xfs_daddr_t)0) { 267 /* 268 * Call the space management code to find the location of the 269 * inode on disk. 270 */ 271 imap.im_blkno = bno; 272 if ((error = xfs_imap(mp, tp, ip->i_ino, &imap, 273 XFS_IMAP_LOOKUP | imap_flags))) 274 return error; 275 276 /* 277 * If the inode number maps to a block outside the bounds 278 * of the file system then return NULL rather than calling 279 * read_buf and panicing when we get an error from the 280 * driver. 281 */ 282 if ((imap.im_blkno + imap.im_len) > 283 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 284#ifdef DEBUG 285 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " 286 "(imap.im_blkno (0x%llx) " 287 "+ imap.im_len (0x%llx)) > " 288 " XFS_FSB_TO_BB(mp, " 289 "mp->m_sb.sb_dblocks) (0x%llx)", 290 (unsigned long long) imap.im_blkno, 291 (unsigned long long) imap.im_len, 292 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 293#endif /* DEBUG */ 294 return XFS_ERROR(EINVAL); 295 } 296 297 /* 298 * Fill in the fields in the inode that will be used to 299 * map the inode to its buffer from now on. 300 */ 301 ip->i_blkno = imap.im_blkno; 302 ip->i_len = imap.im_len; 303 ip->i_boffset = imap.im_boffset; 304 } else { 305 /* 306 * We've already mapped the inode once, so just use the 307 * mapping that we saved the first time. 308 */ 309 imap.im_blkno = ip->i_blkno; 310 imap.im_len = ip->i_len; 311 imap.im_boffset = ip->i_boffset; 312 } 313 ASSERT(bno == 0 || bno == imap.im_blkno); 314 315 /* 316 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will 317 * default to just a read_buf() call. 
318 */ 319 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, 320 (int)imap.im_len, XFS_BUF_LOCK, &bp); 321 322 if (error) { 323#ifdef DEBUG 324 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " 325 "xfs_trans_read_buf() returned error %d, " 326 "imap.im_blkno 0x%llx, imap.im_len 0x%llx", 327 error, (unsigned long long) imap.im_blkno, 328 (unsigned long long) imap.im_len); 329#endif /* DEBUG */ 330 return error; 331 } 332#ifdef __KERNEL__ 333 /* 334 * Validate the magic number and version of every inode in the buffer 335 * (if DEBUG kernel) or the first inode in the buffer, otherwise. 336 */ 337#ifdef DEBUG 338 ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 339 (BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog); 340#else 341 ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1; 342#endif 343 for (i = 0; i < ni; i++) { 344 int di_ok; 345 xfs_dinode_t *dip; 346 347 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 348 (i << mp->m_sb.sb_inodelog)); 349 di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && 350 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); 351 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, 352 XFS_RANDOM_ITOBP_INOTOBP))) { 353#ifdef DEBUG 354 prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)", 355 mp->m_ddev_targp, 356 (unsigned long long)imap.im_blkno, i, 357 INT_GET(dip->di_core.di_magic, ARCH_CONVERT)); 358#endif 359 XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, 360 mp, dip); 361 xfs_trans_brelse(tp, bp); 362 return XFS_ERROR(EFSCORRUPTED); 363 } 364 } 365#endif /* __KERNEL__ */ 366 367 xfs_inobp_check(mp, bp); 368 369 /* 370 * Mark the buffer as an inode buffer now that it looks good 371 */ 372 XFS_BUF_SET_VTYPE(bp, B_FS_INO); 373 374 /* 375 * Set *dipp to point to the on-disk inode in the buffer. 
376 */ 377 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 378 *bpp = bp; 379 return 0; 380} 381 382/* 383 * Move inode type and inode format specific information from the 384 * on-disk inode to the in-core inode. For fifos, devs, and sockets 385 * this means set if_rdev to the proper value. For files, directories, 386 * and symlinks this means to bring in the in-line data or extent 387 * pointers. For a file in B-tree format, only the root is immediately 388 * brought in-core. The rest will be in-lined in if_extents when it 389 * is first referenced (see xfs_iread_extents()). 390 */ 391STATIC int 392xfs_iformat( 393 xfs_inode_t *ip, 394 xfs_dinode_t *dip) 395{ 396 xfs_attr_shortform_t *atp; 397 int size; 398 int error; 399 xfs_fsize_t di_size; 400 ip->i_df.if_ext_max = 401 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 402 error = 0; 403 404 if (unlikely( 405 INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + 406 INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > 407 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { 408 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 409 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 410 (unsigned long long)ip->i_ino, 411 (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) 412 + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), 413 (unsigned long long) 414 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT)); 415 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, 416 ip->i_mount, dip); 417 return XFS_ERROR(EFSCORRUPTED); 418 } 419 420 if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { 421 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 422 "corrupt dinode %Lu, forkoff = 0x%x.", 423 (unsigned long long)ip->i_ino, 424 (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); 425 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 426 ip->i_mount, dip); 427 return XFS_ERROR(EFSCORRUPTED); 428 } 429 430 switch (ip->i_d.di_mode & S_IFMT) { 431 case S_IFIFO: 432 
case S_IFCHR: 433 case S_IFBLK: 434 case S_IFSOCK: 435 if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) { 436 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, 437 ip->i_mount, dip); 438 return XFS_ERROR(EFSCORRUPTED); 439 } 440 ip->i_d.di_size = 0; 441 ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); 442 break; 443 444 case S_IFREG: 445 case S_IFLNK: 446 case S_IFDIR: 447 switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) { 448 case XFS_DINODE_FMT_LOCAL: 449 /* 450 * no local regular files yet 451 */ 452 if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { 453 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 454 "corrupt inode %Lu " 455 "(local format for regular file).", 456 (unsigned long long) ip->i_ino); 457 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 458 XFS_ERRLEVEL_LOW, 459 ip->i_mount, dip); 460 return XFS_ERROR(EFSCORRUPTED); 461 } 462 463 di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); 464 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 465 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 466 "corrupt inode %Lu " 467 "(bad size %Ld for local inode).", 468 (unsigned long long) ip->i_ino, 469 (long long) di_size); 470 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 471 XFS_ERRLEVEL_LOW, 472 ip->i_mount, dip); 473 return XFS_ERROR(EFSCORRUPTED); 474 } 475 476 size = (int)di_size; 477 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); 478 break; 479 case XFS_DINODE_FMT_EXTENTS: 480 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); 481 break; 482 case XFS_DINODE_FMT_BTREE: 483 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); 484 break; 485 default: 486 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, 487 ip->i_mount); 488 return XFS_ERROR(EFSCORRUPTED); 489 } 490 break; 491 492 default: 493 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); 494 return XFS_ERROR(EFSCORRUPTED); 495 } 496 if (error) { 497 return error; 498 } 499 if 
(!XFS_DFORK_Q(dip)) 500 return 0; 501 ASSERT(ip->i_afp == NULL); 502 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 503 ip->i_afp->if_ext_max = 504 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 505 switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) { 506 case XFS_DINODE_FMT_LOCAL: 507 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 508 size = be16_to_cpu(atp->hdr.totsize); 509 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 510 break; 511 case XFS_DINODE_FMT_EXTENTS: 512 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); 513 break; 514 case XFS_DINODE_FMT_BTREE: 515 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); 516 break; 517 default: 518 error = XFS_ERROR(EFSCORRUPTED); 519 break; 520 } 521 if (error) { 522 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 523 ip->i_afp = NULL; 524 xfs_idestroy_fork(ip, XFS_DATA_FORK); 525 } 526 return error; 527} 528 529/* 530 * The file is in-lined in the on-disk inode. 531 * If it fits into if_inline_data, then copy 532 * it there, otherwise allocate a buffer for it 533 * and copy the data there. Either way, set 534 * if_data to point at the data. 535 * If we allocate a buffer for the data, make 536 * sure that its size is a multiple of 4 and 537 * record the real size in i_real_bytes. 538 */ 539STATIC int 540xfs_iformat_local( 541 xfs_inode_t *ip, 542 xfs_dinode_t *dip, 543 int whichfork, 544 int size) 545{ 546 xfs_ifork_t *ifp; 547 int real_size; 548 549 /* 550 * If the size is unreasonable, then something 551 * is wrong and we just bail out rather than crash in 552 * kmem_alloc() or memcpy() below. 
553 */ 554 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 555 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 556 "corrupt inode %Lu " 557 "(bad size %d for local fork, size = %d).", 558 (unsigned long long) ip->i_ino, size, 559 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 560 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 561 ip->i_mount, dip); 562 return XFS_ERROR(EFSCORRUPTED); 563 } 564 ifp = XFS_IFORK_PTR(ip, whichfork); 565 real_size = 0; 566 if (size == 0) 567 ifp->if_u1.if_data = NULL; 568 else if (size <= sizeof(ifp->if_u2.if_inline_data)) 569 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 570 else { 571 real_size = roundup(size, 4); 572 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 573 } 574 ifp->if_bytes = size; 575 ifp->if_real_bytes = real_size; 576 if (size) 577 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); 578 ifp->if_flags &= ~XFS_IFEXTENTS; 579 ifp->if_flags |= XFS_IFINLINE; 580 return 0; 581} 582 583/* 584 * The file consists of a set of extents all 585 * of which fit into the on-disk inode. 586 * If there are few enough extents to fit into 587 * the if_inline_ext, then copy them there. 588 * Otherwise allocate a buffer for them and copy 589 * them into it. Either way, set if_extents 590 * to point at the extents. 591 */ 592STATIC int 593xfs_iformat_extents( 594 xfs_inode_t *ip, 595 xfs_dinode_t *dip, 596 int whichfork) 597{ 598 xfs_bmbt_rec_t *ep, *dp; 599 xfs_ifork_t *ifp; 600 int nex; 601 int size; 602 int i; 603 604 ifp = XFS_IFORK_PTR(ip, whichfork); 605 nex = XFS_DFORK_NEXTENTS(dip, whichfork); 606 size = nex * (uint)sizeof(xfs_bmbt_rec_t); 607 608 /* 609 * If the number of extents is unreasonable, then something 610 * is wrong and we just bail out rather than crash in 611 * kmem_alloc() or memcpy() below. 
612 */ 613 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 614 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 615 "corrupt inode %Lu ((a)extents = %d).", 616 (unsigned long long) ip->i_ino, nex); 617 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 618 ip->i_mount, dip); 619 return XFS_ERROR(EFSCORRUPTED); 620 } 621 622 ifp->if_real_bytes = 0; 623 if (nex == 0) 624 ifp->if_u1.if_extents = NULL; 625 else if (nex <= XFS_INLINE_EXTS) 626 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 627 else 628 xfs_iext_add(ifp, 0, nex); 629 630 ifp->if_bytes = size; 631 if (size) { 632 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); 633 xfs_validate_extents(ifp, nex, 1, XFS_EXTFMT_INODE(ip)); 634 for (i = 0; i < nex; i++, dp++) { 635 ep = xfs_iext_get_ext(ifp, i); 636 ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0), 637 ARCH_CONVERT); 638 ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1), 639 ARCH_CONVERT); 640 } 641 xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex, 642 whichfork); 643 if (whichfork != XFS_DATA_FORK || 644 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) 645 if (unlikely(xfs_check_nostate_extents( 646 ifp, 0, nex))) { 647 XFS_ERROR_REPORT("xfs_iformat_extents(2)", 648 XFS_ERRLEVEL_LOW, 649 ip->i_mount); 650 return XFS_ERROR(EFSCORRUPTED); 651 } 652 } 653 ifp->if_flags |= XFS_IFEXTENTS; 654 return 0; 655} 656 657/* 658 * The file has too many extents to fit into 659 * the inode, so they are in B-tree format. 660 * Allocate a buffer for the root of the B-tree 661 * and copy the root into it. The i_extents 662 * field will remain NULL until all of the 663 * extents are read in (when they are needed). 
664 */ 665STATIC int 666xfs_iformat_btree( 667 xfs_inode_t *ip, 668 xfs_dinode_t *dip, 669 int whichfork) 670{ 671 xfs_bmdr_block_t *dfp; 672 xfs_ifork_t *ifp; 673 /* REFERENCED */ 674 int nrecs; 675 int size; 676 677 ifp = XFS_IFORK_PTR(ip, whichfork); 678 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); 679 size = XFS_BMAP_BROOT_SPACE(dfp); 680 nrecs = XFS_BMAP_BROOT_NUMRECS(dfp); 681 682 /* 683 * blow out if -- fork has less extents than can fit in 684 * fork (fork shouldn't be a btree format), root btree 685 * block has more records than can fit into the fork, 686 * or the number of extents is greater than the number of 687 * blocks. 688 */ 689 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max 690 || XFS_BMDR_SPACE_CALC(nrecs) > 691 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 692 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 693 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 694 "corrupt inode %Lu (btree).", 695 (unsigned long long) ip->i_ino); 696 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 697 ip->i_mount); 698 return XFS_ERROR(EFSCORRUPTED); 699 } 700 701 ifp->if_broot_bytes = size; 702 ifp->if_broot = kmem_alloc(size, KM_SLEEP); 703 ASSERT(ifp->if_broot != NULL); 704 /* 705 * Copy and convert from the on-disk structure 706 * to the in-memory structure. 
707 */ 708 xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), 709 ifp->if_broot, size); 710 ifp->if_flags &= ~XFS_IFEXTENTS; 711 ifp->if_flags |= XFS_IFBROOT; 712 713 return 0; 714} 715 716/* 717 * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk 718 * and native format 719 * 720 * buf = on-disk representation 721 * dip = native representation 722 * dir = direction - +ve -> disk to native 723 * -ve -> native to disk 724 */ 725void 726xfs_xlate_dinode_core( 727 xfs_caddr_t buf, 728 xfs_dinode_core_t *dip, 729 int dir) 730{ 731 xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf; 732 xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip; 733 xfs_arch_t arch = ARCH_CONVERT; 734 735 ASSERT(dir); 736 737 INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch); 738 INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch); 739 INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch); 740 INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch); 741 INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch); 742 INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch); 743 INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch); 744 INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch); 745 INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch); 746 747 if (dir > 0) { 748 memcpy(mem_core->di_pad, buf_core->di_pad, 749 sizeof(buf_core->di_pad)); 750 } else { 751 memcpy(buf_core->di_pad, mem_core->di_pad, 752 sizeof(buf_core->di_pad)); 753 } 754 755 INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch); 756 757 INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec, 758 dir, arch); 759 INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec, 760 dir, arch); 761 INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec, 762 dir, arch); 763 INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec, 764 dir, arch); 765 INT_XLATE(buf_core->di_ctime.t_sec, 
mem_core->di_ctime.t_sec, 766 dir, arch); 767 INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec, 768 dir, arch); 769 INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch); 770 INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch); 771 INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch); 772 INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch); 773 INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch); 774 INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch); 775 INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch); 776 INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch); 777 INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch); 778 INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch); 779 INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch); 780} 781 782STATIC uint 783_xfs_dic2xflags( 784 xfs_dinode_core_t *dic, 785 __uint16_t di_flags) 786{ 787 uint flags = 0; 788 789 if (di_flags & XFS_DIFLAG_ANY) { 790 if (di_flags & XFS_DIFLAG_REALTIME) 791 flags |= XFS_XFLAG_REALTIME; 792 if (di_flags & XFS_DIFLAG_PREALLOC) 793 flags |= XFS_XFLAG_PREALLOC; 794 if (di_flags & XFS_DIFLAG_IMMUTABLE) 795 flags |= XFS_XFLAG_IMMUTABLE; 796 if (di_flags & XFS_DIFLAG_APPEND) 797 flags |= XFS_XFLAG_APPEND; 798 if (di_flags & XFS_DIFLAG_SYNC) 799 flags |= XFS_XFLAG_SYNC; 800 if (di_flags & XFS_DIFLAG_NOATIME) 801 flags |= XFS_XFLAG_NOATIME; 802 if (di_flags & XFS_DIFLAG_NODUMP) 803 flags |= XFS_XFLAG_NODUMP; 804 if (di_flags & XFS_DIFLAG_RTINHERIT) 805 flags |= XFS_XFLAG_RTINHERIT; 806 if (di_flags & XFS_DIFLAG_PROJINHERIT) 807 flags |= XFS_XFLAG_PROJINHERIT; 808 if (di_flags & XFS_DIFLAG_NOSYMLINKS) 809 flags |= XFS_XFLAG_NOSYMLINKS; 810 if (di_flags & XFS_DIFLAG_EXTSIZE) 811 flags |= XFS_XFLAG_EXTSIZE; 812 if (di_flags & XFS_DIFLAG_EXTSZINHERIT) 813 flags |= XFS_XFLAG_EXTSZINHERIT; 814 } 815 816 return flags; 817} 818 819uint 820xfs_ip2xflags( 821 
xfs_inode_t *ip) 822{ 823 xfs_dinode_core_t *dic = &ip->i_d; 824 825 return _xfs_dic2xflags(dic, dic->di_flags) | 826 (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0); 827} 828 829uint 830xfs_dic2xflags( 831 xfs_dinode_core_t *dic) 832{ 833 return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) | 834 (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0); 835} 836 837/* 838 * Given a mount structure and an inode number, return a pointer 839 * to a newly allocated in-core inode corresponding to the given 840 * inode number. 841 * 842 * Initialize the inode's attributes and extent pointers if it 843 * already has them (it will not if the inode has no links). 844 */ 845int 846xfs_iread( 847 xfs_mount_t *mp, 848 xfs_trans_t *tp, 849 xfs_ino_t ino, 850 xfs_inode_t **ipp, 851 xfs_daddr_t bno) 852{ 853 xfs_buf_t *bp; 854 xfs_dinode_t *dip; 855 xfs_inode_t *ip; 856 int error; 857 858 ASSERT(xfs_inode_zone != NULL); 859 860 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); 861 ip->i_ino = ino; 862 ip->i_mount = mp; 863 864 /* 865 * Get pointer's to the on-disk inode and the buffer containing it. 866 * If the inode number refers to a block outside the file system 867 * then xfs_itobp() will return NULL. In this case we should 868 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will 869 * know that this is a new incore inode. 870 */ 871 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, 0); 872 if (error) { 873 kmem_zone_free(xfs_inode_zone, ip); 874 return error; 875 } 876 877 /* 878 * Initialize inode's trace buffers. 879 * Do this before xfs_iformat in case it adds entries. 
880 */ 881#ifdef XFS_BMAP_TRACE 882 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); 883#endif 884#ifdef XFS_BMBT_TRACE 885 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); 886#endif 887#ifdef XFS_RW_TRACE 888 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); 889#endif 890#ifdef XFS_ILOCK_TRACE 891 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); 892#endif 893#ifdef XFS_DIR2_TRACE 894 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); 895#endif 896 897 /* 898 * If we got something that isn't an inode it means someone 899 * (nfs or dmi) has a stale handle. 900 */ 901 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) { 902 kmem_zone_free(xfs_inode_zone, ip); 903 xfs_trans_brelse(tp, bp); 904#ifdef DEBUG 905 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 906 "dip->di_core.di_magic (0x%x) != " 907 "XFS_DINODE_MAGIC (0x%x)", 908 INT_GET(dip->di_core.di_magic, ARCH_CONVERT), 909 XFS_DINODE_MAGIC); 910#endif /* DEBUG */ 911 return XFS_ERROR(EINVAL); 912 } 913 914 /* 915 * If the on-disk inode is already linked to a directory 916 * entry, copy all of the inode into the in-core inode. 917 * xfs_iformat() handles copying in the inode format 918 * specific information. 919 * Otherwise, just get the truly permanent information. 
920 */ 921 if (dip->di_core.di_mode) { 922 xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, 923 &(ip->i_d), 1); 924 error = xfs_iformat(ip, dip); 925 if (error) { 926 kmem_zone_free(xfs_inode_zone, ip); 927 xfs_trans_brelse(tp, bp); 928#ifdef DEBUG 929 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 930 "xfs_iformat() returned error %d", 931 error); 932#endif /* DEBUG */ 933 return error; 934 } 935 } else { 936 ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT); 937 ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT); 938 ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT); 939 ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT); 940 /* 941 * Make sure to pull in the mode here as well in 942 * case the inode is released without being used. 943 * This ensures that xfs_inactive() will see that 944 * the inode is already free and not try to mess 945 * with the uninitialized part of it. 946 */ 947 ip->i_d.di_mode = 0; 948 /* 949 * Initialize the per-fork minima and maxima for a new 950 * inode here. xfs_iformat will do it for old inodes. 951 */ 952 ip->i_df.if_ext_max = 953 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 954 } 955 956#ifdef RMC 957 INIT_LIST_HEAD(&ip->i_reclaim); 958#else 959 bzero(&ip->i_reclaim,sizeof(ip->i_reclaim)); 960#endif 961 962 963 /* 964 * The inode format changed when we moved the link count and 965 * made it 32 bits long. If this is an old format inode, 966 * convert it in memory to look like a new one. If it gets 967 * flushed to disk we will convert back before flushing or 968 * logging it. We zero out the new projid field and the old link 969 * count field. We'll handle clearing the pad field (the remains 970 * of the old uuid field) when we actually convert the inode to 971 * the new format. We don't change the version number so that we 972 * can distinguish this from a real new format inode. 
973 */ 974 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 975 ip->i_d.di_nlink = ip->i_d.di_onlink; 976 ip->i_d.di_onlink = 0; 977 ip->i_d.di_projid = 0; 978 } 979 980 ip->i_delayed_blks = 0; 981 982 /* 983 * Mark the buffer containing the inode as something to keep 984 * around for a while. This helps to keep recently accessed 985 * meta-data in-core longer. 986 */ 987 XFS_BUF_SET_REF(bp, XFS_INO_REF); 988 989 /* 990 * Use xfs_trans_brelse() to release the buffer containing the 991 * on-disk inode, because it was acquired with xfs_trans_read_buf() 992 * in xfs_itobp() above. If tp is NULL, this is just a normal 993 * brelse(). If we're within a transaction, then xfs_trans_brelse() 994 * will only release the buffer if it is not dirty within the 995 * transaction. It will be OK to release the buffer in this case, 996 * because inodes on disk are never destroyed and we will be 997 * locking the new in-core inode before putting it in the hash 998 * table where other processes can find it. Thus we don't have 999 * to worry about the inode being changed just because we released 1000 * the buffer. 1001 */ 1002 xfs_trans_brelse(tp, bp); 1003 *ipp = ip; 1004 return 0; 1005} 1006 1007/* 1008 * Read in extents from a btree-format inode. 1009 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. 
1010 */ 1011int 1012xfs_iread_extents( 1013 xfs_trans_t *tp, 1014 xfs_inode_t *ip, 1015 int whichfork) 1016{ 1017 int error; 1018 xfs_ifork_t *ifp; 1019 xfs_extnum_t nextents; 1020 size_t size; 1021 1022 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 1023 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 1024 ip->i_mount); 1025 return XFS_ERROR(EFSCORRUPTED); 1026 } 1027 nextents = XFS_IFORK_NEXTENTS(ip, whichfork); 1028 size = nextents * sizeof(xfs_bmbt_rec_t); 1029 ifp = XFS_IFORK_PTR(ip, whichfork); 1030 1031 /* 1032 * We know that the size is valid (it's checked in iformat_btree) 1033 */ 1034 ifp->if_lastex = NULLEXTNUM; 1035 ifp->if_bytes = ifp->if_real_bytes = 0; 1036 ifp->if_flags |= XFS_IFEXTENTS; 1037 xfs_iext_add(ifp, 0, nextents); 1038 error = xfs_bmap_read_extents(tp, ip, whichfork); 1039 if (error) { 1040 xfs_iext_destroy(ifp); 1041 ifp->if_flags &= ~XFS_IFEXTENTS; 1042 return error; 1043 } 1044 xfs_validate_extents(ifp, nextents, 0, XFS_EXTFMT_INODE(ip)); 1045 return 0; 1046} 1047 1048/* 1049 * Allocate an inode on disk and return a copy of its in-core version. 1050 * The in-core inode is locked exclusively. Set mode, nlink, and rdev 1051 * appropriately within the inode. The uid and gid for the inode are 1052 * set according to the contents of the given cred structure. 1053 * 1054 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1055 * has a free inode available, call xfs_iget() 1056 * to obtain the in-core version of the allocated inode. Finally, 1057 * fill in the inode and log its initial contents. In this case, 1058 * ialloc_context would be set to NULL and call_again set to false. 1059 * 1060 * If xfs_dialloc() does not have an available inode, 1061 * it will replenish its supply by doing an allocation. Since we can 1062 * only do one allocation within a transaction without deadlocks, we 1063 * must commit the current transaction before returning the inode itself. 
1064 * In this case, therefore, we will set call_again to true and return. 1065 * The caller should then commit the current transaction, start a new 1066 * transaction, and call xfs_ialloc() again to actually get the inode. 1067 * 1068 * To ensure that some other process does not grab the inode that 1069 * was allocated during the first call to xfs_ialloc(), this routine 1070 * also returns the [locked] bp pointing to the head of the freelist 1071 * as ialloc_context. The caller should hold this buffer across 1072 * the commit and pass it back into this routine on the second call. 1073 */ 1074int 1075xfs_ialloc( 1076 xfs_trans_t *tp, 1077 xfs_inode_t *pip, 1078 mode_t mode, 1079 xfs_nlink_t nlink, 1080 xfs_dev_t rdev, 1081 cred_t *cr, 1082 xfs_prid_t prid, 1083 int okalloc, 1084 xfs_buf_t **ialloc_context, 1085 boolean_t *call_again, 1086 xfs_inode_t **ipp) 1087{ 1088 xfs_ino_t ino; 1089 xfs_inode_t *ip; 1090 xfs_vnode_t *vp; 1091 uint flags; 1092 int error; 1093 1094 /* 1095 * Call the space management code to pick 1096 * the on-disk inode to be allocated. 1097 */ 1098 error = xfs_dialloc(tp, pip->i_ino, mode, okalloc, 1099 ialloc_context, call_again, &ino); 1100 if (error != 0) { 1101 return error; 1102 } 1103 if (*call_again || ino == NULLFSINO) { 1104 *ipp = NULL; 1105 return 0; 1106 } 1107 ASSERT(*ialloc_context == NULL); 1108 1109 /* 1110 * Get the in-core inode with the lock held exclusively. 1111 * This is because we're setting fields here we need 1112 * to prevent others from looking at until we're done. 
1113 */ 1114 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1115 IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1116 if (error != 0) { 1117 return error; 1118 } 1119 ASSERT(ip != NULL); 1120 1121 vp = XFS_ITOV(ip); 1122 ip->i_d.di_mode = (__uint16_t)mode; 1123 ip->i_d.di_onlink = 0; 1124 ip->i_d.di_nlink = nlink; 1125 ASSERT(ip->i_d.di_nlink == nlink); 1126 ip->i_d.di_uid = curthread->td_ucred->cr_uid; 1127 ip->i_d.di_gid = curthread->td_ucred->cr_groups[0]; 1128 ip->i_d.di_projid = prid; 1129 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1130 1131 /* 1132 * If the superblock version is up to where we support new format 1133 * inodes and this is currently an old format inode, then change 1134 * the inode version number now. This way we only do the conversion 1135 * here rather than here and in the flush/logging code. 1136 */ 1137 if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) && 1138 ip->i_d.di_version == XFS_DINODE_VERSION_1) { 1139 ip->i_d.di_version = XFS_DINODE_VERSION_2; 1140 /* 1141 * We've already zeroed the old link count, the projid field, 1142 * and the pad field. 1143 */ 1144 } 1145 1146 /* 1147 * Project ids won't be stored on disk if we are using a version 1 inode. 1148 */ 1149 if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1150 xfs_bump_ino_vers2(tp, ip); 1151 1152 if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { 1153 ip->i_d.di_gid = pip->i_d.di_gid; 1154 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { 1155 ip->i_d.di_mode |= S_ISGID; 1156 } 1157 } 1158 1159 /* 1160 * If the group ID of the new file does not match the effective group 1161 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared 1162 * (and only if the irix_sgid_inherit compatibility variable is set). 
1163 */ 1164 if ((irix_sgid_inherit) && 1165 (ip->i_d.di_mode & S_ISGID) && 1166 (!groupmember((gid_t)ip->i_d.di_gid, curthread->td_ucred))) { 1167 ip->i_d.di_mode &= ~S_ISGID; 1168 } 1169 1170 ip->i_d.di_size = 0; 1171 ip->i_d.di_nextents = 0; 1172 ASSERT(ip->i_d.di_nblocks == 0); 1173 xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); 1174 /* 1175 * di_gen will have been taken care of in xfs_iread. 1176 */ 1177 ip->i_d.di_extsize = 0; 1178 ip->i_d.di_dmevmask = 0; 1179 ip->i_d.di_dmstate = 0; 1180 ip->i_d.di_flags = 0; 1181 flags = XFS_ILOG_CORE; 1182 switch (mode & S_IFMT) { 1183 case S_IFIFO: 1184 case S_IFCHR: 1185 case S_IFBLK: 1186 case S_IFSOCK: 1187 ip->i_d.di_format = XFS_DINODE_FMT_DEV; 1188 ip->i_df.if_u2.if_rdev = rdev; 1189 ip->i_df.if_flags = 0; 1190 flags |= XFS_ILOG_DEV; 1191 break; 1192 case S_IFREG: 1193 case S_IFDIR: 1194 if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1195 uint di_flags = 0; 1196 1197 if ((mode & S_IFMT) == S_IFDIR) { 1198 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1199 di_flags |= XFS_DIFLAG_RTINHERIT; 1200 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1201 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 1202 ip->i_d.di_extsize = pip->i_d.di_extsize; 1203 } 1204 } else if ((mode & S_IFMT) == S_IFREG) { 1205 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { 1206 di_flags |= XFS_DIFLAG_REALTIME; 1207 ip->i_iocore.io_flags |= XFS_IOCORE_RT; 1208 } 1209 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1210 di_flags |= XFS_DIFLAG_EXTSIZE; 1211 ip->i_d.di_extsize = pip->i_d.di_extsize; 1212 } 1213 } 1214 if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && 1215 xfs_inherit_noatime) 1216 di_flags |= XFS_DIFLAG_NOATIME; 1217 if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && 1218 xfs_inherit_nodump) 1219 di_flags |= XFS_DIFLAG_NODUMP; 1220 if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && 1221 xfs_inherit_sync) 1222 di_flags |= XFS_DIFLAG_SYNC; 1223 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && 1224 xfs_inherit_nosymlinks) 
1225 di_flags |= XFS_DIFLAG_NOSYMLINKS; 1226 if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1227 di_flags |= XFS_DIFLAG_PROJINHERIT; 1228 ip->i_d.di_flags |= di_flags; 1229 } 1230 /* FALLTHROUGH */ 1231 case S_IFLNK: 1232 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1233 ip->i_df.if_flags = XFS_IFEXTENTS; 1234 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; 1235 ip->i_df.if_u1.if_extents = NULL; 1236 break; 1237 default: 1238 ASSERT(0); 1239 } 1240 /* 1241 * Attribute fork settings for new inode. 1242 */ 1243 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1244 ip->i_d.di_anextents = 0; 1245 1246 /* 1247 * Log the new values stuffed into the inode. 1248 */ 1249 xfs_trans_log_inode(tp, ip, flags); 1250 1251 /* now that we have an i_mode we can set Linux inode ops (& unlock) */ 1252 XVFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1); 1253 1254 *ipp = ip; 1255 return 0; 1256} 1257 1258/* 1259 * Check to make sure that there are no blocks allocated to the 1260 * file beyond the size of the file. We don't check this for 1261 * files with fixed size extents or real time extents, but we 1262 * at least do it for regular files. 1263 */ 1264#ifdef DEBUG 1265void 1266xfs_isize_check( 1267 xfs_mount_t *mp, 1268 xfs_inode_t *ip, 1269 xfs_fsize_t isize) 1270{ 1271 xfs_fileoff_t map_first; 1272 int nimaps; 1273 xfs_bmbt_irec_t imaps[2]; 1274 1275 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1276 return; 1277 1278 if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE)) 1279 return; 1280 1281 nimaps = 2; 1282 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 1283 /* 1284 * The filesystem could be shutting down, so bmapi may return 1285 * an error. 
1286 */ 1287 if (xfs_bmapi(NULL, ip, map_first, 1288 (XFS_B_TO_FSB(mp, 1289 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - 1290 map_first), 1291 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, 1292 NULL, NULL)) 1293 return; 1294 ASSERT(nimaps == 1); 1295 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1296} 1297#endif /* DEBUG */ 1298 1299/* 1300 * Calculate the last possible buffered byte in a file. This must 1301 * include data that was buffered beyond the EOF by the write code. 1302 * This also needs to deal with overflowing the xfs_fsize_t type 1303 * which can happen for sizes near the limit. 1304 * 1305 * We also need to take into account any blocks beyond the EOF. It 1306 * may be the case that they were buffered by a write which failed. 1307 * In that case the pages will still be in memory, but the inode size 1308 * will never have been updated. 1309 */ 1310xfs_fsize_t 1311xfs_file_last_byte( 1312 xfs_inode_t *ip) 1313{ 1314 xfs_mount_t *mp; 1315 xfs_fsize_t last_byte; 1316 xfs_fileoff_t last_block; 1317 xfs_fileoff_t size_last_block; 1318 int error; 1319 1320 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS)); 1321 1322 mp = ip->i_mount; 1323 /* 1324 * Only check for blocks beyond the EOF if the extents have 1325 * been read in. This eliminates the need for the inode lock, 1326 * and it also saves us from looking when it really isn't 1327 * necessary. 
1328 */ 1329 if (ip->i_df.if_flags & XFS_IFEXTENTS) { 1330 error = xfs_bmap_last_offset(NULL, ip, &last_block, 1331 XFS_DATA_FORK); 1332 if (error) { 1333 last_block = 0; 1334 } 1335 } else { 1336 last_block = 0; 1337 } 1338 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size); 1339 last_block = XFS_FILEOFF_MAX(last_block, size_last_block); 1340 1341 last_byte = XFS_FSB_TO_B(mp, last_block); 1342 if (last_byte < 0) { 1343 return XFS_MAXIOFFSET(mp); 1344 } 1345 last_byte += (1 << mp->m_writeio_log); 1346 if (last_byte < 0) { 1347 return XFS_MAXIOFFSET(mp); 1348 } 1349 return last_byte; 1350} 1351 1352#if defined(XFS_RW_TRACE) 1353STATIC void 1354xfs_itrunc_trace( 1355 int tag, 1356 xfs_inode_t *ip, 1357 int flag, 1358 xfs_fsize_t new_size, 1359 xfs_off_t toss_start, 1360 xfs_off_t toss_finish) 1361{ 1362 if (ip->i_rwtrace == NULL) { 1363 return; 1364 } 1365 1366 ktrace_enter(ip->i_rwtrace, 1367 (void*)((long)tag), 1368 (void*)ip, 1369 (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff), 1370 (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff), 1371 (void*)((long)flag), 1372 (void*)(unsigned long)((new_size >> 32) & 0xffffffff), 1373 (void*)(unsigned long)(new_size & 0xffffffff), 1374 (void*)(unsigned long)((toss_start >> 32) & 0xffffffff), 1375 (void*)(unsigned long)(toss_start & 0xffffffff), 1376 (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff), 1377 (void*)(unsigned long)(toss_finish & 0xffffffff), 1378 (void*)(unsigned long)current_cpu(), 1379 (void*)(unsigned long)current_pid(), 1380 (void*)NULL, 1381 (void*)NULL, 1382 (void*)NULL); 1383} 1384#else 1385#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish) 1386#endif 1387 1388/* 1389 * Start the truncation of the file to new_size. The new size 1390 * must be smaller than the current size. This routine will 1391 * clear the buffer and page caches of file data in the removed 1392 * range, and xfs_itruncate_finish() will remove the underlying 1393 * disk blocks. 
1394 * 1395 * The inode must have its I/O lock locked EXCLUSIVELY, and it 1396 * must NOT have the inode lock held at all. This is because we're 1397 * calling into the buffer/page cache code and we can't hold the 1398 * inode lock when we do so. 1399 * 1400 * We need to wait for any direct I/Os in flight to complete before we 1401 * proceed with the truncate. This is needed to prevent the extents 1402 * being read or written by the direct I/Os from being removed while the 1403 * I/O is in flight as there is no other method of synchronising 1404 * direct I/O with the truncate operation. Also, because we hold 1405 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being 1406 * started until the truncate completes and drops the lock. Essentially, 1407 * the vn_iowait() call forms an I/O barrier that provides strict ordering 1408 * between direct I/Os and the truncate operation. 1409 * 1410 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1411 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used 1412 * in the case that the caller is locking things out of order and 1413 * may not be able to call xfs_itruncate_finish() with the inode lock 1414 * held without dropping the I/O lock. If the caller must drop the 1415 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() 1416 * must be called again with all the same restrictions as the initial 1417 * call. 
1418 */ 1419void 1420xfs_itruncate_start( 1421 xfs_inode_t *ip, 1422 uint flags, 1423 xfs_fsize_t new_size) 1424{ 1425 xfs_fsize_t last_byte; 1426 xfs_off_t toss_start; 1427 xfs_mount_t *mp; 1428 xfs_vnode_t *vp; 1429 1430 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); 1431 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); 1432 ASSERT((flags == XFS_ITRUNC_DEFINITE) || 1433 (flags == XFS_ITRUNC_MAYBE)); 1434 1435 mp = ip->i_mount; 1436 vp = XFS_ITOV(ip); 1437 1438 vn_iowait(vp); /* wait for the completion of any pending DIOs */ 1439 1440 /* 1441 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers 1442 * overlapping the region being removed. We have to use 1443 * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the 1444 * caller may not be able to finish the truncate without 1445 * dropping the inode's I/O lock. Make sure 1446 * to catch any pages brought in by buffers overlapping 1447 * the EOF by searching out beyond the isize by our 1448 * block size. We round new_size up to a block boundary 1449 * so that we don't toss things on the same block as 1450 * new_size but before it. 1451 * 1452 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to 1453 * call remapf() over the same region if the file is mapped. 1454 * This frees up mapped file references to the pages in the 1455 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures 1456 * that we get the latest mapped changes flushed out. 1457 */ 1458 toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1459 toss_start = XFS_FSB_TO_B(mp, toss_start); 1460 if (toss_start < 0) { 1461 /* 1462 * The place to start tossing is beyond our maximum 1463 * file size, so there is no way that the data extended 1464 * out there. 
1465 */ 1466 return; 1467 } 1468 last_byte = xfs_file_last_byte(ip); 1469 xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, 1470 last_byte); 1471 if (last_byte > toss_start) { 1472 if (flags & XFS_ITRUNC_DEFINITE) { 1473 XVOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED); 1474 } else { 1475 XVOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED); 1476 } 1477 } 1478 1479#ifdef DEBUG 1480 if (new_size == 0) { 1481 ASSERT(VN_CACHED(vp) == 0); 1482 } 1483#endif 1484} 1485 1486/* 1487 * Shrink the file to the given new_size. The new 1488 * size must be smaller than the current size. 1489 * This will free up the underlying blocks 1490 * in the removed range after a call to xfs_itruncate_start() 1491 * or xfs_atruncate_start(). 1492 * 1493 * The transaction passed to this routine must have made 1494 * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. 1495 * This routine may commit the given transaction and 1496 * start new ones, so make sure everything involved in 1497 * the transaction is tidy before calling here. 1498 * Some transaction will be returned to the caller to be 1499 * committed. The incoming transaction must already include 1500 * the inode, and both inode locks must be held exclusively. 1501 * The inode must also be "held" within the transaction. On 1502 * return the inode will be "held" within the returned transaction. 1503 * This routine does NOT require any disk space to be reserved 1504 * for it within the transaction. 1505 * 1506 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, 1507 * and it indicates the fork which is to be truncated. For the 1508 * attribute fork we only support truncation to size 0. 1509 * 1510 * We use the sync parameter to indicate whether or not the first 1511 * transaction we perform might have to be synchronous. For the attr fork, 1512 * it needs to be so if the unlink of the inode is not yet known to be 1513 * permanent in the log. 
This keeps us from freeing and reusing the 1514 * blocks of the attribute fork before the unlink of the inode becomes 1515 * permanent. 1516 * 1517 * For the data fork, we normally have to run synchronously if we're 1518 * being called out of the inactive path or we're being called 1519 * out of the create path where we're truncating an existing file. 1520 * Either way, the truncate needs to be sync so blocks don't reappear 1521 * in the file with altered data in case of a crash. wsync filesystems 1522 * can run the first case async because anything that shrinks the inode 1523 * has to run sync so by the time we're called here from inactive, the 1524 * inode size is permanently set to 0. 1525 * 1526 * Calls from the truncate path always need to be sync unless we're 1527 * in a wsync filesystem and the file has already been unlinked. 1528 * 1529 * The caller is responsible for correctly setting the sync parameter. 1530 * It gets too hard for us to guess here which path we're being called 1531 * out of just based on inode state. 1532 */ 1533int 1534xfs_itruncate_finish( 1535 xfs_trans_t **tp, 1536 xfs_inode_t *ip, 1537 xfs_fsize_t new_size, 1538 int fork, 1539 int sync) 1540{ 1541 xfs_fsblock_t first_block; 1542 xfs_fileoff_t first_unmap_block; 1543 xfs_fileoff_t last_block; 1544 xfs_filblks_t unmap_len=0; 1545 xfs_mount_t *mp; 1546 xfs_trans_t *ntp; 1547 int done; 1548 int committed; 1549 xfs_bmap_free_t free_list; 1550 int error; 1551 1552 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); 1553 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); 1554 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); 1555 ASSERT(*tp != NULL); 1556 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 1557 ASSERT(ip->i_transp == *tp); 1558 ASSERT(ip->i_itemp != NULL); 1559 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); 1560 1561 1562 ntp = *tp; 1563 mp = (ntp)->t_mountp; 1564 ASSERT(! 
XFS_NOT_DQATTACHED(mp, ip)); 1565 1566 /* 1567 * We only support truncating the entire attribute fork. 1568 */ 1569 if (fork == XFS_ATTR_FORK) { 1570 new_size = 0LL; 1571 } 1572 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1573 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); 1574 /* 1575 * The first thing we do is set the size to new_size permanently 1576 * on disk. This way we don't have to worry about anyone ever 1577 * being able to look at the data being freed even in the face 1578 * of a crash. What we're getting around here is the case where 1579 * we free a block, it is allocated to another file, it is written 1580 * to, and then we crash. If the new data gets written to the 1581 * file but the log buffers containing the free and reallocation 1582 * don't, then we'd end up with garbage in the blocks being freed. 1583 * As long as we make the new_size permanent before actually 1584 * freeing any blocks it doesn't matter if they get writtten to. 1585 * 1586 * The callers must signal into us whether or not the size 1587 * setting here must be synchronous. There are a few cases 1588 * where it doesn't have to be synchronous. Those cases 1589 * occur if the file is unlinked and we know the unlink is 1590 * permanent or if the blocks being truncated are guaranteed 1591 * to be beyond the inode eof (regardless of the link count) 1592 * and the eof value is permanent. Both of these cases occur 1593 * only on wsync-mounted filesystems. In those cases, we're 1594 * guaranteed that no user will ever see the data in the blocks 1595 * that are being truncated so the truncate can run async. 1596 * In the free beyond eof case, the file may wind up with 1597 * more blocks allocated to it than it needs if we crash 1598 * and that won't get fixed until the next time the file 1599 * is re-opened and closed but that's ok as that shouldn't 1600 * be too many blocks. 
1601 * 1602 * However, we can't just make all wsync xactions run async 1603 * because there's one call out of the create path that needs 1604 * to run sync where it's truncating an existing file to size 1605 * 0 whose size is > 0. 1606 * 1607 * It's probably possible to come up with a test in this 1608 * routine that would correctly distinguish all the above 1609 * cases from the values of the function parameters and the 1610 * inode state but for sanity's sake, I've decided to let the 1611 * layers above just tell us. It's simpler to correctly figure 1612 * out in the layer above exactly under what conditions we 1613 * can run async and I think it's easier for others read and 1614 * follow the logic in case something has to be changed. 1615 * cscope is your friend -- rcc. 1616 * 1617 * The attribute fork is much simpler. 1618 * 1619 * For the attribute fork we allow the caller to tell us whether 1620 * the unlink of the inode that led to this call is yet permanent 1621 * in the on disk log. If it is not and we will be freeing extents 1622 * in this inode then we make the first transaction synchronous 1623 * to make sure that the unlink is permanent by the time we free 1624 * the blocks. 1625 */ 1626 if (fork == XFS_DATA_FORK) { 1627 if (ip->i_d.di_nextents > 0) { 1628 ip->i_d.di_size = new_size; 1629 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1630 } 1631 } else if (sync) { 1632 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); 1633 if (ip->i_d.di_anextents > 0) 1634 xfs_trans_set_sync(ntp); 1635 } 1636 ASSERT(fork == XFS_DATA_FORK || 1637 (fork == XFS_ATTR_FORK && 1638 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || 1639 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); 1640 1641 /* 1642 * Since it is possible for space to become allocated beyond 1643 * the end of the file (in a crash where the space is allocated 1644 * but the inode size is not yet updated), simply remove any 1645 * blocks which show up between the new EOF and the maximum 1646 * possible file size. 
If the first block to be removed is 1647 * beyond the maximum file size (ie it is the same as last_block), 1648 * then there is nothing to do. 1649 */ 1650 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 1651 ASSERT(first_unmap_block <= last_block); 1652 done = 0; 1653 if (last_block == first_unmap_block) { 1654 done = 1; 1655 } else { 1656 unmap_len = last_block - first_unmap_block + 1; 1657 } 1658 while (!done) { 1659 /* 1660 * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() 1661 * will tell us whether it freed the entire range or 1662 * not. If this is a synchronous mount (wsync), 1663 * then we can tell bunmapi to keep all the 1664 * transactions asynchronous since the unlink 1665 * transaction that made this inode inactive has 1666 * already hit the disk. There's no danger of 1667 * the freed blocks being reused, there being a 1668 * crash, and the reused blocks suddenly reappearing 1669 * in this file with garbage in them once recovery 1670 * runs. 1671 */ 1672 XFS_BMAP_INIT(&free_list, &first_block); 1673 error = XFS_BUNMAPI(mp, ntp, &ip->i_iocore, 1674 first_unmap_block, unmap_len, 1675 XFS_BMAPI_AFLAG(fork) | 1676 (sync ? 0 : XFS_BMAPI_ASYNC), 1677 XFS_ITRUNC_MAX_EXTENTS, 1678 &first_block, &free_list, 1679 NULL, &done); 1680 if (error) { 1681 /* 1682 * If the bunmapi call encounters an error, 1683 * return to the caller where the transaction 1684 * can be properly aborted. We just need to 1685 * make sure we're not holding any resources 1686 * that we were not when we came in. 1687 */ 1688 xfs_bmap_cancel(&free_list); 1689 return error; 1690 } 1691 1692 /* 1693 * Duplicate the transaction that has the permanent 1694 * reservation and commit the old transaction. 1695 */ 1696 error = xfs_bmap_finish(tp, &free_list, first_block, 1697 &committed); 1698 ntp = *tp; 1699 if (error) { 1700 /* 1701 * If the bmap finish call encounters an error, 1702 * return to the caller where the transaction 1703 * can be properly aborted. 
We just need to 1704 * make sure we're not holding any resources 1705 * that we were not when we came in. 1706 * 1707 * Aborting from this point might lose some 1708 * blocks in the file system, but oh well. 1709 */ 1710 xfs_bmap_cancel(&free_list); 1711 if (committed) { 1712 /* 1713 * If the passed in transaction committed 1714 * in xfs_bmap_finish(), then we want to 1715 * add the inode to this one before returning. 1716 * This keeps things simple for the higher 1717 * level code, because it always knows that 1718 * the inode is locked and held in the 1719 * transaction that returns to it whether 1720 * errors occur or not. We don't mark the 1721 * inode dirty so that this transaction can 1722 * be easily aborted if possible. 1723 */ 1724 xfs_trans_ijoin(ntp, ip, 1725 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1726 xfs_trans_ihold(ntp, ip); 1727 } 1728 return error; 1729 } 1730 1731 if (committed) { 1732 /* 1733 * The first xact was committed, 1734 * so add the inode to the new one. 1735 * Mark it dirty so it will be logged 1736 * and moved forward in the log as 1737 * part of every commit. 1738 */ 1739 xfs_trans_ijoin(ntp, ip, 1740 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1741 xfs_trans_ihold(ntp, ip); 1742 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1743 } 1744 ntp = xfs_trans_dup(ntp); 1745 (void) xfs_trans_commit(*tp, 0, NULL); 1746 *tp = ntp; 1747 error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 1748 XFS_TRANS_PERM_LOG_RES, 1749 XFS_ITRUNCATE_LOG_COUNT); 1750 /* 1751 * Add the inode being truncated to the next chained 1752 * transaction. 1753 */ 1754 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1755 xfs_trans_ihold(ntp, ip); 1756 if (error) 1757 return (error); 1758 } 1759 /* 1760 * Only update the size in the case of the data fork, but 1761 * always re-log the inode so that our permanent transaction 1762 * can keep on rolling it forward in the log. 
1763 */ 1764 if (fork == XFS_DATA_FORK) { 1765 xfs_isize_check(mp, ip, new_size); 1766 ip->i_d.di_size = new_size; 1767 } 1768 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1769 ASSERT((new_size != 0) || 1770 (fork == XFS_ATTR_FORK) || 1771 (ip->i_delayed_blks == 0)); 1772 ASSERT((new_size != 0) || 1773 (fork == XFS_ATTR_FORK) || 1774 (ip->i_d.di_nextents == 0)); 1775 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); 1776 return 0; 1777} 1778 1779 1780/* 1781 * xfs_igrow_start 1782 * 1783 * Do the first part of growing a file: zero any data in the last 1784 * block that is beyond the old EOF. We need to do this before 1785 * the inode is joined to the transaction to modify the i_size. 1786 * That way we can drop the inode lock and call into the buffer 1787 * cache to get the buffer mapping the EOF. 1788 */ 1789int 1790xfs_igrow_start( 1791 xfs_inode_t *ip, 1792 xfs_fsize_t new_size, 1793 cred_t *credp) 1794{ 1795 int error; 1796 1797 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); 1798 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); 1799 ASSERT(new_size > ip->i_d.di_size); 1800 1801 /* 1802 * Zero any pages that may have been created by 1803 * xfs_write_file() beyond the end of the file 1804 * and any blocks between the old and new file sizes. 1805 */ 1806 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, 1807 ip->i_d.di_size, new_size); 1808 return error; 1809} 1810 1811/* 1812 * xfs_igrow_finish 1813 * 1814 * This routine is called to extend the size of a file. 1815 * The inode must have both the iolock and the ilock locked 1816 * for update and it must be a part of the current transaction. 1817 * The xfs_igrow_start() function must have been called previously. 1818 * If the change_flag is not zero, the inode change timestamp will 1819 * be updated. 
1820 */ 1821void 1822xfs_igrow_finish( 1823 xfs_trans_t *tp, 1824 xfs_inode_t *ip, 1825 xfs_fsize_t new_size, 1826 int change_flag) 1827{ 1828 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); 1829 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); 1830 ASSERT(ip->i_transp == tp); 1831 ASSERT(new_size > ip->i_d.di_size); 1832 1833 /* 1834 * Update the file size. Update the inode change timestamp 1835 * if change_flag set. 1836 */ 1837 ip->i_d.di_size = new_size; 1838 if (change_flag) 1839 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 1840 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1841 1842} 1843 1844 1845/* 1846 * This is called when the inode's link count goes to 0. 1847 * We place the on-disk inode on a list in the AGI. It 1848 * will be pulled from this list when the inode is freed. 1849 */ 1850int 1851xfs_iunlink( 1852 xfs_trans_t *tp, 1853 xfs_inode_t *ip) 1854{ 1855 xfs_mount_t *mp; 1856 xfs_agi_t *agi; 1857 xfs_dinode_t *dip; 1858 xfs_buf_t *agibp; 1859 xfs_buf_t *ibp; 1860 xfs_agnumber_t agno; 1861 xfs_daddr_t agdaddr; 1862 xfs_agino_t agino; 1863 short bucket_index; 1864 int offset; 1865 int error; 1866 int agi_ok; 1867 1868 ASSERT(ip->i_d.di_nlink == 0); 1869 ASSERT(ip->i_d.di_mode != 0); 1870 ASSERT(ip->i_transp == tp); 1871 1872 mp = tp->t_mountp; 1873 1874 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1875 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); 1876 1877 /* 1878 * Get the agi buffer first. It ensures lock ordering 1879 * on the list. 1880 */ 1881 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1882 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1883 if (error) { 1884 return error; 1885 } 1886 /* 1887 * Validate the magic number of the agi block. 
1888 */ 1889 agi = XFS_BUF_TO_AGI(agibp); 1890 agi_ok = 1891 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1892 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); 1893 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK, 1894 XFS_RANDOM_IUNLINK))) { 1895 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi); 1896 xfs_trans_brelse(tp, agibp); 1897 return XFS_ERROR(EFSCORRUPTED); 1898 } 1899 /* 1900 * Get the index into the agi hash table for the 1901 * list this inode will go on. 1902 */ 1903 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1904 ASSERT(agino != 0); 1905 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1906 ASSERT(agi->agi_unlinked[bucket_index]); 1907 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); 1908 1909 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { 1910 /* 1911 * There is already another inode in the bucket we need 1912 * to add ourselves to. Add us at the front of the list. 1913 * Here we put the head pointer into our next pointer, 1914 * and then we fall through to point the head at us. 1915 */ 1916 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 1917 if (error) { 1918 return error; 1919 } 1920 ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO); 1921 ASSERT(dip->di_next_unlinked); 1922 /* both on-disk, don't endian flip twice */ 1923 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1924 offset = ip->i_boffset + 1925 offsetof(xfs_dinode_t, di_next_unlinked); 1926 xfs_trans_inode_buf(tp, ibp); 1927 xfs_trans_log_buf(tp, ibp, offset, 1928 (offset + sizeof(xfs_agino_t) - 1)); 1929 xfs_inobp_check(mp, ibp); 1930 } 1931 1932 /* 1933 * Point the bucket head pointer at the inode being inserted. 
1934 */ 1935 ASSERT(agino != 0); 1936 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); 1937 offset = offsetof(xfs_agi_t, agi_unlinked) + 1938 (sizeof(xfs_agino_t) * bucket_index); 1939 xfs_trans_log_buf(tp, agibp, offset, 1940 (offset + sizeof(xfs_agino_t) - 1)); 1941 return 0; 1942} 1943 1944/* 1945 * Pull the on-disk inode from the AGI unlinked list. 1946 */ 1947STATIC int 1948xfs_iunlink_remove( 1949 xfs_trans_t *tp, 1950 xfs_inode_t *ip) 1951{ 1952 xfs_ino_t next_ino; 1953 xfs_mount_t *mp; 1954 xfs_agi_t *agi; 1955 xfs_dinode_t *dip; 1956 xfs_buf_t *agibp; 1957 xfs_buf_t *ibp; 1958 xfs_agnumber_t agno; 1959 xfs_daddr_t agdaddr; 1960 xfs_agino_t agino; 1961 xfs_agino_t next_agino; 1962 xfs_buf_t *last_ibp; 1963 xfs_dinode_t *last_dip = NULL; 1964 short bucket_index; 1965 int offset, last_offset = 0; 1966 int error; 1967 int agi_ok; 1968 1969 /* 1970 * First pull the on-disk inode from the AGI unlinked list. 1971 */ 1972 mp = tp->t_mountp; 1973 1974 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1975 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); 1976 1977 /* 1978 * Get the agi buffer first. It ensures lock ordering 1979 * on the list. 1980 */ 1981 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1982 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1983 if (error) { 1984 cmn_err(CE_WARN, 1985 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.", 1986 error, mp->m_fsname); 1987 return error; 1988 } 1989 /* 1990 * Validate the magic number of the agi block. 
1991 */ 1992 agi = XFS_BUF_TO_AGI(agibp); 1993 agi_ok = 1994 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1995 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); 1996 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE, 1997 XFS_RANDOM_IUNLINK_REMOVE))) { 1998 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW, 1999 mp, agi); 2000 xfs_trans_brelse(tp, agibp); 2001 cmn_err(CE_WARN, 2002 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.", 2003 mp->m_fsname); 2004 return XFS_ERROR(EFSCORRUPTED); 2005 } 2006 /* 2007 * Get the index into the agi hash table for the 2008 * list this inode will go on. 2009 */ 2010 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2011 ASSERT(agino != 0); 2012 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2013 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); 2014 ASSERT(agi->agi_unlinked[bucket_index]); 2015 2016 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { 2017 /* 2018 * We're at the head of the list. Get the inode's 2019 * on-disk buffer to see if there is anyone after us 2020 * on the list. Only modify our next pointer if it 2021 * is not already NULLAGINO. This saves us the overhead 2022 * of dealing with the buffer when there is no need to 2023 * change it. 2024 */ 2025 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 2026 if (error) { 2027 cmn_err(CE_WARN, 2028 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. 
Returning error.", 2029 error, mp->m_fsname); 2030 return error; 2031 } 2032 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); 2033 ASSERT(next_agino != 0); 2034 if (next_agino != NULLAGINO) { 2035 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); 2036 offset = ip->i_boffset + 2037 offsetof(xfs_dinode_t, di_next_unlinked); 2038 xfs_trans_inode_buf(tp, ibp); 2039 xfs_trans_log_buf(tp, ibp, offset, 2040 (offset + sizeof(xfs_agino_t) - 1)); 2041 xfs_inobp_check(mp, ibp); 2042 } else { 2043 xfs_trans_brelse(tp, ibp); 2044 } 2045 /* 2046 * Point the bucket head pointer at the next inode. 2047 */ 2048 ASSERT(next_agino != 0); 2049 ASSERT(next_agino != agino); 2050 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); 2051 offset = offsetof(xfs_agi_t, agi_unlinked) + 2052 (sizeof(xfs_agino_t) * bucket_index); 2053 xfs_trans_log_buf(tp, agibp, offset, 2054 (offset + sizeof(xfs_agino_t) - 1)); 2055 } else { 2056 /* 2057 * We need to search the list for the inode being freed. 2058 */ 2059 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2060 last_ibp = NULL; 2061 while (next_agino != agino) { 2062 /* 2063 * If the last inode wasn't the one pointing to 2064 * us, then release its buffer since we're not 2065 * going to do anything with it. 2066 */ 2067 if (last_ibp != NULL) { 2068 xfs_trans_brelse(tp, last_ibp); 2069 } 2070 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 2071 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 2072 &last_ibp, &last_offset); 2073 if (error) { 2074 cmn_err(CE_WARN, 2075 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 2076 error, mp->m_fsname); 2077 return error; 2078 } 2079 next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT); 2080 ASSERT(next_agino != NULLAGINO); 2081 ASSERT(next_agino != 0); 2082 } 2083 /* 2084 * Now last_ibp points to the buffer previous to us on 2085 * the unlinked list. Pull us from the list. 
2086 */ 2087 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 2088 if (error) { 2089 cmn_err(CE_WARN, 2090 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2091 error, mp->m_fsname); 2092 return error; 2093 } 2094 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); 2095 ASSERT(next_agino != 0); 2096 ASSERT(next_agino != agino); 2097 if (next_agino != NULLAGINO) { 2098 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); 2099 offset = ip->i_boffset + 2100 offsetof(xfs_dinode_t, di_next_unlinked); 2101 xfs_trans_inode_buf(tp, ibp); 2102 xfs_trans_log_buf(tp, ibp, offset, 2103 (offset + sizeof(xfs_agino_t) - 1)); 2104 xfs_inobp_check(mp, ibp); 2105 } else { 2106 xfs_trans_brelse(tp, ibp); 2107 } 2108 /* 2109 * Point the previous inode on the list to the next inode. 2110 */ 2111 INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino); 2112 ASSERT(next_agino != 0); 2113 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); 2114 xfs_trans_inode_buf(tp, last_ibp); 2115 xfs_trans_log_buf(tp, last_ibp, offset, 2116 (offset + sizeof(xfs_agino_t) - 1)); 2117 xfs_inobp_check(mp, last_ibp); 2118 } 2119 return 0; 2120} 2121 2122static __inline__ int xfs_inode_clean(xfs_inode_t *ip) 2123{ 2124 return (((ip->i_itemp == NULL) || 2125 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && 2126 (ip->i_update_core == 0)); 2127} 2128 2129STATIC void 2130xfs_ifree_cluster( 2131 xfs_inode_t *free_ip, 2132 xfs_trans_t *tp, 2133 xfs_ino_t inum) 2134{ 2135 xfs_mount_t *mp = free_ip->i_mount; 2136 int blks_per_cluster; 2137 int nbufs; 2138 int ninodes; 2139 int i, j, found, pre_flushed; 2140 xfs_daddr_t blkno; 2141 xfs_buf_t *bp; 2142 xfs_ihash_t *ih; 2143 xfs_inode_t *ip, **ip_found; 2144 xfs_inode_log_item_t *iip; 2145 xfs_log_item_t *lip; 2146 SPLDECL(s); 2147 2148 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 2149 blks_per_cluster = 1; 2150 ninodes = mp->m_sb.sb_inopblock; 2151 nbufs = XFS_IALLOC_BLOCKS(mp); 2152 } 
else { 2153 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / 2154 mp->m_sb.sb_blocksize; 2155 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; 2156 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 2157 } 2158 2159 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS); 2160 2161 for (j = 0; j < nbufs; j++, inum += ninodes) { 2162 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2163 XFS_INO_TO_AGBNO(mp, inum)); 2164 2165 2166 /* 2167 * Look for each inode in memory and attempt to lock it, 2168 * we can be racing with flush and tail pushing here. 2169 * any inode we get the locks on, add to an array of 2170 * inode items to process later. 2171 * 2172 * The get the buffer lock, we could beat a flush 2173 * or tail pushing thread to the lock here, in which 2174 * case they will go looking for the inode buffer 2175 * and fail, we need some other form of interlock 2176 * here. 2177 */ 2178 found = 0; 2179 for (i = 0; i < ninodes; i++) { 2180 ih = XFS_IHASH(mp, inum + i); 2181 read_lock(&ih->ih_lock); 2182 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { 2183 if (ip->i_ino == inum + i) 2184 break; 2185 } 2186 2187 /* Inode not in memory or we found it already, 2188 * nothing to do 2189 */ 2190 if (!ip || (ip->i_flags & XFS_ISTALE)) { 2191 read_unlock(&ih->ih_lock); 2192 continue; 2193 } 2194 2195 if (xfs_inode_clean(ip)) { 2196 read_unlock(&ih->ih_lock); 2197 continue; 2198 } 2199 2200 /* If we can get the locks then add it to the 2201 * list, otherwise by the time we get the bp lock 2202 * below it will already be attached to the 2203 * inode buffer. 2204 */ 2205 2206 /* This inode will already be locked - by us, lets 2207 * keep it that way. 
2208 */ 2209 2210 if (ip == free_ip) { 2211 if (xfs_iflock_nowait(ip)) { 2212 ip->i_flags |= XFS_ISTALE; 2213 2214 if (xfs_inode_clean(ip)) { 2215 xfs_ifunlock(ip); 2216 } else { 2217 ip_found[found++] = ip; 2218 } 2219 } 2220 read_unlock(&ih->ih_lock); 2221 continue; 2222 } 2223 2224 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2225 if (xfs_iflock_nowait(ip)) { 2226 ip->i_flags |= XFS_ISTALE; 2227 2228 if (xfs_inode_clean(ip)) { 2229 xfs_ifunlock(ip); 2230 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2231 } else { 2232 ip_found[found++] = ip; 2233 } 2234 } else { 2235 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2236 } 2237 } 2238 2239 read_unlock(&ih->ih_lock); 2240 } 2241 2242 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2243 mp->m_bsize * blks_per_cluster, 2244 XFS_BUF_LOCK); 2245 2246 pre_flushed = 0; 2247 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2248 while (lip) { 2249 if (lip->li_type == XFS_LI_INODE) { 2250 iip = (xfs_inode_log_item_t *)lip; 2251 ASSERT(iip->ili_logged == 1); 2252 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 2253 AIL_LOCK(mp,s); 2254 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2255 AIL_UNLOCK(mp, s); 2256 iip->ili_inode->i_flags |= XFS_ISTALE; 2257 pre_flushed++; 2258 } 2259 lip = lip->li_bio_list; 2260 } 2261 2262 for (i = 0; i < found; i++) { 2263 ip = ip_found[i]; 2264 iip = ip->i_itemp; 2265 2266 if (!iip) { 2267 ip->i_update_core = 0; 2268 xfs_ifunlock(ip); 2269 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2270 continue; 2271 } 2272 2273 iip->ili_last_fields = iip->ili_format.ilf_fields; 2274 iip->ili_format.ilf_fields = 0; 2275 iip->ili_logged = 1; 2276 AIL_LOCK(mp,s); 2277 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2278 AIL_UNLOCK(mp, s); 2279 2280 xfs_buf_attach_iodone(bp, 2281 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2282 xfs_istale_done, (xfs_log_item_t *)iip); 2283 if (ip != free_ip) { 2284 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2285 } 2286 } 2287 2288 if (found || pre_flushed) 2289 xfs_trans_stale_inode_buf(tp, bp); 2290 
xfs_trans_binval(tp, bp); 2291 } 2292 2293 kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); 2294} 2295 2296/* 2297 * This is called to return an inode to the inode free list. 2298 * The inode should already be truncated to 0 length and have 2299 * no pages associated with it. This routine also assumes that 2300 * the inode is already a part of the transaction. 2301 * 2302 * The on-disk copy of the inode will have been added to the list 2303 * of unlinked inodes in the AGI. We need to remove the inode from 2304 * that list atomically with respect to freeing it here. 2305 */ 2306int 2307xfs_ifree( 2308 xfs_trans_t *tp, 2309 xfs_inode_t *ip, 2310 xfs_bmap_free_t *flist) 2311{ 2312 int error; 2313 int delete; 2314 xfs_ino_t first_ino; 2315 2316 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); 2317 ASSERT(ip->i_transp == tp); 2318 ASSERT(ip->i_d.di_nlink == 0); 2319 ASSERT(ip->i_d.di_nextents == 0); 2320 ASSERT(ip->i_d.di_anextents == 0); 2321 ASSERT((ip->i_d.di_size == 0) || 2322 ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); 2323 ASSERT(ip->i_d.di_nblocks == 0); 2324 2325 /* 2326 * Pull the on-disk inode from the AGI unlinked list. 2327 */ 2328 error = xfs_iunlink_remove(tp, ip); 2329 if (error != 0) { 2330 return error; 2331 } 2332 2333 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2334 if (error != 0) { 2335 return error; 2336 } 2337 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2338 ip->i_d.di_flags = 0; 2339 ip->i_d.di_dmevmask = 0; 2340 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 2341 ip->i_df.if_ext_max = 2342 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 2343 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 2344 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 2345 /* 2346 * Bump the generation count so no one will be confused 2347 * by reincarnations of this inode. 
2348 */ 2349 ip->i_d.di_gen++; 2350 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2351 2352 if (delete) { 2353 xfs_ifree_cluster(ip, tp, first_ino); 2354 } 2355 2356 return 0; 2357} 2358 2359/* 2360 * Reallocate the space for if_broot based on the number of records 2361 * being added or deleted as indicated in rec_diff. Move the records 2362 * and pointers in if_broot to fit the new size. When shrinking this 2363 * will eliminate holes between the records and pointers created by 2364 * the caller. When growing this will create holes to be filled in 2365 * by the caller. 2366 * 2367 * The caller must not request to add more records than would fit in 2368 * the on-disk inode root. If the if_broot is currently NULL, then 2369 * if we adding records one will be allocated. The caller must also 2370 * not request that the number of records go below zero, although 2371 * it can go to zero. 2372 * 2373 * ip -- the inode whose if_broot area is changing 2374 * ext_diff -- the change in the number of records, positive or negative, 2375 * requested for the if_broot array. 2376 */ 2377void 2378xfs_iroot_realloc( 2379 xfs_inode_t *ip, 2380 int rec_diff, 2381 int whichfork) 2382{ 2383 int cur_max; 2384 xfs_ifork_t *ifp; 2385 xfs_bmbt_block_t *new_broot; 2386 int new_max; 2387 size_t new_size; 2388 char *np; 2389 char *op; 2390 2391 /* 2392 * Handle the degenerate case quietly. 2393 */ 2394 if (rec_diff == 0) { 2395 return; 2396 } 2397 2398 ifp = XFS_IFORK_PTR(ip, whichfork); 2399 if (rec_diff > 0) { 2400 /* 2401 * If there wasn't any memory allocated before, just 2402 * allocate it now and get out. 
2403 */ 2404 if (ifp->if_broot_bytes == 0) { 2405 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2406 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, 2407 KM_SLEEP); 2408 ifp->if_broot_bytes = (int)new_size; 2409 return; 2410 } 2411 2412 /* 2413 * If there is already an existing if_broot, then we need 2414 * to realloc() it and shift the pointers to their new 2415 * location. The records don't change location because 2416 * they are kept butted up against the btree block header. 2417 */ 2418 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2419 new_max = cur_max + rec_diff; 2420 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2421 ifp->if_broot = (xfs_bmbt_block_t *) 2422 kmem_realloc(ifp->if_broot, 2423 new_size, 2424 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2425 KM_SLEEP); 2426 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2427 ifp->if_broot_bytes); 2428 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2429 (int)new_size); 2430 ifp->if_broot_bytes = (int)new_size; 2431 ASSERT(ifp->if_broot_bytes <= 2432 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2433 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 2434 return; 2435 } 2436 2437 /* 2438 * rec_diff is less than 0. In this case, we are shrinking the 2439 * if_broot buffer. It must already exist. If we go to zero 2440 * records, just get rid of the root and clear the status bit. 2441 */ 2442 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2443 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2444 new_max = cur_max + rec_diff; 2445 ASSERT(new_max >= 0); 2446 if (new_max > 0) 2447 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2448 else 2449 new_size = 0; 2450 if (new_size > 0) { 2451 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); 2452 /* 2453 * First copy over the btree block header. 
2454 */ 2455 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); 2456 } else { 2457 new_broot = NULL; 2458 ifp->if_flags &= ~XFS_IFBROOT; 2459 } 2460 2461 /* 2462 * Only copy the records and pointers if there are any. 2463 */ 2464 if (new_max > 0) { 2465 /* 2466 * First copy the records. 2467 */ 2468 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, 2469 ifp->if_broot_bytes); 2470 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1, 2471 (int)new_size); 2472 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2473 2474 /* 2475 * Then copy the pointers. 2476 */ 2477 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2478 ifp->if_broot_bytes); 2479 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, 2480 (int)new_size); 2481 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2482 } 2483 kmem_free(ifp->if_broot, ifp->if_broot_bytes); 2484 ifp->if_broot = new_broot; 2485 ifp->if_broot_bytes = (int)new_size; 2486 ASSERT(ifp->if_broot_bytes <= 2487 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2488 return; 2489} 2490 2491 2492/* 2493 * This is called when the amount of space needed for if_data 2494 * is increased or decreased. The change in size is indicated by 2495 * the number of bytes that need to be added or deleted in the 2496 * byte_diff parameter. 2497 * 2498 * If the amount of space needed has decreased below the size of the 2499 * inline buffer, then switch to using the inline buffer. Otherwise, 2500 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2501 * to what is needed. 2502 * 2503 * ip -- the inode whose if_data area is changing 2504 * byte_diff -- the change in the number of bytes, positive or negative, 2505 * requested for the if_data array. 
2506 */ 2507void 2508xfs_idata_realloc( 2509 xfs_inode_t *ip, 2510 int byte_diff, 2511 int whichfork) 2512{ 2513 xfs_ifork_t *ifp; 2514 int new_size; 2515 int real_size; 2516 2517 if (byte_diff == 0) { 2518 return; 2519 } 2520 2521 ifp = XFS_IFORK_PTR(ip, whichfork); 2522 new_size = (int)ifp->if_bytes + byte_diff; 2523 ASSERT(new_size >= 0); 2524 2525 if (new_size == 0) { 2526 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2527 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); 2528 } 2529 ifp->if_u1.if_data = NULL; 2530 real_size = 0; 2531 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2532 /* 2533 * If the valid extents/data can fit in if_inline_ext/data, 2534 * copy them from the malloc'd vector and free it. 2535 */ 2536 if (ifp->if_u1.if_data == NULL) { 2537 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2538 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2539 ASSERT(ifp->if_real_bytes != 0); 2540 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 2541 new_size); 2542 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); 2543 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2544 } 2545 real_size = 0; 2546 } else { 2547 /* 2548 * Stuck with malloc/realloc. 2549 * For inline data, the underlying buffer must be 2550 * a multiple of 4 bytes in size so that it can be 2551 * logged and stay on word boundaries. We enforce 2552 * that here. 2553 */ 2554 real_size = roundup(new_size, 4); 2555 if (ifp->if_u1.if_data == NULL) { 2556 ASSERT(ifp->if_real_bytes == 0); 2557 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2558 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2559 /* 2560 * Only do the realloc if the underlying size 2561 * is really changing. 
2562 */ 2563 if (ifp->if_real_bytes != real_size) { 2564 ifp->if_u1.if_data = 2565 kmem_realloc(ifp->if_u1.if_data, 2566 real_size, 2567 ifp->if_real_bytes, 2568 KM_SLEEP); 2569 } 2570 } else { 2571 ASSERT(ifp->if_real_bytes == 0); 2572 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2573 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2574 ifp->if_bytes); 2575 } 2576 } 2577 ifp->if_real_bytes = real_size; 2578 ifp->if_bytes = new_size; 2579 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2580} 2581 2582 2583 2584 2585/* 2586 * Map inode to disk block and offset. 2587 * 2588 * mp -- the mount point structure for the current file system 2589 * tp -- the current transaction 2590 * ino -- the inode number of the inode to be located 2591 * imap -- this structure is filled in with the information necessary 2592 * to retrieve the given inode from disk 2593 * flags -- flags to pass to xfs_dilocate indicating whether or not 2594 * lookups in the inode btree were OK or not 2595 */ 2596int 2597xfs_imap( 2598 xfs_mount_t *mp, 2599 xfs_trans_t *tp, 2600 xfs_ino_t ino, 2601 xfs_imap_t *imap, 2602 uint flags) 2603{ 2604 xfs_fsblock_t fsbno; 2605 int len; 2606 int off; 2607 int error; 2608 2609 fsbno = imap->im_blkno ? 
2610 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; 2611 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); 2612 if (error != 0) { 2613 return error; 2614 } 2615 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); 2616 imap->im_len = XFS_FSB_TO_BB(mp, len); 2617 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); 2618 imap->im_ioffset = (ushort)off; 2619 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); 2620 return 0; 2621} 2622 2623void 2624xfs_idestroy_fork( 2625 xfs_inode_t *ip, 2626 int whichfork) 2627{ 2628 xfs_ifork_t *ifp; 2629 2630 ifp = XFS_IFORK_PTR(ip, whichfork); 2631 if (ifp->if_broot != NULL) { 2632 kmem_free(ifp->if_broot, ifp->if_broot_bytes); 2633 ifp->if_broot = NULL; 2634 } 2635 2636 /* 2637 * If the format is local, then we can't have an extents 2638 * array so just look for an inline data array. If we're 2639 * not local then we may or may not have an extents list, 2640 * so check and free it up if we do. 2641 */ 2642 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 2643 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && 2644 (ifp->if_u1.if_data != NULL)) { 2645 ASSERT(ifp->if_real_bytes != 0); 2646 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); 2647 ifp->if_u1.if_data = NULL; 2648 ifp->if_real_bytes = 0; 2649 } 2650 } else if ((ifp->if_flags & XFS_IFEXTENTS) && 2651 ((ifp->if_flags & XFS_IFEXTIREC) || 2652 ((ifp->if_u1.if_extents != NULL) && 2653 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { 2654 ASSERT(ifp->if_real_bytes != 0); 2655 xfs_iext_destroy(ifp); 2656 } 2657 ASSERT(ifp->if_u1.if_extents == NULL || 2658 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); 2659 ASSERT(ifp->if_real_bytes == 0); 2660 if (whichfork == XFS_ATTR_FORK) { 2661 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 2662 ip->i_afp = NULL; 2663 } 2664} 2665 2666/* 2667 * This is called free all the memory associated with an inode. 
2668 * It must free the inode itself and any buffers allocated for 2669 * if_extents/if_data and if_broot. It must also free the lock 2670 * associated with the inode. 2671 */ 2672void 2673xfs_idestroy( 2674 xfs_inode_t *ip) 2675{ 2676 2677 switch (ip->i_d.di_mode & S_IFMT) { 2678 case S_IFREG: 2679 case S_IFDIR: 2680 case S_IFLNK: 2681 xfs_idestroy_fork(ip, XFS_DATA_FORK); 2682 break; 2683 } 2684 if (ip->i_afp) 2685 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 2686 mrfree(&ip->i_lock); 2687 mrfree(&ip->i_iolock); 2688 freesema(&ip->i_flock); 2689#ifdef XFS_BMAP_TRACE 2690 ktrace_free(ip->i_xtrace); 2691#endif 2692#ifdef XFS_BMBT_TRACE 2693 ktrace_free(ip->i_btrace); 2694#endif 2695#ifdef XFS_RW_TRACE 2696 ktrace_free(ip->i_rwtrace); 2697#endif 2698#ifdef XFS_ILOCK_TRACE 2699 ktrace_free(ip->i_lock_trace); 2700#endif 2701#ifdef XFS_DIR2_TRACE 2702 ktrace_free(ip->i_dir_trace); 2703#endif 2704 if (ip->i_itemp) { 2705 /* XXXdpd should be able to assert this but shutdown 2706 * is leaving the AIL behind. */ 2707 ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) || 2708 XFS_FORCED_SHUTDOWN(ip->i_mount)); 2709 xfs_inode_item_destroy(ip); 2710 } 2711 kmem_zone_free(xfs_inode_zone, ip); 2712} 2713 2714 2715/* 2716 * Increment the pin count of the given buffer. 2717 * This value is protected by ipinlock spinlock in the mount structure. 2718 */ 2719void 2720xfs_ipin( 2721 xfs_inode_t *ip) 2722{ 2723 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); 2724 2725 atomic_inc(&ip->i_pincount); 2726} 2727 2728/* 2729 * Decrement the pin count of the given inode, and wake up 2730 * anyone in xfs_iwait_unpin() if the count goes to 0. The 2731 * inode must have been previously pinned with a call to xfs_ipin(). 
2732 */ 2733void 2734xfs_iunpin( 2735 xfs_inode_t *ip) 2736{ 2737 ASSERT(atomic_read(&ip->i_pincount) > 0); 2738 2739 if (atomic_dec_and_test(&ip->i_pincount)) { 2740 /* 2741 * If the inode is currently being reclaimed, the 2742 * linux inode _and_ the xfs vnode may have been 2743 * freed so we cannot reference either of them safely. 2744 * Hence we should not try to do anything to them 2745 * if the xfs inode is currently in the reclaim 2746 * path. 2747 * 2748 * However, we still need to issue the unpin wakeup 2749 * call as the inode reclaim may be blocked waiting for 2750 * the inode to become unpinned. 2751 */ 2752 if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { 2753 /* 2754 * Should I mark FreeBSD vnode as dirty here? 2755 */ 2756 printf("xfs_iunpin: REC RECABLE ip %p\n",ip); 2757#ifdef RMC 2758 xfs_vnode_t *vp = XFS_ITOV_NULL(ip); 2759 2760 /* make sync come back and flush this inode */ 2761 if (vp) { 2762 struct inode *inode = vn_to_inode(vp); 2763 2764 if (!(inode->i_state & I_NEW)) 2765 mark_inode_dirty_sync(inode); 2766 } 2767#endif 2768 } 2769 wakeup(&ip->i_ipin_wait); 2770 } 2771} 2772 2773/* 2774 * This is called to wait for the given inode to be unpinned. 2775 * It will sleep until this happens. The caller must have the 2776 * inode locked in at least shared mode so that the buffer cannot 2777 * be subsequently pinned once someone is waiting for it to be 2778 * unpinned. 2779 */ 2780STATIC void 2781xfs_iunpin_wait( 2782 xfs_inode_t *ip) 2783{ 2784 xfs_inode_log_item_t *iip; 2785 xfs_lsn_t lsn; 2786 2787 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); 2788 2789 if (atomic_read(&ip->i_pincount) == 0) { 2790 return; 2791 } 2792 2793 iip = ip->i_itemp; 2794 if (iip && iip->ili_last_lsn) { 2795 lsn = iip->ili_last_lsn; 2796 } else { 2797 lsn = (xfs_lsn_t)0; 2798 } 2799 2800 /* 2801 * Give the log a push so we don't wait here too long. 
2802 */ 2803 xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); 2804 2805 /* 2806 * XXXKAN: xfs_iunpin is not locking inode 2807 * at all? 2808 */ 2809 while(atomic_read(&ip->i_pincount) != 0) 2810 tsleep(&ip->i_ipin_wait, PRIBIO, "iunpin", 0); 2811} 2812 2813 2814/* 2815 * xfs_iextents_copy() 2816 * 2817 * This is called to copy the REAL extents (as opposed to the delayed 2818 * allocation extents) from the inode into the given buffer. It 2819 * returns the number of bytes copied into the buffer. 2820 * 2821 * If there are no delayed allocation extents, then we can just 2822 * memcpy() the extents into the buffer. Otherwise, we need to 2823 * examine each extent in turn and skip those which are delayed. 2824 */ 2825int 2826xfs_iextents_copy( 2827 xfs_inode_t *ip, 2828 xfs_bmbt_rec_t *buffer, 2829 int whichfork) 2830{ 2831 int copied; 2832 xfs_bmbt_rec_t *dest_ep; 2833 xfs_bmbt_rec_t *ep; 2834#ifdef XFS_BMAP_TRACE 2835 static char fname[] = "xfs_iextents_copy"; 2836#endif 2837 int i; 2838 xfs_ifork_t *ifp; 2839 int nrecs; 2840 xfs_fsblock_t start_block; 2841 2842 ifp = XFS_IFORK_PTR(ip, whichfork); 2843 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 2844 ASSERT(ifp->if_bytes > 0); 2845 2846 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2847 xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); 2848 ASSERT(nrecs > 0); 2849 2850 /* 2851 * There are some delayed allocation extents in the 2852 * inode, so copy the extents one at a time and skip 2853 * the delayed ones. There must be at least one 2854 * non-delayed extent. 2855 */ 2856 dest_ep = buffer; 2857 copied = 0; 2858 for (i = 0; i < nrecs; i++) { 2859 ep = xfs_iext_get_ext(ifp, i); 2860 start_block = xfs_bmbt_get_startblock(ep); 2861 if (ISNULLSTARTBLOCK(start_block)) { 2862 /* 2863 * It's a delayed allocation extent, so skip it. 
2864 */ 2865 continue; 2866 } 2867 2868 /* Translate to on disk format */ 2869 put_unaligned(INT_GET(ep->l0, ARCH_CONVERT), 2870 (__uint64_t*)&dest_ep->l0); 2871 put_unaligned(INT_GET(ep->l1, ARCH_CONVERT), 2872 (__uint64_t*)&dest_ep->l1); 2873 dest_ep++; 2874 copied++; 2875 } 2876 ASSERT(copied != 0); 2877 xfs_validate_extents(ifp, copied, 1, XFS_EXTFMT_INODE(ip)); 2878 2879 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2880} 2881 2882/* 2883 * Each of the following cases stores data into the same region 2884 * of the on-disk inode, so only one of them can be valid at 2885 * any given time. While it is possible to have conflicting formats 2886 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2887 * in EXTENTS format, this can only happen when the fork has 2888 * changed formats after being modified but before being flushed. 2889 * In these cases, the format always takes precedence, because the 2890 * format indicates the current state of the fork. 2891 */ 2892/*ARGSUSED*/ 2893STATIC int 2894xfs_iflush_fork( 2895 xfs_inode_t *ip, 2896 xfs_dinode_t *dip, 2897 xfs_inode_log_item_t *iip, 2898 int whichfork, 2899 xfs_buf_t *bp) 2900{ 2901 char *cp; 2902 xfs_ifork_t *ifp; 2903 xfs_mount_t *mp; 2904#ifdef XFS_TRANS_DEBUG 2905 int first; 2906#endif 2907 static const short brootflag[2] = 2908 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2909 static const short dataflag[2] = 2910 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2911 static const short extflag[2] = 2912 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2913 2914 if (iip == NULL) 2915 return 0; 2916 ifp = XFS_IFORK_PTR(ip, whichfork); 2917 /* 2918 * This can happen if we gave up in iformat in an error path, 2919 * for the attribute fork. 
2920 */ 2921 if (ifp == NULL) { 2922 ASSERT(whichfork == XFS_ATTR_FORK); 2923 return 0; 2924 } 2925 cp = XFS_DFORK_PTR(dip, whichfork); 2926 mp = ip->i_mount; 2927 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2928 case XFS_DINODE_FMT_LOCAL: 2929 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && 2930 (ifp->if_bytes > 0)) { 2931 ASSERT(ifp->if_u1.if_data != NULL); 2932 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2933 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2934 } 2935 if (whichfork == XFS_DATA_FORK) { 2936 if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) { 2937 XFS_ERROR_REPORT("xfs_iflush_fork", 2938 XFS_ERRLEVEL_LOW, mp); 2939 return XFS_ERROR(EFSCORRUPTED); 2940 } 2941 } 2942 break; 2943 2944 case XFS_DINODE_FMT_EXTENTS: 2945 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2946 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2947 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || 2948 (ifp->if_bytes == 0)); 2949 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || 2950 (ifp->if_bytes > 0)); 2951 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2952 (ifp->if_bytes > 0)) { 2953 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2954 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2955 whichfork); 2956 } 2957 break; 2958 2959 case XFS_DINODE_FMT_BTREE: 2960 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && 2961 (ifp->if_broot_bytes > 0)) { 2962 ASSERT(ifp->if_broot != NULL); 2963 ASSERT(ifp->if_broot_bytes <= 2964 (XFS_IFORK_SIZE(ip, whichfork) + 2965 XFS_BROOT_SIZE_ADJ)); 2966 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, 2967 (xfs_bmdr_block_t *)cp, 2968 XFS_DFORK_SIZE(dip, mp, whichfork)); 2969 } 2970 break; 2971 2972 case XFS_DINODE_FMT_DEV: 2973 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2974 ASSERT(whichfork == XFS_DATA_FORK); 2975 INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev); 2976 } 2977 break; 2978 2979 case XFS_DINODE_FMT_UUID: 2980 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2981 
ASSERT(whichfork == XFS_DATA_FORK); 2982 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, 2983 sizeof(uuid_t)); 2984 } 2985 break; 2986 2987 default: 2988 ASSERT(0); 2989 break; 2990 } 2991 2992 return 0; 2993} 2994 2995/* 2996 * xfs_iflush() will write a modified inode's changes out to the 2997 * inode's on disk home. The caller must have the inode lock held 2998 * in at least shared mode and the inode flush semaphore must be 2999 * held as well. The inode lock will still be held upon return from 3000 * the call and the caller is free to unlock it. 3001 * The inode flush lock will be unlocked when the inode reaches the disk. 3002 * The flags indicate how the inode's buffer should be written out. 3003 */ 3004int 3005xfs_iflush( 3006 xfs_inode_t *ip, 3007 uint flags) 3008{ 3009 xfs_inode_log_item_t *iip; 3010 xfs_buf_t *bp; 3011 xfs_dinode_t *dip; 3012 xfs_mount_t *mp; 3013 int error; 3014 /* REFERENCED */ 3015 xfs_chash_t *ch; 3016 xfs_inode_t *iq; 3017 int clcount; /* count of inodes clustered */ 3018 int bufwasdelwri; 3019 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; 3020 SPLDECL(s); 3021 3022 XFS_STATS_INC(xs_iflush_count); 3023 3024 3025 printf("xfs_iflush: ip %p i_ino %lld\n",ip,ip->i_ino); 3026 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 3027 ASSERT(valusema(&ip->i_flock) <= 0); 3028 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3029 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3030 3031 iip = ip->i_itemp; 3032 mp = ip->i_mount; 3033 3034 /* 3035 * If the inode isn't dirty, then just release the inode 3036 * flush lock and do nothing. 3037 */ 3038 if ((ip->i_update_core == 0) && 3039 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3040 ASSERT((iip != NULL) ? 3041 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); 3042 xfs_ifunlock(ip); 3043 return 0; 3044 } 3045 3046 /* 3047 * We can't flush the inode until it is unpinned, so 3048 * wait for it. 
We know noone new can pin it, because 3049 * we are holding the inode lock shared and you need 3050 * to hold it exclusively to pin the inode. 3051 */ 3052 xfs_iunpin_wait(ip); 3053 3054 /* 3055 * This may have been unpinned because the filesystem is shutting 3056 * down forcibly. If that's the case we must not write this inode 3057 * to disk, because the log record didn't make it to disk! 3058 */ 3059 if (XFS_FORCED_SHUTDOWN(mp)) { 3060 ip->i_update_core = 0; 3061 if (iip) 3062 iip->ili_format.ilf_fields = 0; 3063 xfs_ifunlock(ip); 3064 return XFS_ERROR(EIO); 3065 } 3066 3067 /* 3068 * Get the buffer containing the on-disk inode. 3069 */ 3070 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0); 3071 if (error) { 3072 xfs_ifunlock(ip); 3073 return error; 3074 } 3075 3076 /* 3077 * Decide how buffer will be flushed out. This is done before 3078 * the call to xfs_iflush_int because this field is zeroed by it. 3079 */ 3080 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3081 /* 3082 * Flush out the inode buffer according to the directions 3083 * of the caller. In the cases where the caller has given 3084 * us a choice choose the non-delwri case. This is because 3085 * the inode is in the AIL and we need to get it out soon. 
3086 */ 3087 switch (flags) { 3088 case XFS_IFLUSH_SYNC: 3089 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3090 flags = 0; 3091 break; 3092 case XFS_IFLUSH_ASYNC: 3093 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3094 flags = INT_ASYNC; 3095 break; 3096 case XFS_IFLUSH_DELWRI: 3097 flags = INT_DELWRI; 3098 break; 3099 default: 3100 ASSERT(0); 3101 flags = 0; 3102 break; 3103 } 3104 } else { 3105 switch (flags) { 3106 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3107 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3108 case XFS_IFLUSH_DELWRI: 3109 flags = INT_DELWRI; 3110 break; 3111 case XFS_IFLUSH_ASYNC: 3112 flags = INT_ASYNC; 3113 break; 3114 case XFS_IFLUSH_SYNC: 3115 flags = 0; 3116 break; 3117 default: 3118 ASSERT(0); 3119 flags = 0; 3120 break; 3121 } 3122 } 3123 3124 /* 3125 * First flush out the inode that xfs_iflush was called with. 3126 */ 3127 error = xfs_iflush_int(ip, bp); 3128 if (error) { 3129 goto corrupt_out; 3130 } 3131 3132 /* 3133 * inode clustering: 3134 * see if other inodes can be gathered into this write 3135 */ 3136 3137 ip->i_chash->chl_buf = bp; 3138 3139 ch = XFS_CHASH(mp, ip->i_blkno); 3140 s = mutex_spinlock(&ch->ch_lock); 3141 3142 clcount = 0; 3143 for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) { 3144 /* 3145 * Do an un-protected check to see if the inode is dirty and 3146 * is a candidate for flushing. These checks will be repeated 3147 * later after the appropriate locks are acquired. 3148 */ 3149 iip = iq->i_itemp; 3150 if ((iq->i_update_core == 0) && 3151 ((iip == NULL) || 3152 !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && 3153 xfs_ipincount(iq) == 0) { 3154 continue; 3155 } 3156 3157 /* 3158 * Try to get locks. If any are unavailable, 3159 * then this inode cannot be flushed and is skipped. 
3160 */ 3161 3162 /* get inode locks (just i_lock) */ 3163 if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { 3164 /* get inode flush lock */ 3165 if (xfs_iflock_nowait(iq)) { 3166 /* check if pinned */ 3167 if (xfs_ipincount(iq) == 0) { 3168 /* arriving here means that 3169 * this inode can be flushed. 3170 * first re-check that it's 3171 * dirty 3172 */ 3173 iip = iq->i_itemp; 3174 if ((iq->i_update_core != 0)|| 3175 ((iip != NULL) && 3176 (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3177 clcount++; 3178 error = xfs_iflush_int(iq, bp); 3179 if (error) { 3180 xfs_iunlock(iq, 3181 XFS_ILOCK_SHARED); 3182 goto cluster_corrupt_out; 3183 } 3184 } else { 3185 xfs_ifunlock(iq); 3186 } 3187 } else { 3188 xfs_ifunlock(iq); 3189 } 3190 } 3191 xfs_iunlock(iq, XFS_ILOCK_SHARED); 3192 } 3193 } 3194 mutex_spinunlock(&ch->ch_lock, s); 3195 3196 if (clcount) { 3197 XFS_STATS_INC(xs_icluster_flushcnt); 3198 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 3199 } 3200 3201 /* 3202 * If the buffer is pinned then push on the log so we won't 3203 * get stuck waiting in the write for too long. 3204 */ 3205 if (XFS_BUF_ISPINNED(bp)){ 3206 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 3207 } 3208 3209 if (flags & INT_DELWRI) { 3210 xfs_bdwrite(mp, bp); 3211 } else if (flags & INT_ASYNC) { 3212 xfs_bawrite(mp, bp); 3213 } else { 3214 error = xfs_bwrite(mp, bp); 3215 } 3216 return error; 3217 3218corrupt_out: 3219 xfs_buf_relse(bp); 3220 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); 3221 xfs_iflush_abort(ip); 3222 /* 3223 * Unlocks the flush lock 3224 */ 3225 return XFS_ERROR(EFSCORRUPTED); 3226 3227cluster_corrupt_out: 3228 /* Corruption detected in the clustering loop. Invalidate the 3229 * inode buffer and shut down the filesystem. 3230 */ 3231 mutex_spinunlock(&ch->ch_lock, s); 3232 3233 /* 3234 * Clean up the buffer. If it was B_DELWRI, just release it -- 3235 * brelse can handle it with no problems. If not, shut down the 3236 * filesystem before releasing the buffer. 
3237 */ 3238 if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { 3239 xfs_buf_relse(bp); 3240 } 3241 3242 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); 3243 3244 if(!bufwasdelwri) { 3245 /* 3246 * Just like incore_relse: if we have b_iodone functions, 3247 * mark the buffer as an error and call them. Otherwise 3248 * mark it as stale and brelse. 3249 */ 3250 if (XFS_BUF_IODONE_FUNC(bp)) { 3251 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 3252 XFS_BUF_UNDONE(bp); 3253 XFS_BUF_STALE(bp); 3254 XFS_BUF_SHUT(bp); 3255 XFS_BUF_ERROR(bp,EIO); 3256 xfs_biodone(bp); 3257 } else { 3258 XFS_BUF_STALE(bp); 3259 xfs_buf_relse(bp); 3260 } 3261 } 3262 3263 xfs_iflush_abort(iq); 3264 /* 3265 * Unlocks the flush lock 3266 */ 3267 return XFS_ERROR(EFSCORRUPTED); 3268} 3269 3270 3271STATIC int 3272xfs_iflush_int( 3273 xfs_inode_t *ip, 3274 xfs_buf_t *bp) 3275{ 3276 xfs_inode_log_item_t *iip; 3277 xfs_dinode_t *dip; 3278 xfs_mount_t *mp; 3279#ifdef XFS_TRANS_DEBUG 3280 // int first; 3281#endif 3282 SPLDECL(s); 3283 3284 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 3285 ASSERT(valusema(&ip->i_flock) <= 0); 3286 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3287 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3288 3289 iip = ip->i_itemp; 3290 mp = ip->i_mount; 3291 3292 3293 /* 3294 * If the inode isn't dirty, then just release the inode 3295 * flush lock and do nothing. 3296 */ 3297 if ((ip->i_update_core == 0) && 3298 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3299 xfs_ifunlock(ip); 3300 return 0; 3301 } 3302 3303 /* set *dip = inode's place in the buffer */ 3304 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3305 3306 /* 3307 * Clear i_update_core before copying out the data. 3308 * This is for coordination with our timestamp updates 3309 * that don't hold the inode lock. 
They will always 3310 * update the timestamps BEFORE setting i_update_core, 3311 * so if we clear i_update_core after they set it we 3312 * are guaranteed to see their updates to the timestamps. 3313 * I believe that this depends on strongly ordered memory 3314 * semantics, but we have that. We use the SYNCHRONIZE 3315 * macro to make sure that the compiler does not reorder 3316 * the i_update_core access below the data copy below. 3317 */ 3318 ip->i_update_core = 0; 3319 SYNCHRONIZE(); 3320 3321 /* 3322 * Make sure to get the latest atime from the Linux inode. 3323 */ 3324 xfs_synchronize_atime(ip); 3325 3326 if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, 3327 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3328 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3329 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3330 ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip); 3331 goto corrupt_out; 3332 } 3333 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3334 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 3335 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3336 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 3337 ip->i_ino, ip, ip->i_d.di_magic); 3338 goto corrupt_out; 3339 } 3340 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 3341 if (XFS_TEST_ERROR( 3342 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3343 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 3344 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 3345 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3346 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 3347 ip->i_ino, ip); 3348 goto corrupt_out; 3349 } 3350 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 3351 if (XFS_TEST_ERROR( 3352 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3353 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 3354 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 3355 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 3356 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3357 
"xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 3358 ip->i_ino, ip); 3359 goto corrupt_out; 3360 } 3361 } 3362 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 3363 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 3364 XFS_RANDOM_IFLUSH_5)) { 3365 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3366 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 3367 ip->i_ino, 3368 ip->i_d.di_nextents + ip->i_d.di_anextents, 3369 ip->i_d.di_nblocks, 3370 ip); 3371 goto corrupt_out; 3372 } 3373 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 3374 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 3375 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3376 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 3377 ip->i_ino, ip->i_d.di_forkoff, ip); 3378 goto corrupt_out; 3379 } 3380 /* 3381 * bump the flush iteration count, used to detect flushes which 3382 * postdate a log record during recovery. 3383 */ 3384 3385 ip->i_d.di_flushiter++; 3386 3387 /* 3388 * Copy the dirty parts of the inode into the on-disk 3389 * inode. We always copy out the core of the inode, 3390 * because if the inode is dirty at all the core must 3391 * be. 3392 */ 3393 xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1); 3394 3395 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3396 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3397 ip->i_d.di_flushiter = 0; 3398 3399 /* 3400 * If this is really an old format inode and the superblock version 3401 * has not been updated to support only new format inodes, then 3402 * convert back to the old inode format. If the superblock version 3403 * has been updated, then make the conversion permanent. 3404 */ 3405 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 3406 XFS_SB_VERSION_HASNLINK(&mp->m_sb)); 3407 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 3408 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { 3409 /* 3410 * Convert it back. 
3411 */ 3412 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3413 INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink); 3414 } else { 3415 /* 3416 * The superblock version has already been bumped, 3417 * so just make the conversion to the new inode 3418 * format permanent. 3419 */ 3420 ip->i_d.di_version = XFS_DINODE_VERSION_2; 3421 INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2); 3422 ip->i_d.di_onlink = 0; 3423 dip->di_core.di_onlink = 0; 3424 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3425 memset(&(dip->di_core.di_pad[0]), 0, 3426 sizeof(dip->di_core.di_pad)); 3427 ASSERT(ip->i_d.di_projid == 0); 3428 } 3429 } 3430 3431 if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { 3432 goto corrupt_out; 3433 } 3434 3435 if (XFS_IFORK_Q(ip)) { 3436 /* 3437 * The only error from xfs_iflush_fork is on the data fork. 3438 */ 3439 (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 3440 } 3441 xfs_inobp_check(mp, bp); 3442 3443 /* 3444 * We've recorded everything logged in the inode, so we'd 3445 * like to clear the ilf_fields bits so we don't log and 3446 * flush things unnecessarily. However, we can't stop 3447 * logging all this information until the data we've copied 3448 * into the disk buffer is written to disk. If we did we might 3449 * overwrite the copy of the inode in the log with all the 3450 * data after re-logging only part of it, and in the face of 3451 * a crash we wouldn't have all the data we need to recover. 3452 * 3453 * What we do is move the bits to the ili_last_fields field. 3454 * When logging the inode, these bits are moved back to the 3455 * ilf_fields field. In the xfs_iflush_done() routine we 3456 * clear ili_last_fields, since we know that the information 3457 * those bits represent is permanently on disk. As long as 3458 * the flush completes before the inode is logged again, then 3459 * both ilf_fields and ili_last_fields will be cleared. 
3460 * 3461 * We can play with the ilf_fields bits here, because the inode 3462 * lock must be held exclusively in order to set bits there 3463 * and the flush lock protects the ili_last_fields bits. 3464 * Set ili_logged so the flush done 3465 * routine can tell whether or not to look in the AIL. 3466 * Also, store the current LSN of the inode so that we can tell 3467 * whether the item has moved in the AIL from xfs_iflush_done(). 3468 * In order to read the lsn we need the AIL lock, because 3469 * it is a 64 bit value that cannot be read atomically. 3470 */ 3471 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3472 iip->ili_last_fields = iip->ili_format.ilf_fields; 3473 iip->ili_format.ilf_fields = 0; 3474 iip->ili_logged = 1; 3475 3476 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ 3477 AIL_LOCK(mp,s); 3478 iip->ili_flush_lsn = iip->ili_item.li_lsn; 3479 AIL_UNLOCK(mp, s); 3480 3481 /* 3482 * Attach the function xfs_iflush_done to the inode's 3483 * buffer. This will remove the inode from the AIL 3484 * and unlock the inode's flush lock when the inode is 3485 * completely written to disk. 3486 */ 3487 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) 3488 xfs_iflush_done, (xfs_log_item_t *)iip); 3489 3490 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3491 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3492 } else { 3493 /* 3494 * We're flushing an inode which is not in the AIL and has 3495 * not been logged but has i_update_core set. For this 3496 * case we can use a B_DELWRI flush and immediately drop 3497 * the inode flush lock because we can avoid the whole 3498 * AIL state thing. It's OK to drop the flush lock now, 3499 * because we've already locked the buffer and to do anything 3500 * you really need both. 
3501 */ 3502 if (iip != NULL) { 3503 ASSERT(iip->ili_logged == 0); 3504 ASSERT(iip->ili_last_fields == 0); 3505 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 3506 } 3507 xfs_ifunlock(ip); 3508 } 3509 3510 return 0; 3511 3512corrupt_out: 3513 return XFS_ERROR(EFSCORRUPTED); 3514} 3515 3516 3517/* 3518 * Flush all inactive inodes in mp. 3519 */ 3520void 3521xfs_iflush_all( 3522 xfs_mount_t *mp) 3523{ 3524 int done; 3525 int purged; 3526 xfs_inode_t *ip; 3527 xfs_vnode_t *vp; 3528 3529 done = 0; 3530 while (!done) { 3531 purged = 0; 3532 XFS_MOUNT_ILOCK(mp); 3533 ip = mp->m_inodes; 3534 if (ip == NULL) { 3535 break; 3536 } 3537 do { 3538 /* Make sure we skip markers inserted by sync */ 3539 if (ip->i_mount == NULL) { 3540 ip = ip->i_mnext; 3541 continue; 3542 } 3543 3544 /* 3545 * It's up to our caller to purge the root 3546 * and quota vnodes later. 3547 */ 3548 vp = XFS_ITOV_NULL(ip); 3549 3550 if (!vp) { 3551 XFS_MOUNT_IUNLOCK(mp); 3552 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); 3553 purged = 1; 3554 break; 3555 } 3556 3557 if (vn_count(vp) != 0) { 3558 if (vn_count(vp) == 1 && 3559 (ip == mp->m_rootip || 3560 (mp->m_quotainfo && 3561 (ip->i_ino == mp->m_sb.sb_uquotino || 3562 ip->i_ino == mp->m_sb.sb_gquotino)))) { 3563 ip = ip->i_mnext; 3564 continue; 3565 } 3566 /* 3567 * Ignore busy inodes but continue flushing 3568 * others. 3569 */ 3570 ip = ip->i_mnext; 3571 continue; 3572 } 3573 /* 3574 * Sample vp mapping while holding mp locked on MP 3575 * systems, so we don't purge a reclaimed or 3576 * nonexistent vnode. We break from the loop 3577 * since we know that we modify 3578 * it by pulling ourselves from it in xfs_reclaim() 3579 * called via vn_purge() below. Set ip to the next 3580 * entry in the list anyway so we'll know below 3581 * whether we reached the end or not. 
3582 */ 3583 3584 XFS_MOUNT_IUNLOCK(mp); 3585 vn_purge(vp); 3586 purged = 1; 3587 break; 3588 } while (ip != mp->m_inodes); 3589 /* 3590 * We need to distinguish between when we exit the loop 3591 * after a purge and when we simply hit the end of the 3592 * list. We can't use the (ip == mp->m_inodes) test, 3593 * because when we purge an inode at the start of the list 3594 * the next inode on the list becomes mp->m_inodes. That 3595 * would cause such a test to bail out early. The purged 3596 * variable tells us how we got out of the loop. 3597 */ 3598 if (!purged) { 3599 done = 1; 3600 } 3601 } 3602 XFS_MOUNT_IUNLOCK(mp); 3603} 3604 3605/* 3606 * xfs_iaccess: check accessibility of inode for mode. 3607 * This function is quite linuxy now 3608 * probably should be move to a os specfic location 3609 */ 3610int 3611xfs_iaccess( 3612 xfs_inode_t *ip, 3613 accmode_t accmode, 3614 cred_t *cr) 3615{ 3616 xfs_vnode_t *vp; 3617 int error; 3618 3619 mode_t imode; 3620 3621 vp = XFS_ITOV(ip); 3622 /* FreeBSD local change here */ 3623 imode = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_vnode->v_type); 3624 /* 3625 * Verify that the MAC policy allows the requested access. 3626 */ 3627 if ((error = _MAC_XFS_IACCESS(ip, accmode, cr))) 3628 return XFS_ERROR(error); 3629 3630 if (accmode & VWRITE) { 3631 xfs_mount_t *mp = ip->i_mount; 3632 3633 if ((XVFSTOMNT(XFS_MTOVFS(mp))->mnt_flag & MNT_RDONLY) && 3634 (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode))) 3635 return XFS_ERROR(EROFS); 3636 3637#ifdef XXXKAN 3638 if (IS_IMMUTABLE(inode)) 3639 return XFS_ERROR(EACCES); 3640#endif 3641 } 3642 3643 /* 3644 * If there's an Access Control List it's used instead of 3645 * the mode bits. 3646 */ 3647 if ((error = _ACL_XFS_IACCESS(ip, accmode, cr)) != -1) 3648 return error ? 
XFS_ERROR(error) : 0; 3649 3650 3651 /* FreeBSD local change here */ 3652 error = vaccess(vp->v_vnode->v_type, imode, ip->i_d.di_uid, ip->i_d.di_gid, 3653 accmode, cr, NULL); 3654 3655 return (error); 3656} 3657 3658/* 3659 * xfs_iroundup: round up argument to next power of two 3660 */ 3661uint 3662xfs_iroundup( 3663 uint v) 3664{ 3665 int i; 3666 uint m; 3667 3668 if ((v & (v - 1)) == 0) 3669 return v; 3670 ASSERT((v & 0x80000000) == 0); 3671 if ((v & (v + 1)) == 0) 3672 return v + 1; 3673 for (i = 0, m = 1; i < 31; i++, m <<= 1) { 3674 if (v & m) 3675 continue; 3676 v |= m; 3677 if ((v & (v + 1)) == 0) 3678 return v + 1; 3679 } 3680 ASSERT(0); 3681 return( 0 ); 3682} 3683 3684#ifdef XFS_ILOCK_TRACE 3685ktrace_t *xfs_ilock_trace_buf; 3686 3687void 3688xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) 3689{ 3690 ktrace_enter(ip->i_lock_trace, 3691 (void *)ip, 3692 (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */ 3693 (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */ 3694 (void *)ra, /* caller of ilock */ 3695 (void *)(unsigned long)current_cpu(), 3696 (void *)(unsigned long)current_pid(), 3697 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); 3698} 3699#endif 3700 3701/* 3702 * Return a pointer to the extent record at file index idx. 
3703 */ 3704xfs_bmbt_rec_t * 3705xfs_iext_get_ext( 3706 xfs_ifork_t *ifp, /* inode fork pointer */ 3707 xfs_extnum_t idx) /* index of target extent */ 3708{ 3709 ASSERT(idx >= 0); 3710 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3711 return ifp->if_u1.if_ext_irec->er_extbuf; 3712 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3713 xfs_ext_irec_t *erp; /* irec pointer */ 3714 int erp_idx = 0; /* irec index */ 3715 xfs_extnum_t page_idx = idx; /* ext index in target list */ 3716 3717 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3718 return &erp->er_extbuf[page_idx]; 3719 } else if (ifp->if_bytes) { 3720 return &ifp->if_u1.if_extents[idx]; 3721 } else { 3722 return NULL; 3723 } 3724} 3725 3726/* 3727 * Insert new item(s) into the extent records for incore inode 3728 * fork 'ifp'. 'count' new items are inserted at index 'idx'. 3729 */ 3730void 3731xfs_iext_insert( 3732 xfs_ifork_t *ifp, /* inode fork pointer */ 3733 xfs_extnum_t idx, /* starting index of new items */ 3734 xfs_extnum_t count, /* number of inserted items */ 3735 xfs_bmbt_irec_t *new) /* items to insert */ 3736{ 3737 xfs_bmbt_rec_t *ep; /* extent record pointer */ 3738 xfs_extnum_t i; /* extent record index */ 3739 3740 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3741 xfs_iext_add(ifp, idx, count); 3742 for (i = idx; i < idx + count; i++, new++) { 3743 ep = xfs_iext_get_ext(ifp, i); 3744 xfs_bmbt_set_all(ep, new); 3745 } 3746} 3747 3748/* 3749 * This is called when the amount of space required for incore file 3750 * extents needs to be increased. The ext_diff parameter stores the 3751 * number of new extents being added and the idx parameter contains 3752 * the extent index where the new extents will be added. If the new 3753 * extents are being appended, then we just need to (re)allocate and 3754 * initialize the space. 
Otherwise, if the new extents are being 3755 * inserted into the middle of the existing entries, a bit more work 3756 * is required to make room for the new extents to be inserted. The 3757 * caller is responsible for filling in the new extent entries upon 3758 * return. 3759 */ 3760void 3761xfs_iext_add( 3762 xfs_ifork_t *ifp, /* inode fork pointer */ 3763 xfs_extnum_t idx, /* index to begin adding exts */ 3764 int ext_diff) /* number of extents to add */ 3765{ 3766 int byte_diff; /* new bytes being added */ 3767 int new_size; /* size of extents after adding */ 3768 xfs_extnum_t nextents; /* number of extents in file */ 3769 3770 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3771 ASSERT((idx >= 0) && (idx <= nextents)); 3772 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); 3773 new_size = ifp->if_bytes + byte_diff; 3774 /* 3775 * If the new number of extents (nextents + ext_diff) 3776 * fits inside the inode, then continue to use the inline 3777 * extent buffer. 3778 */ 3779 if (nextents + ext_diff <= XFS_INLINE_EXTS) { 3780 if (idx < nextents) { 3781 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], 3782 &ifp->if_u2.if_inline_ext[idx], 3783 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3784 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); 3785 } 3786 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3787 ifp->if_real_bytes = 0; 3788 ifp->if_lastex = nextents + ext_diff; 3789 } 3790 /* 3791 * Otherwise use a linear (direct) extent list. 3792 * If the extents are currently inside the inode, 3793 * xfs_iext_realloc_direct will switch us from 3794 * inline to direct extent allocation mode. 
3795 */ 3796 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { 3797 xfs_iext_realloc_direct(ifp, new_size); 3798 if (idx < nextents) { 3799 memmove(&ifp->if_u1.if_extents[idx + ext_diff], 3800 &ifp->if_u1.if_extents[idx], 3801 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3802 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); 3803 } 3804 } 3805 /* Indirection array */ 3806 else { 3807 xfs_ext_irec_t *erp; 3808 int erp_idx = 0; 3809 int page_idx = idx; 3810 3811 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); 3812 if (ifp->if_flags & XFS_IFEXTIREC) { 3813 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); 3814 } else { 3815 xfs_iext_irec_init(ifp); 3816 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3817 erp = ifp->if_u1.if_ext_irec; 3818 } 3819 /* Extents fit in target extent page */ 3820 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { 3821 if (page_idx < erp->er_extcount) { 3822 memmove(&erp->er_extbuf[page_idx + ext_diff], 3823 &erp->er_extbuf[page_idx], 3824 (erp->er_extcount - page_idx) * 3825 sizeof(xfs_bmbt_rec_t)); 3826 memset(&erp->er_extbuf[page_idx], 0, byte_diff); 3827 } 3828 erp->er_extcount += ext_diff; 3829 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3830 } 3831 /* Insert a new extent page */ 3832 else if (erp) { 3833 xfs_iext_add_indirect_multi(ifp, 3834 erp_idx, page_idx, ext_diff); 3835 } 3836 /* 3837 * If extent(s) are being appended to the last page in 3838 * the indirection array and the new extent(s) don't fit 3839 * in the page, then erp is NULL and erp_idx is set to 3840 * the next index needed in the indirection array. 
3841 */ 3842 else { 3843 int count = ext_diff; 3844 3845 while (count) { 3846 erp = xfs_iext_irec_new(ifp, erp_idx); 3847 erp->er_extcount = count; 3848 count -= MIN(count, (int)XFS_LINEAR_EXTS); 3849 if (count) { 3850 erp_idx++; 3851 } 3852 } 3853 } 3854 } 3855 ifp->if_bytes = new_size; 3856} 3857 3858/* 3859 * This is called when incore extents are being added to the indirection 3860 * array and the new extents do not fit in the target extent list. The 3861 * erp_idx parameter contains the irec index for the target extent list 3862 * in the indirection array, and the idx parameter contains the extent 3863 * index within the list. The number of extents being added is stored 3864 * in the count parameter. 3865 * 3866 * |-------| |-------| 3867 * | | | | idx - number of extents before idx 3868 * | idx | | count | 3869 * | | | | count - number of extents being inserted at idx 3870 * |-------| |-------| 3871 * | count | | nex2 | nex2 - number of extents after idx + count 3872 * |-------| |-------| 3873 */ 3874void 3875xfs_iext_add_indirect_multi( 3876 xfs_ifork_t *ifp, /* inode fork pointer */ 3877 int erp_idx, /* target extent irec index */ 3878 xfs_extnum_t idx, /* index within target list */ 3879 int count) /* new extents being added */ 3880{ 3881 int byte_diff; /* new bytes being added */ 3882 xfs_ext_irec_t *erp; /* pointer to irec entry */ 3883 xfs_extnum_t ext_diff; /* number of extents to add */ 3884 xfs_extnum_t ext_cnt; /* new extents still needed */ 3885 xfs_extnum_t nex2; /* extents after idx + count */ 3886 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ 3887 int nlists; /* number of irec's (lists) */ 3888 3889 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3890 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3891 nex2 = erp->er_extcount - idx; 3892 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3893 3894 /* 3895 * Save second part of target extent list 3896 * (all extents past */ 3897 if (nex2) { 3898 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3899 
nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP); 3900 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 3901 erp->er_extcount -= nex2; 3902 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 3903 memset(&erp->er_extbuf[idx], 0, byte_diff); 3904 } 3905 3906 /* 3907 * Add the new extents to the end of the target 3908 * list, then allocate new irec record(s) and 3909 * extent buffer(s) as needed to store the rest 3910 * of the new extents. 3911 */ 3912 ext_cnt = count; 3913 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); 3914 if (ext_diff) { 3915 erp->er_extcount += ext_diff; 3916 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3917 ext_cnt -= ext_diff; 3918 } 3919 while (ext_cnt) { 3920 erp_idx++; 3921 erp = xfs_iext_irec_new(ifp, erp_idx); 3922 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); 3923 erp->er_extcount = ext_diff; 3924 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3925 ext_cnt -= ext_diff; 3926 } 3927 3928 /* Add nex2 extents back to indirection array */ 3929 if (nex2) { 3930 xfs_extnum_t ext_avail; 3931 int i; 3932 3933 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3934 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; 3935 i = 0; 3936 /* 3937 * If nex2 extents fit in the current page, append 3938 * nex2_ep after the new extents. 3939 */ 3940 if (nex2 <= ext_avail) { 3941 i = erp->er_extcount; 3942 } 3943 /* 3944 * Otherwise, check if space is available in the 3945 * next page. 3946 */ 3947 else if ((erp_idx < nlists - 1) && 3948 (nex2 <= (ext_avail = XFS_LINEAR_EXTS - 3949 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { 3950 erp_idx++; 3951 erp++; 3952 /* Create a hole for nex2 extents */ 3953 memmove(&erp->er_extbuf[nex2], erp->er_extbuf, 3954 erp->er_extcount * sizeof(xfs_bmbt_rec_t)); 3955 } 3956 /* 3957 * Final choice, create a new extent page for 3958 * nex2 extents. 
3959 */ 3960 else { 3961 erp_idx++; 3962 erp = xfs_iext_irec_new(ifp, erp_idx); 3963 } 3964 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); 3965 kmem_free(nex2_ep, byte_diff); 3966 erp->er_extcount += nex2; 3967 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); 3968 } 3969} 3970 3971/* 3972 * This is called when the amount of space required for incore file 3973 * extents needs to be decreased. The ext_diff parameter stores the 3974 * number of extents to be removed and the idx parameter contains 3975 * the extent index where the extents will be removed from. 3976 * 3977 * If the amount of space needed has decreased below the linear 3978 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous 3979 * extent array. Otherwise, use kmem_realloc() to adjust the 3980 * size to what is needed. 3981 */ 3982void 3983xfs_iext_remove( 3984 xfs_ifork_t *ifp, /* inode fork pointer */ 3985 xfs_extnum_t idx, /* index to begin removing exts */ 3986 int ext_diff) /* number of extents to remove */ 3987{ 3988 xfs_extnum_t nextents; /* number of extents in file */ 3989 int new_size; /* size of extents after removal */ 3990 3991 ASSERT(ext_diff > 0); 3992 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3993 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3994 3995 if (new_size == 0) { 3996 xfs_iext_destroy(ifp); 3997 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3998 xfs_iext_remove_indirect(ifp, idx, ext_diff); 3999 } else if (ifp->if_real_bytes) { 4000 xfs_iext_remove_direct(ifp, idx, ext_diff); 4001 } else { 4002 xfs_iext_remove_inline(ifp, idx, ext_diff); 4003 } 4004 ifp->if_bytes = new_size; 4005} 4006 4007/* 4008 * This removes ext_diff extents from the inline buffer, beginning 4009 * at extent index idx. 
4010 */ 4011void 4012xfs_iext_remove_inline( 4013 xfs_ifork_t *ifp, /* inode fork pointer */ 4014 xfs_extnum_t idx, /* index to begin removing exts */ 4015 int ext_diff) /* number of extents to remove */ 4016{ 4017 int nextents; /* number of extents in file */ 4018 4019 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 4020 ASSERT(idx < XFS_INLINE_EXTS); 4021 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4022 ASSERT(((nextents - ext_diff) > 0) && 4023 (nextents - ext_diff) < XFS_INLINE_EXTS); 4024 4025 if (idx + ext_diff < nextents) { 4026 memmove(&ifp->if_u2.if_inline_ext[idx], 4027 &ifp->if_u2.if_inline_ext[idx + ext_diff], 4028 (nextents - (idx + ext_diff)) * 4029 sizeof(xfs_bmbt_rec_t)); 4030 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], 4031 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 4032 } else { 4033 memset(&ifp->if_u2.if_inline_ext[idx], 0, 4034 ext_diff * sizeof(xfs_bmbt_rec_t)); 4035 } 4036} 4037 4038/* 4039 * This removes ext_diff extents from a linear (direct) extent list, 4040 * beginning at extent index idx. If the extents are being removed 4041 * from the end of the list (ie. truncate) then we just need to re- 4042 * allocate the list to remove the extra space. Otherwise, if the 4043 * extents are being removed from the middle of the existing extent 4044 * entries, then we first need to move the extent records beginning 4045 * at idx + ext_diff up in the list to overwrite the records being 4046 * removed, then remove the extra space via kmem_realloc. 
4047 */ 4048void 4049xfs_iext_remove_direct( 4050 xfs_ifork_t *ifp, /* inode fork pointer */ 4051 xfs_extnum_t idx, /* index to begin removing exts */ 4052 int ext_diff) /* number of extents to remove */ 4053{ 4054 xfs_extnum_t nextents; /* number of extents in file */ 4055 int new_size; /* size of extents after removal */ 4056 4057 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 4058 new_size = ifp->if_bytes - 4059 (ext_diff * sizeof(xfs_bmbt_rec_t)); 4060 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4061 4062 if (new_size == 0) { 4063 xfs_iext_destroy(ifp); 4064 return; 4065 } 4066 /* Move extents up in the list (if needed) */ 4067 if (idx + ext_diff < nextents) { 4068 memmove(&ifp->if_u1.if_extents[idx], 4069 &ifp->if_u1.if_extents[idx + ext_diff], 4070 (nextents - (idx + ext_diff)) * 4071 sizeof(xfs_bmbt_rec_t)); 4072 } 4073 memset(&ifp->if_u1.if_extents[nextents - ext_diff], 4074 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 4075 /* 4076 * Reallocate the direct extent list. If the extents 4077 * will fit inside the inode then xfs_iext_realloc_direct 4078 * will switch from direct to inline extent allocation 4079 * mode for us. 4080 */ 4081 xfs_iext_realloc_direct(ifp, new_size); 4082 ifp->if_bytes = new_size; 4083} 4084 4085/* 4086 * This is called when incore extents are being removed from the 4087 * indirection array and the extents being removed span multiple extent 4088 * buffers. The idx parameter contains the file extent index where we 4089 * want to begin removing extents, and the count parameter contains 4090 * how many extents need to be removed. 
4091 * 4092 * |-------| |-------| 4093 * | nex1 | | | nex1 - number of extents before idx 4094 * |-------| | count | 4095 * | | | | count - number of extents being removed at idx 4096 * | count | |-------| 4097 * | | | nex2 | nex2 - number of extents after idx + count 4098 * |-------| |-------| 4099 */ 4100void 4101xfs_iext_remove_indirect( 4102 xfs_ifork_t *ifp, /* inode fork pointer */ 4103 xfs_extnum_t idx, /* index to begin removing extents */ 4104 int count) /* number of extents to remove */ 4105{ 4106 xfs_ext_irec_t *erp; /* indirection array pointer */ 4107 int erp_idx = 0; /* indirection array index */ 4108 xfs_extnum_t ext_cnt; /* extents left to remove */ 4109 xfs_extnum_t ext_diff; /* extents to remove in current list */ 4110 xfs_extnum_t nex1; /* number of extents before idx */ 4111 xfs_extnum_t nex2; /* extents after idx + count */ 4112 int nlists; /* entries in indirection array */ 4113 int page_idx = idx; /* index in target extent list */ 4114 4115 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4116 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 4117 ASSERT(erp != NULL); 4118 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4119 nex1 = page_idx; 4120 ext_cnt = count; 4121 while (ext_cnt) { 4122 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); 4123 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); 4124 /* 4125 * Check for deletion of entire list; 4126 * xfs_iext_irec_remove() updates extent offsets. 
4127 */ 4128 if (ext_diff == erp->er_extcount) { 4129 xfs_iext_irec_remove(ifp, erp_idx); 4130 ext_cnt -= ext_diff; 4131 nex1 = 0; 4132 if (ext_cnt) { 4133 ASSERT(erp_idx < ifp->if_real_bytes / 4134 XFS_IEXT_BUFSZ); 4135 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4136 nex1 = 0; 4137 continue; 4138 } else { 4139 break; 4140 } 4141 } 4142 /* Move extents up (if needed) */ 4143 if (nex2) { 4144 memmove(&erp->er_extbuf[nex1], 4145 &erp->er_extbuf[nex1 + ext_diff], 4146 nex2 * sizeof(xfs_bmbt_rec_t)); 4147 } 4148 /* Zero out rest of page */ 4149 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - 4150 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); 4151 /* Update remaining counters */ 4152 erp->er_extcount -= ext_diff; 4153 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); 4154 ext_cnt -= ext_diff; 4155 nex1 = 0; 4156 erp_idx++; 4157 erp++; 4158 } 4159 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); 4160 xfs_iext_irec_compact(ifp); 4161} 4162 4163/* 4164 * Create, destroy, or resize a linear (direct) block of extents. 
4165 */ 4166void 4167xfs_iext_realloc_direct( 4168 xfs_ifork_t *ifp, /* inode fork pointer */ 4169 int new_size) /* new size of extents */ 4170{ 4171 int rnew_size; /* real new size of extents */ 4172 4173 rnew_size = new_size; 4174 4175 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || 4176 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && 4177 (new_size != ifp->if_real_bytes))); 4178 4179 /* Free extent records */ 4180 if (new_size == 0) { 4181 xfs_iext_destroy(ifp); 4182 } 4183 /* Resize direct extent list and zero any new bytes */ 4184 else if (ifp->if_real_bytes) { 4185 /* Check if extents will fit inside the inode */ 4186 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { 4187 xfs_iext_direct_to_inline(ifp, new_size / 4188 (uint)sizeof(xfs_bmbt_rec_t)); 4189 ifp->if_bytes = new_size; 4190 return; 4191 } 4192 if ((new_size & (new_size - 1)) != 0) { 4193 rnew_size = xfs_iroundup(new_size); 4194 } 4195 if (rnew_size != ifp->if_real_bytes) { 4196 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) 4197 kmem_realloc(ifp->if_u1.if_extents, 4198 rnew_size, 4199 ifp->if_real_bytes, 4200 KM_SLEEP); 4201 } 4202 if (rnew_size > ifp->if_real_bytes) { 4203 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 4204 (uint)sizeof(xfs_bmbt_rec_t)], 0, 4205 rnew_size - ifp->if_real_bytes); 4206 } 4207 } 4208 /* 4209 * Switch from the inline extent buffer to a direct 4210 * extent list. Be sure to include the inline extent 4211 * bytes in new_size. 4212 */ 4213 else { 4214 new_size += ifp->if_bytes; 4215 if ((new_size & (new_size - 1)) != 0) { 4216 rnew_size = xfs_iroundup(new_size); 4217 } 4218 xfs_iext_inline_to_direct(ifp, rnew_size); 4219 } 4220 ifp->if_real_bytes = rnew_size; 4221 ifp->if_bytes = new_size; 4222} 4223 4224/* 4225 * Switch from linear (direct) extent records to inline buffer. 
4226 */ 4227void 4228xfs_iext_direct_to_inline( 4229 xfs_ifork_t *ifp, /* inode fork pointer */ 4230 xfs_extnum_t nextents) /* number of extents in file */ 4231{ 4232 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 4233 ASSERT(nextents <= XFS_INLINE_EXTS); 4234 /* 4235 * The inline buffer was zeroed when we switched 4236 * from inline to direct extent allocation mode, 4237 * so we don't need to clear it here. 4238 */ 4239 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, 4240 nextents * sizeof(xfs_bmbt_rec_t)); 4241 kmem_free(ifp->if_u1.if_extents, KM_SLEEP); 4242 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 4243 ifp->if_real_bytes = 0; 4244} 4245 4246/* 4247 * Switch from inline buffer to linear (direct) extent records. 4248 * new_size should already be rounded up to the next power of 2 4249 * by the caller (when appropriate), so use new_size as it is. 4250 * However, since new_size may be rounded up, we can't update 4251 * if_bytes here. It is the caller's responsibility to update 4252 * if_bytes upon return. 4253 */ 4254void 4255xfs_iext_inline_to_direct( 4256 xfs_ifork_t *ifp, /* inode fork pointer */ 4257 int new_size) /* number of extents in file */ 4258{ 4259 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) 4260 kmem_alloc(new_size, KM_SLEEP); 4261 memset(ifp->if_u1.if_extents, 0, new_size); 4262 if (ifp->if_bytes) { 4263 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 4264 ifp->if_bytes); 4265 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 4266 sizeof(xfs_bmbt_rec_t)); 4267 } 4268 ifp->if_real_bytes = new_size; 4269} 4270 4271/* 4272 * Resize an extent indirection array to new_size bytes. 
4273 */ 4274void 4275xfs_iext_realloc_indirect( 4276 xfs_ifork_t *ifp, /* inode fork pointer */ 4277 int new_size) /* new indirection array size */ 4278{ 4279 int nlists; /* number of irec's (ex lists) */ 4280 int size; /* current indirection array size */ 4281 4282 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4283 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4284 size = nlists * sizeof(xfs_ext_irec_t); 4285 ASSERT(ifp->if_real_bytes); 4286 ASSERT((new_size >= 0) && (new_size != size)); 4287 if (new_size == 0) { 4288 xfs_iext_destroy(ifp); 4289 } else { 4290 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 4291 kmem_realloc(ifp->if_u1.if_ext_irec, 4292 new_size, size, KM_SLEEP); 4293 } 4294} 4295 4296/* 4297 * Switch from indirection array to linear (direct) extent allocations. 4298 */ 4299void 4300xfs_iext_indirect_to_direct( 4301 xfs_ifork_t *ifp) /* inode fork pointer */ 4302{ 4303 xfs_bmbt_rec_t *ep; /* extent record pointer */ 4304 xfs_extnum_t nextents; /* number of extents in file */ 4305 int size; /* size of file extents */ 4306 4307 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4308 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4309 ASSERT(nextents <= XFS_LINEAR_EXTS); 4310 size = nextents * sizeof(xfs_bmbt_rec_t); 4311 4312 xfs_iext_irec_compact_full(ifp); 4313 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 4314 4315 ep = ifp->if_u1.if_ext_irec->er_extbuf; 4316 kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t)); 4317 ifp->if_flags &= ~XFS_IFEXTIREC; 4318 ifp->if_u1.if_extents = ep; 4319 ifp->if_bytes = size; 4320 if (nextents < XFS_LINEAR_EXTS) { 4321 xfs_iext_realloc_direct(ifp, size); 4322 } 4323} 4324 4325/* 4326 * Free incore file extents. 
4327 */ 4328void 4329xfs_iext_destroy( 4330 xfs_ifork_t *ifp) /* inode fork pointer */ 4331{ 4332 if (ifp->if_flags & XFS_IFEXTIREC) { 4333 int erp_idx; 4334 int nlists; 4335 4336 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4337 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { 4338 xfs_iext_irec_remove(ifp, erp_idx); 4339 } 4340 ifp->if_flags &= ~XFS_IFEXTIREC; 4341 } else if (ifp->if_real_bytes) { 4342 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); 4343 } else if (ifp->if_bytes) { 4344 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 4345 sizeof(xfs_bmbt_rec_t)); 4346 } 4347 ifp->if_u1.if_extents = NULL; 4348 ifp->if_real_bytes = 0; 4349 ifp->if_bytes = 0; 4350} 4351 4352/* 4353 * Return a pointer to the extent record for file system block bno. 4354 */ 4355xfs_bmbt_rec_t * /* pointer to found extent record */ 4356xfs_iext_bno_to_ext( 4357 xfs_ifork_t *ifp, /* inode fork pointer */ 4358 xfs_fileoff_t bno, /* block number to search for */ 4359 xfs_extnum_t *idxp) /* index of target extent */ 4360{ 4361 xfs_bmbt_rec_t *base; /* pointer to first extent */ 4362 xfs_filblks_t blockcount = 0; /* number of blocks in extent */ 4363 xfs_bmbt_rec_t *ep = NULL; /* pointer to target extent */ 4364 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 4365 int high; /* upper boundary in search */ 4366 xfs_extnum_t idx = 0; /* index of target extent */ 4367 int low; /* lower boundary in search */ 4368 xfs_extnum_t nextents; /* number of file extents */ 4369 xfs_fileoff_t startoff = 0; /* start offset of extent */ 4370 4371 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4372 if (nextents == 0) { 4373 *idxp = 0; 4374 return NULL; 4375 } 4376 low = 0; 4377 if (ifp->if_flags & XFS_IFEXTIREC) { 4378 /* Find target extent list */ 4379 int erp_idx = 0; 4380 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); 4381 base = erp->er_extbuf; 4382 high = erp->er_extcount - 1; 4383 } else { 4384 base = ifp->if_u1.if_extents; 4385 high = nextents - 1; 4386 } 4387 
/* Binary search extent records */ 4388 while (low <= high) { 4389 idx = (low + high) >> 1; 4390 ep = base + idx; 4391 startoff = xfs_bmbt_get_startoff(ep); 4392 blockcount = xfs_bmbt_get_blockcount(ep); 4393 if (bno < startoff) { 4394 high = idx - 1; 4395 } else if (bno >= startoff + blockcount) { 4396 low = idx + 1; 4397 } else { 4398 /* Convert back to file-based extent index */ 4399 if (ifp->if_flags & XFS_IFEXTIREC) { 4400 idx += erp->er_extoff; 4401 } 4402 *idxp = idx; 4403 return ep; 4404 } 4405 } 4406 /* Convert back to file-based extent index */ 4407 if (ifp->if_flags & XFS_IFEXTIREC) { 4408 idx += erp->er_extoff; 4409 } 4410 if (bno >= startoff + blockcount) { 4411 if (++idx == nextents) { 4412 ep = NULL; 4413 } else { 4414 ep = xfs_iext_get_ext(ifp, idx); 4415 } 4416 } 4417 *idxp = idx; 4418 return ep; 4419} 4420 4421/* 4422 * Return a pointer to the indirection array entry containing the 4423 * extent record for filesystem block bno. Store the index of the 4424 * target irec in *erp_idxp. 4425 */ 4426xfs_ext_irec_t * /* pointer to found extent record */ 4427xfs_iext_bno_to_irec( 4428 xfs_ifork_t *ifp, /* inode fork pointer */ 4429 xfs_fileoff_t bno, /* block number to search for */ 4430 int *erp_idxp) /* irec index of target ext list */ 4431{ 4432 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 4433 xfs_ext_irec_t *erp_next; /* next indirection array entry */ 4434 int erp_idx; /* indirection array index */ 4435 int nlists; /* number of extent irec's (lists) */ 4436 int high; /* binary search upper limit */ 4437 int low; /* binary search lower limit */ 4438 4439 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4440 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4441 erp_idx = 0; 4442 low = 0; 4443 high = nlists - 1; 4444 while (low <= high) { 4445 erp_idx = (low + high) >> 1; 4446 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4447 erp_next = erp_idx < nlists - 1 ? 
erp + 1 : NULL; 4448 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { 4449 high = erp_idx - 1; 4450 } else if (erp_next && bno >= 4451 xfs_bmbt_get_startoff(erp_next->er_extbuf)) { 4452 low = erp_idx + 1; 4453 } else { 4454 break; 4455 } 4456 } 4457 *erp_idxp = erp_idx; 4458 return erp; 4459} 4460 4461/* 4462 * Return a pointer to the indirection array entry containing the 4463 * extent record at file extent index *idxp. Store the index of the 4464 * target irec in *erp_idxp and store the page index of the target 4465 * extent record in *idxp. 4466 */ 4467xfs_ext_irec_t * 4468xfs_iext_idx_to_irec( 4469 xfs_ifork_t *ifp, /* inode fork pointer */ 4470 xfs_extnum_t *idxp, /* extent index (file -> page) */ 4471 int *erp_idxp, /* pointer to target irec */ 4472 int realloc) /* new bytes were just added */ 4473{ 4474 xfs_ext_irec_t *prev; /* pointer to previous irec */ 4475 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ 4476 int erp_idx; /* indirection array index */ 4477 int nlists; /* number of irec's (ex lists) */ 4478 int high; /* binary search upper limit */ 4479 int low; /* binary search lower limit */ 4480 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 4481 4482 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4483 ASSERT(page_idx >= 0 && page_idx <= 4484 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 4485 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4486 erp_idx = 0; 4487 low = 0; 4488 high = nlists - 1; 4489 4490 /* Binary search extent irec's */ 4491 while (low <= high) { 4492 erp_idx = (low + high) >> 1; 4493 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4494 prev = erp_idx > 0 ? 
erp - 1 : NULL; 4495 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && 4496 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { 4497 high = erp_idx - 1; 4498 } else if (page_idx > erp->er_extoff + erp->er_extcount || 4499 (page_idx == erp->er_extoff + erp->er_extcount && 4500 !realloc)) { 4501 low = erp_idx + 1; 4502 } else if (page_idx == erp->er_extoff + erp->er_extcount && 4503 erp->er_extcount == XFS_LINEAR_EXTS) { 4504 ASSERT(realloc); 4505 page_idx = 0; 4506 erp_idx++; 4507 erp = erp_idx < nlists ? erp + 1 : NULL; 4508 break; 4509 } else { 4510 page_idx -= erp->er_extoff; 4511 break; 4512 } 4513 } 4514 *idxp = page_idx; 4515 *erp_idxp = erp_idx; 4516 return(erp); 4517} 4518 4519/* 4520 * Allocate and initialize an indirection array once the space needed 4521 * for incore extents increases above XFS_IEXT_BUFSZ. 4522 */ 4523void 4524xfs_iext_irec_init( 4525 xfs_ifork_t *ifp) /* inode fork pointer */ 4526{ 4527 xfs_ext_irec_t *erp; /* indirection array pointer */ 4528 xfs_extnum_t nextents; /* number of extents in file */ 4529 4530 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 4531 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4532 ASSERT(nextents <= XFS_LINEAR_EXTS); 4533 4534 erp = (xfs_ext_irec_t *) 4535 kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP); 4536 4537 if (nextents == 0) { 4538 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) 4539 kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); 4540 } else if (!ifp->if_real_bytes) { 4541 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); 4542 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { 4543 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); 4544 } 4545 erp->er_extbuf = ifp->if_u1.if_extents; 4546 erp->er_extcount = nextents; 4547 erp->er_extoff = 0; 4548 4549 ifp->if_flags |= XFS_IFEXTIREC; 4550 ifp->if_real_bytes = XFS_IEXT_BUFSZ; 4551 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); 4552 ifp->if_u1.if_ext_irec = erp; 4553 4554 return; 4555} 4556 4557/* 4558 * Allocate and initialize a new entry in the 
indirection array. 4559 */ 4560xfs_ext_irec_t * 4561xfs_iext_irec_new( 4562 xfs_ifork_t *ifp, /* inode fork pointer */ 4563 int erp_idx) /* index for new irec */ 4564{ 4565 xfs_ext_irec_t *erp; /* indirection array pointer */ 4566 int i; /* loop counter */ 4567 int nlists; /* number of irec's (ex lists) */ 4568 4569 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4570 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4571 4572 /* Resize indirection array */ 4573 xfs_iext_realloc_indirect(ifp, ++nlists * 4574 sizeof(xfs_ext_irec_t)); 4575 /* 4576 * Move records down in the array so the 4577 * new page can use erp_idx. 4578 */ 4579 erp = ifp->if_u1.if_ext_irec; 4580 for (i = nlists - 1; i > erp_idx; i--) { 4581 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); 4582 } 4583 ASSERT(i == erp_idx); 4584 4585 /* Initialize new extent record */ 4586 erp = ifp->if_u1.if_ext_irec; 4587 erp[erp_idx].er_extbuf = (xfs_bmbt_rec_t *) 4588 kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); 4589 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 4590 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); 4591 erp[erp_idx].er_extcount = 0; 4592 erp[erp_idx].er_extoff = erp_idx > 0 ? 4593 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; 4594 return (&erp[erp_idx]); 4595} 4596 4597/* 4598 * Remove a record from the indirection array. 
4599 */ 4600void 4601xfs_iext_irec_remove( 4602 xfs_ifork_t *ifp, /* inode fork pointer */ 4603 int erp_idx) /* irec index to remove */ 4604{ 4605 xfs_ext_irec_t *erp; /* indirection array pointer */ 4606 int i; /* loop counter */ 4607 int nlists; /* number of irec's (ex lists) */ 4608 4609 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4610 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4611 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4612 if (erp->er_extbuf) { 4613 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, 4614 -erp->er_extcount); 4615 kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ); 4616 } 4617 /* Compact extent records */ 4618 erp = ifp->if_u1.if_ext_irec; 4619 for (i = erp_idx; i < nlists - 1; i++) { 4620 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); 4621 } 4622 /* 4623 * Manually free the last extent record from the indirection 4624 * array. A call to xfs_iext_realloc_indirect() with a size 4625 * of zero would result in a call to xfs_iext_destroy() which 4626 * would in turn call this function again, creating a nasty 4627 * infinite loop. 4628 */ 4629 if (--nlists) { 4630 xfs_iext_realloc_indirect(ifp, 4631 nlists * sizeof(xfs_ext_irec_t)); 4632 } else { 4633 kmem_free(ifp->if_u1.if_ext_irec, 4634 sizeof(xfs_ext_irec_t)); 4635 } 4636 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 4637} 4638 4639/* 4640 * This is called to clean up large amounts of unused memory allocated 4641 * by the indirection array. Before compacting anything though, verify 4642 * that the indirection array is still needed and switch back to the 4643 * linear extent list (or even the inline buffer) if possible. 
The 4644 * compaction policy is as follows: 4645 * 4646 * Full Compaction: Extents fit into a single page (or inline buffer) 4647 * Full Compaction: Extents occupy less than 10% of allocated space 4648 * Partial Compaction: Extents occupy > 10% and < 50% of allocated space 4649 * No Compaction: Extents occupy at least 50% of allocated space 4650 */ 4651void 4652xfs_iext_irec_compact( 4653 xfs_ifork_t *ifp) /* inode fork pointer */ 4654{ 4655 xfs_extnum_t nextents; /* number of extents in file */ 4656 int nlists; /* number of irec's (ex lists) */ 4657 4658 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4659 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4660 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4661 4662 if (nextents == 0) { 4663 xfs_iext_destroy(ifp); 4664 } else if (nextents <= XFS_INLINE_EXTS) { 4665 xfs_iext_indirect_to_direct(ifp); 4666 xfs_iext_direct_to_inline(ifp, nextents); 4667 } else if (nextents <= XFS_LINEAR_EXTS) { 4668 xfs_iext_indirect_to_direct(ifp); 4669 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { 4670 xfs_iext_irec_compact_full(ifp); 4671 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 4672 xfs_iext_irec_compact_pages(ifp); 4673 } 4674} 4675 4676/* 4677 * Combine extents from neighboring extent pages. 
4678 */ 4679void 4680xfs_iext_irec_compact_pages( 4681 xfs_ifork_t *ifp) /* inode fork pointer */ 4682{ 4683 xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ 4684 int erp_idx = 0; /* indirection array index */ 4685 int nlists; /* number of irec's (ex lists) */ 4686 4687 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4688 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4689 while (erp_idx < nlists - 1) { 4690 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4691 erp_next = erp + 1; 4692 if (erp_next->er_extcount <= 4693 (XFS_LINEAR_EXTS - erp->er_extcount)) { 4694 memmove(&erp->er_extbuf[erp->er_extcount], 4695 erp_next->er_extbuf, erp_next->er_extcount * 4696 sizeof(xfs_bmbt_rec_t)); 4697 erp->er_extcount += erp_next->er_extcount; 4698 /* 4699 * Free page before removing extent record 4700 * so er_extoffs don't get modified in 4701 * xfs_iext_irec_remove. 4702 */ 4703 kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ); 4704 erp_next->er_extbuf = NULL; 4705 xfs_iext_irec_remove(ifp, erp_idx + 1); 4706 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4707 } else { 4708 erp_idx++; 4709 } 4710 } 4711} 4712 4713/* 4714 * Fully compact the extent records managed by the indirection array. 
4715 */ 4716void 4717xfs_iext_irec_compact_full( 4718 xfs_ifork_t *ifp) /* inode fork pointer */ 4719{ 4720 xfs_bmbt_rec_t *ep, *ep_next; /* extent record pointers */ 4721 xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ 4722 int erp_idx = 0; /* extent irec index */ 4723 int ext_avail; /* empty entries in ex list */ 4724 int ext_diff; /* number of exts to add */ 4725 int nlists; /* number of irec's (ex lists) */ 4726 4727 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4728 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4729 erp = ifp->if_u1.if_ext_irec; 4730 ep = &erp->er_extbuf[erp->er_extcount]; 4731 erp_next = erp + 1; 4732 ep_next = erp_next->er_extbuf; 4733 while (erp_idx < nlists - 1) { 4734 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; 4735 ext_diff = MIN(ext_avail, erp_next->er_extcount); 4736 memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); 4737 erp->er_extcount += ext_diff; 4738 erp_next->er_extcount -= ext_diff; 4739 /* Remove next page */ 4740 if (erp_next->er_extcount == 0) { 4741 /* 4742 * Free page before removing extent record 4743 * so er_extoffs don't get modified in 4744 * xfs_iext_irec_remove. 
4745 */ 4746 kmem_free(erp_next->er_extbuf, 4747 erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); 4748 erp_next->er_extbuf = NULL; 4749 xfs_iext_irec_remove(ifp, erp_idx + 1); 4750 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4751 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4752 /* Update next page */ 4753 } else { 4754 /* Move rest of page up to become next new page */ 4755 memmove(erp_next->er_extbuf, ep_next, 4756 erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); 4757 ep_next = erp_next->er_extbuf; 4758 memset(&ep_next[erp_next->er_extcount], 0, 4759 (XFS_LINEAR_EXTS - erp_next->er_extcount) * 4760 sizeof(xfs_bmbt_rec_t)); 4761 } 4762 if (erp->er_extcount == XFS_LINEAR_EXTS) { 4763 erp_idx++; 4764 if (erp_idx < nlists) 4765 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4766 else 4767 break; 4768 } 4769 ep = &erp->er_extbuf[erp->er_extcount]; 4770 erp_next = erp + 1; 4771 ep_next = erp_next->er_extbuf; 4772 } 4773} 4774 4775/* 4776 * This is called to update the er_extoff field in the indirection 4777 * array when extents have been added or removed from one of the 4778 * extent lists. erp_idx contains the irec index to begin updating 4779 * at and ext_diff contains the number of extents that were added 4780 * or removed. 4781 */ 4782void 4783xfs_iext_irec_update_extoffs( 4784 xfs_ifork_t *ifp, /* inode fork pointer */ 4785 int erp_idx, /* irec index to update */ 4786 int ext_diff) /* number of new extents */ 4787{ 4788 int i; /* loop counter */ 4789 int nlists; /* number of irec's (ex lists */ 4790 4791 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4792 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4793 for (i = erp_idx; i < nlists; i++) { 4794 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 4795 } 4796} 4797