1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#ifdef _KERNEL 29#include <sys/types.h> 30#include <sys/param.h> 31#include <sys/time.h> 32#include <sys/sysmacros.h> 33#include <sys/mntent.h> 34#include <sys/u8_textprep.h> 35#include <sys/dsl_dataset.h> 36#include <sys/vfs.h> 37#include <sys/vnode.h> 38#include <sys/file.h> 39#include <sys/kmem.h> 40#include <sys/errno.h> 41#include <sys/atomic.h> 42#include <sys/zfs_dir.h> 43#include <sys/zfs_acl.h> 44#include <sys/zfs_ioctl.h> 45#include <sys/zfs_rlock.h> 46#include <sys/zfs_fuid.h> 47#include <sys/zfs_vnops.h> 48#include <sys/zfs_ctldir.h> 49#include <sys/dnode.h> 50#include <sys/fs/zfs.h> 51#include <sys/zpl.h> 52#endif /* _KERNEL */ 53 54#include <sys/dmu.h> 55#include <sys/dmu_objset.h> 56#include <sys/dmu_tx.h> 57#include <sys/zfs_refcount.h> 58#include <sys/stat.h> 59#include <sys/zap.h> 60#include <sys/zfs_znode.h> 61#include <sys/sa.h> 62#include <sys/zfs_sa.h> 63#include <sys/zfs_stat.h> 64 65#include "zfs_prop.h" 66#include "zfs_comutil.h" 67 68/* 69 * Functions needed for userland (ie: libzpool) are not put under 70 * #ifdef_KERNEL; the rest of the functions have dependencies 71 * (such as VFS logic) that will not compile easily in userland. 72 */ 73#ifdef _KERNEL 74 75static kmem_cache_t *znode_cache = NULL; 76static kmem_cache_t *znode_hold_cache = NULL; 77unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; 78 79/* 80 * This is used by the test suite so that it can delay znodes from being 81 * freed in order to inspect the unlinked set. 82 */ 83int zfs_unlink_suspend_progress = 0; 84 85/* 86 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on 87 * z_rangelock. It will modify the offset and length of the lock to reflect 88 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is 89 * called with the rangelock_t's rl_lock held, which avoids races. 90 */ 91static void 92zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) 93{ 94 znode_t *zp = arg; 95 96 /* 97 * If in append mode, convert to writer and lock starting at the 98 * current end of file. 99 */ 100 if (new->lr_type == RL_APPEND) { 101 new->lr_offset = zp->z_size; 102 new->lr_type = RL_WRITER; 103 } 104 105 /* 106 * If we need to grow the block size then lock the whole file range. 
107 */ 108 uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); 109 if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || 110 zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { 111 new->lr_offset = 0; 112 new->lr_length = UINT64_MAX; 113 } 114} 115 116/*ARGSUSED*/ 117static int 118zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 119{ 120 znode_t *zp = buf; 121 122 inode_init_once(ZTOI(zp)); 123 list_link_init(&zp->z_link_node); 124 125 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 126 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 127 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); 128 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 129 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); 130 131 zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); 132 133 zp->z_dirlocks = NULL; 134 zp->z_acl_cached = NULL; 135 zp->z_xattr_cached = NULL; 136 zp->z_xattr_parent = 0; 137 return (0); 138} 139 140/*ARGSUSED*/ 141static void 142zfs_znode_cache_destructor(void *buf, void *arg) 143{ 144 znode_t *zp = buf; 145 146 ASSERT(!list_link_active(&zp->z_link_node)); 147 mutex_destroy(&zp->z_lock); 148 rw_destroy(&zp->z_parent_lock); 149 rw_destroy(&zp->z_name_lock); 150 mutex_destroy(&zp->z_acl_lock); 151 rw_destroy(&zp->z_xattr_lock); 152 zfs_rangelock_fini(&zp->z_rangelock); 153 154 ASSERT(zp->z_dirlocks == NULL); 155 ASSERT(zp->z_acl_cached == NULL); 156 ASSERT(zp->z_xattr_cached == NULL); 157} 158 159static int 160zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) 161{ 162 znode_hold_t *zh = buf; 163 164 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); 165 zfs_refcount_create(&zh->zh_refcount); 166 zh->zh_obj = ZFS_NO_OBJECT; 167 168 return (0); 169} 170 171static void 172zfs_znode_hold_cache_destructor(void *buf, void *arg) 173{ 174 znode_hold_t *zh = buf; 175 176 mutex_destroy(&zh->zh_lock); 177 zfs_refcount_destroy(&zh->zh_refcount); 178} 179 180void 181zfs_znode_init(void) 182{ 183 /* 184 * Initialize zcache. The KMC_SLAB hint is used in order that it be 185 * backed by kmalloc() when on the Linux slab in order that any 186 * wait_on_bit() operations on the related inode operate properly. 187 */ 188 ASSERT(znode_cache == NULL); 189 znode_cache = kmem_cache_create("zfs_znode_cache", 190 sizeof (znode_t), 0, zfs_znode_cache_constructor, 191 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); 192 193 ASSERT(znode_hold_cache == NULL); 194 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", 195 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, 196 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); 197} 198 199void 200zfs_znode_fini(void) 201{ 202 /* 203 * Cleanup zcache 204 */ 205 if (znode_cache) 206 kmem_cache_destroy(znode_cache); 207 znode_cache = NULL; 208 209 if (znode_hold_cache) 210 kmem_cache_destroy(znode_hold_cache); 211 znode_hold_cache = NULL; 212} 213 214/* 215 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to 216 * serialize access to a znode and its SA buffer while the object is being 217 * created or destroyed. This kind of locking would normally reside in the 218 * znode itself but in this case that's impossible because the znode and SA 219 * buffer may not yet exist. Therefore the locking is handled externally 220 * with an array of mutexes and AVLs trees which contain per-object locks. 221 * 222 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted 223 * in to the correct AVL tree and finally the per-object lock is held. 
In 224 * zfs_znode_hold_exit() the process is reversed. The per-object lock is 225 * released, removed from the AVL tree and destroyed if there are no waiters. 226 * 227 * This scheme has two important properties: 228 * 229 * 1) No memory allocations are performed while holding one of the z_hold_locks. 230 * This ensures evict(), which can be called from direct memory reclaim, will 231 * never block waiting on a z_hold_locks which just happens to have hashed 232 * to the same index. 233 * 234 * 2) All locks used to serialize access to an object are per-object and never 235 * shared. This minimizes lock contention without creating a large number 236 * of dedicated locks. 237 * 238 * On the downside it does require znode_lock_t structures to be frequently 239 * allocated and freed. However, because these are backed by a kmem cache 240 * and very short lived this cost is minimal. 241 */ 242int 243zfs_znode_hold_compare(const void *a, const void *b) 244{ 245 const znode_hold_t *zh_a = (const znode_hold_t *)a; 246 const znode_hold_t *zh_b = (const znode_hold_t *)b; 247 248 return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); 249} 250 251static boolean_t __maybe_unused 252zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) 253{ 254 znode_hold_t *zh, search; 255 int i = ZFS_OBJ_HASH(zfsvfs, obj); 256 boolean_t held; 257 258 search.zh_obj = obj; 259 260 mutex_enter(&zfsvfs->z_hold_locks[i]); 261 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); 262 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; 263 mutex_exit(&zfsvfs->z_hold_locks[i]); 264 265 return (held); 266} 267 268static znode_hold_t * 269zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) 270{ 271 znode_hold_t *zh, *zh_new, search; 272 int i = ZFS_OBJ_HASH(zfsvfs, obj); 273 boolean_t found = B_FALSE; 274 275 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); 276 zh_new->zh_obj = obj; 277 search.zh_obj = obj; 278 279 mutex_enter(&zfsvfs->z_hold_locks[i]); 280 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); 281 if (likely(zh == NULL)) { 282 zh = zh_new; 283 avl_add(&zfsvfs->z_hold_trees[i], zh); 284 } else { 285 ASSERT3U(zh->zh_obj, ==, obj); 286 found = B_TRUE; 287 } 288 zfs_refcount_add(&zh->zh_refcount, NULL); 289 mutex_exit(&zfsvfs->z_hold_locks[i]); 290 291 if (found == B_TRUE) 292 kmem_cache_free(znode_hold_cache, zh_new); 293 294 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); 295 ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); 296 mutex_enter(&zh->zh_lock); 297 298 return (zh); 299} 300 301static void 302zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) 303{ 304 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); 305 boolean_t remove = B_FALSE; 306 307 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); 308 ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); 309 mutex_exit(&zh->zh_lock); 310 311 mutex_enter(&zfsvfs->z_hold_locks[i]); 312 if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) { 313 avl_remove(&zfsvfs->z_hold_trees[i], zh); 314 remove = B_TRUE; 315 } 316 mutex_exit(&zfsvfs->z_hold_locks[i]); 317 318 if (remove == B_TRUE) 319 kmem_cache_free(znode_hold_cache, zh); 320} 321 322dev_t 323zfs_cmpldev(uint64_t dev) 324{ 325 return (dev); 326} 327 328static void 329zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, 330 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) 331{ 332 ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); 333 334 mutex_enter(&zp->z_lock); 335 336 ASSERT(zp->z_sa_hdl == NULL); 337 ASSERT(zp->z_acl_cached == NULL); 338 if (sa_hdl == NULL) { 339 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, 
zp, 340 SA_HDL_SHARED, &zp->z_sa_hdl)); 341 } else { 342 zp->z_sa_hdl = sa_hdl; 343 sa_set_userp(sa_hdl, zp); 344 } 345 346 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; 347 348 mutex_exit(&zp->z_lock); 349} 350 351void 352zfs_znode_dmu_fini(znode_t *zp) 353{ 354 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || 355 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); 356 357 sa_handle_destroy(zp->z_sa_hdl); 358 zp->z_sa_hdl = NULL; 359} 360 361/* 362 * Called by new_inode() to allocate a new inode. 363 */ 364int 365zfs_inode_alloc(struct super_block *sb, struct inode **ip) 366{ 367 znode_t *zp; 368 369 zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 370 *ip = ZTOI(zp); 371 372 return (0); 373} 374 375/* 376 * Called in multiple places when an inode should be destroyed. 377 */ 378void 379zfs_inode_destroy(struct inode *ip) 380{ 381 znode_t *zp = ITOZ(ip); 382 zfsvfs_t *zfsvfs = ZTOZSB(zp); 383 384 mutex_enter(&zfsvfs->z_znodes_lock); 385 if (list_link_active(&zp->z_link_node)) { 386 list_remove(&zfsvfs->z_all_znodes, zp); 387 zfsvfs->z_nr_znodes--; 388 } 389 mutex_exit(&zfsvfs->z_znodes_lock); 390 391 if (zp->z_acl_cached) { 392 zfs_acl_free(zp->z_acl_cached); 393 zp->z_acl_cached = NULL; 394 } 395 396 if (zp->z_xattr_cached) { 397 nvlist_free(zp->z_xattr_cached); 398 zp->z_xattr_cached = NULL; 399 } 400 401 kmem_cache_free(znode_cache, zp); 402} 403 404static void 405zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) 406{ 407 uint64_t rdev = 0; 408 409 switch (ip->i_mode & S_IFMT) { 410 case S_IFREG: 411 ip->i_op = &zpl_inode_operations; 412 ip->i_fop = &zpl_file_operations; 413 ip->i_mapping->a_ops = &zpl_address_space_operations; 414 break; 415 416 case S_IFDIR: 417 ip->i_op = &zpl_dir_inode_operations; 418 ip->i_fop = &zpl_dir_file_operations; 419 ITOZ(ip)->z_zn_prefetch = B_TRUE; 420 break; 421 422 case S_IFLNK: 423 ip->i_op = &zpl_symlink_inode_operations; 424 break; 425 426 /* 427 * rdev is only stored in a SA only for device files. 428 */ 429 case S_IFCHR: 430 case S_IFBLK: 431 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, 432 sizeof (rdev)); 433 /*FALLTHROUGH*/ 434 case S_IFIFO: 435 case S_IFSOCK: 436 init_special_inode(ip, ip->i_mode, rdev); 437 ip->i_op = &zpl_special_inode_operations; 438 break; 439 440 default: 441 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", 442 (u_longlong_t)ip->i_ino, ip->i_mode); 443 444 /* Assume the inode is a file and attempt to continue */ 445 ip->i_mode = S_IFREG | 0644; 446 ip->i_op = &zpl_inode_operations; 447 ip->i_fop = &zpl_file_operations; 448 ip->i_mapping->a_ops = &zpl_address_space_operations; 449 break; 450 } 451} 452 453static void 454zfs_set_inode_flags(znode_t *zp, struct inode *ip) 455{ 456 /* 457 * Linux and Solaris have different sets of file attributes, so we 458 * restrict this conversion to the intersection of the two. 459 */ 460#ifdef HAVE_INODE_SET_FLAGS 461 unsigned int flags = 0; 462 if (zp->z_pflags & ZFS_IMMUTABLE) 463 flags |= S_IMMUTABLE; 464 if (zp->z_pflags & ZFS_APPENDONLY) 465 flags |= S_APPEND; 466 467 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); 468#else 469 if (zp->z_pflags & ZFS_IMMUTABLE) 470 ip->i_flags |= S_IMMUTABLE; 471 else 472 ip->i_flags &= ~S_IMMUTABLE; 473 474 if (zp->z_pflags & ZFS_APPENDONLY) 475 ip->i_flags |= S_APPEND; 476 else 477 ip->i_flags &= ~S_APPEND; 478#endif 479} 480 481/* 482 * Update the embedded inode given the znode. 
483 */ 484void 485zfs_znode_update_vfs(znode_t *zp) 486{ 487 zfsvfs_t *zfsvfs; 488 struct inode *ip; 489 uint32_t blksize; 490 u_longlong_t i_blocks; 491 492 ASSERT(zp != NULL); 493 zfsvfs = ZTOZSB(zp); 494 ip = ZTOI(zp); 495 496 /* Skip .zfs control nodes which do not exist on disk. */ 497 if (zfsctl_is_node(ip)) 498 return; 499 500 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); 501 502 spin_lock(&ip->i_lock); 503 ip->i_mode = zp->z_mode; 504 ip->i_blocks = i_blocks; 505 i_size_write(ip, zp->z_size); 506 spin_unlock(&ip->i_lock); 507} 508 509 510/* 511 * Construct a znode+inode and initialize. 512 * 513 * This does not do a call to dmu_set_user() that is 514 * up to the caller to do, in case you don't want to 515 * return the znode 516 */ 517static znode_t * 518zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, 519 dmu_object_type_t obj_type, sa_handle_t *hdl) 520{ 521 znode_t *zp; 522 struct inode *ip; 523 uint64_t mode; 524 uint64_t parent; 525 uint64_t tmp_gen; 526 uint64_t links; 527 uint64_t z_uid, z_gid; 528 uint64_t atime[2], mtime[2], ctime[2]; 529 uint64_t projid = ZFS_DEFAULT_PROJID; 530 sa_bulk_attr_t bulk[11]; 531 int count = 0; 532 533 ASSERT(zfsvfs != NULL); 534 535 ip = new_inode(zfsvfs->z_sb); 536 if (ip == NULL) 537 return (NULL); 538 539 zp = ITOZ(ip); 540 ASSERT(zp->z_dirlocks == NULL); 541 ASSERT3P(zp->z_acl_cached, ==, NULL); 542 ASSERT3P(zp->z_xattr_cached, ==, NULL); 543 zp->z_unlinked = B_FALSE; 544 zp->z_atime_dirty = B_FALSE; 545 zp->z_is_mapped = B_FALSE; 546 zp->z_is_ctldir = B_FALSE; 547 zp->z_is_stale = B_FALSE; 548 zp->z_suspended = B_FALSE; 549 zp->z_sa_hdl = NULL; 550 zp->z_mapcnt = 0; 551 zp->z_id = db->db_object; 552 zp->z_blksz = blksz; 553 zp->z_seq = 0x7A4653; 554 zp->z_sync_cnt = 0; 555 556 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); 557 558 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 559 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); 560 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 561 &zp->z_size, 8); 562 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); 563 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 564 &zp->z_pflags, 8); 565 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, 566 &parent, 8); 567 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); 568 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); 569 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 570 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 571 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 572 573 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || 574 (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 575 (zp->z_pflags & ZFS_PROJID) && 576 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { 577 if (hdl == NULL) 578 sa_handle_destroy(zp->z_sa_hdl); 579 zp->z_sa_hdl = NULL; 580 goto error; 581 } 582 583 zp->z_projid = projid; 584 zp->z_mode = ip->i_mode = mode; 585 ip->i_generation = (uint32_t)tmp_gen; 586 ip->i_blkbits = SPA_MINBLOCKSHIFT; 587 set_nlink(ip, (uint32_t)links); 588 zfs_uid_write(ip, z_uid); 589 zfs_gid_write(ip, z_gid); 590 zfs_set_inode_flags(zp, ip); 591 592 /* Cache the xattr parent id */ 593 if (zp->z_pflags & ZFS_XATTR) 594 zp->z_xattr_parent = parent; 595 596 ZFS_TIME_DECODE(&ip->i_atime, atime); 597 ZFS_TIME_DECODE(&ip->i_mtime, mtime); 598 ZFS_TIME_DECODE(&ip->i_ctime, ctime); 599 600 
ip->i_ino = zp->z_id; 601 zfs_znode_update_vfs(zp); 602 zfs_inode_set_ops(zfsvfs, ip); 603 604 /* 605 * The only way insert_inode_locked() can fail is if the ip->i_ino 606 * number is already hashed for this super block. This can never 607 * happen because the inode numbers map 1:1 with the object numbers. 608 * 609 * Exceptions include rolling back a mounted file system, either 610 * from the zfs rollback or zfs recv command. 611 * 612 * Active inodes are unhashed during the rollback, but since zrele 613 * can happen asynchronously, we can't guarantee they've been 614 * unhashed. This can cause hash collisions in unlinked drain 615 * processing so do not hash unlinked znodes. 616 */ 617 if (links > 0) 618 VERIFY3S(insert_inode_locked(ip), ==, 0); 619 620 mutex_enter(&zfsvfs->z_znodes_lock); 621 list_insert_tail(&zfsvfs->z_all_znodes, zp); 622 zfsvfs->z_nr_znodes++; 623 mutex_exit(&zfsvfs->z_znodes_lock); 624 625 if (links > 0) 626 unlock_new_inode(ip); 627 return (zp); 628 629error: 630 iput(ip); 631 return (NULL); 632} 633 634/* 635 * Safely mark an inode dirty. Inodes which are part of a read-only 636 * file system or snapshot may not be dirtied. 637 */ 638void 639zfs_mark_inode_dirty(struct inode *ip) 640{ 641 zfsvfs_t *zfsvfs = ITOZSB(ip); 642 643 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 644 return; 645 646 mark_inode_dirty(ip); 647} 648 649static uint64_t empty_xattr; 650static uint64_t pad[4]; 651static zfs_acl_phys_t acl_phys; 652/* 653 * Create a new DMU object to hold a zfs znode. 654 * 655 * IN: dzp - parent directory for new znode 656 * vap - file attributes for new znode 657 * tx - dmu transaction id for zap operations 658 * cr - credentials of caller 659 * flag - flags: 660 * IS_ROOT_NODE - new object will be root 661 * IS_TMPFILE - new object is of O_TMPFILE 662 * IS_XATTR - new object is an attribute 663 * acl_ids - ACL related attributes 664 * 665 * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) 666 * 667 */ 668void 669zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 670 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) 671{ 672 uint64_t crtime[2], atime[2], mtime[2], ctime[2]; 673 uint64_t mode, size, links, parent, pflags; 674 uint64_t projid = ZFS_DEFAULT_PROJID; 675 uint64_t rdev = 0; 676 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 677 dmu_buf_t *db; 678 inode_timespec_t now; 679 uint64_t gen, obj; 680 int bonuslen; 681 int dnodesize; 682 sa_handle_t *sa_hdl; 683 dmu_object_type_t obj_type; 684 sa_bulk_attr_t *sa_attrs; 685 int cnt = 0; 686 zfs_acl_locator_cb_t locate = { 0 }; 687 znode_hold_t *zh; 688 689 if (zfsvfs->z_replay) { 690 obj = vap->va_nodeid; 691 now = vap->va_ctime; /* see zfs_replay_create() */ 692 gen = vap->va_nblocks; /* ditto */ 693 dnodesize = vap->va_fsid; /* ditto */ 694 } else { 695 obj = 0; 696 gethrestime(&now); 697 gen = dmu_tx_get_txg(tx); 698 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); 699 } 700 701 if (dnodesize == 0) 702 dnodesize = DNODE_MIN_SIZE; 703 704 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; 705 706 bonuslen = (obj_type == DMU_OT_SA) ? 707 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; 708 709 /* 710 * Create a new DMU object. 711 */ 712 /* 713 * There's currently no mechanism for pre-reading the blocks that will 714 * be needed to allocate a new object, so we accept the small chance 715 * that there will be an i/o error and we will fail one of the 716 * assertions below. 
717 */ 718 if (S_ISDIR(vap->va_mode)) { 719 if (zfsvfs->z_replay) { 720 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, 721 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 722 obj_type, bonuslen, dnodesize, tx)); 723 } else { 724 obj = zap_create_norm_dnsize(zfsvfs->z_os, 725 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 726 obj_type, bonuslen, dnodesize, tx); 727 } 728 } else { 729 if (zfsvfs->z_replay) { 730 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, 731 DMU_OT_PLAIN_FILE_CONTENTS, 0, 732 obj_type, bonuslen, dnodesize, tx)); 733 } else { 734 obj = dmu_object_alloc_dnsize(zfsvfs->z_os, 735 DMU_OT_PLAIN_FILE_CONTENTS, 0, 736 obj_type, bonuslen, dnodesize, tx); 737 } 738 } 739 740 zh = zfs_znode_hold_enter(zfsvfs, obj); 741 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); 742 743 /* 744 * If this is the root, fix up the half-initialized parent pointer 745 * to reference the just-allocated physical data area. 746 */ 747 if (flag & IS_ROOT_NODE) { 748 dzp->z_id = obj; 749 } 750 751 /* 752 * If parent is an xattr, so am I. 753 */ 754 if (dzp->z_pflags & ZFS_XATTR) { 755 flag |= IS_XATTR; 756 } 757 758 if (zfsvfs->z_use_fuids) 759 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 760 else 761 pflags = 0; 762 763 if (S_ISDIR(vap->va_mode)) { 764 size = 2; /* contents ("." and "..") */ 765 links = 2; 766 } else { 767 size = 0; 768 links = (flag & IS_TMPFILE) ? 0 : 1; 769 } 770 771 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) 772 rdev = vap->va_rdev; 773 774 parent = dzp->z_id; 775 mode = acl_ids->z_mode; 776 if (flag & IS_XATTR) 777 pflags |= ZFS_XATTR; 778 779 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { 780 /* 781 * With ZFS_PROJID flag, we can easily know whether there is 782 * project ID stored on disk or not. See zfs_space_delta_cb(). 783 */ 784 if (obj_type != DMU_OT_ZNODE && 785 dmu_objset_projectquota_enabled(zfsvfs->z_os)) 786 pflags |= ZFS_PROJID; 787 788 /* 789 * Inherit project ID from parent if required. 790 */ 791 projid = zfs_inherit_projid(dzp); 792 if (dzp->z_pflags & ZFS_PROJINHERIT) 793 pflags |= ZFS_PROJINHERIT; 794 } 795 796 /* 797 * No execs denied will be determined when zfs_mode_compute() is called. 798 */ 799 pflags |= acl_ids->z_aclp->z_hints & 800 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| 801 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); 802 803 ZFS_TIME_ENCODE(&now, crtime); 804 ZFS_TIME_ENCODE(&now, ctime); 805 806 if (vap->va_mask & ATTR_ATIME) { 807 ZFS_TIME_ENCODE(&vap->va_atime, atime); 808 } else { 809 ZFS_TIME_ENCODE(&now, atime); 810 } 811 812 if (vap->va_mask & ATTR_MTIME) { 813 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 814 } else { 815 ZFS_TIME_ENCODE(&now, mtime); 816 } 817 818 /* Now add in all of the "SA" attributes */ 819 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, 820 &sa_hdl)); 821 822 /* 823 * Setup the array of attributes to be replaced/set on the new file 824 * 825 * order for DMU_OT_ZNODE is critical since it needs to be constructed 826 * in the old znode_phys_t format. 
Don't change this ordering 827 */ 828 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); 829 830 if (obj_type == DMU_OT_ZNODE) { 831 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 832 NULL, &atime, 16); 833 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 834 NULL, &mtime, 16); 835 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 836 NULL, &ctime, 16); 837 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 838 NULL, &crtime, 16); 839 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 840 NULL, &gen, 8); 841 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 842 NULL, &mode, 8); 843 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 844 NULL, &size, 8); 845 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 846 NULL, &parent, 8); 847 } else { 848 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 849 NULL, &mode, 8); 850 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 851 NULL, &size, 8); 852 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 853 NULL, &gen, 8); 854 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), 855 NULL, &acl_ids->z_fuid, 8); 856 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), 857 NULL, &acl_ids->z_fgid, 8); 858 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 859 NULL, &parent, 8); 860 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 861 NULL, &pflags, 8); 862 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 863 NULL, &atime, 16); 864 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 865 NULL, &mtime, 16); 866 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 867 NULL, &ctime, 16); 868 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 869 NULL, &crtime, 16); 870 } 871 872 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); 873 874 if (obj_type == DMU_OT_ZNODE) { 875 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, 876 &empty_xattr, 8); 877 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 878 pflags & ZFS_PROJID) { 879 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), 880 NULL, &projid, 8); 881 } 882 if (obj_type == DMU_OT_ZNODE || 883 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { 884 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), 885 NULL, &rdev, 8); 886 } 887 if (obj_type == DMU_OT_ZNODE) { 888 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 889 NULL, &pflags, 8); 890 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, 891 &acl_ids->z_fuid, 8); 892 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, 893 &acl_ids->z_fgid, 8); 894 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, 895 sizeof (uint64_t) * 4); 896 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, 897 &acl_phys, sizeof (zfs_acl_phys_t)); 898 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { 899 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, 900 &acl_ids->z_aclp->z_acl_count, 8); 901 locate.cb_aclp = acl_ids->z_aclp; 902 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), 903 zfs_acl_data_locator, &locate, 904 acl_ids->z_aclp->z_acl_bytes); 905 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, 906 acl_ids->z_fuid, acl_ids->z_fgid); 907 } 908 909 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); 910 911 if (!(flag & IS_ROOT_NODE)) { 912 /* 913 * The call to zfs_znode_alloc() may fail if memory is low 914 * via the call path: alloc_inode() -> inode_init_always() -> 915 * security_inode_alloc() -> inode_alloc_security(). 
Since 916 * the existing code is written such that zfs_mknode() can 917 * not fail retry until sufficient memory has been reclaimed. 918 */ 919 do { 920 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); 921 } while (*zpp == NULL); 922 923 VERIFY(*zpp != NULL); 924 VERIFY(dzp != NULL); 925 } else { 926 /* 927 * If we are creating the root node, the "parent" we 928 * passed in is the znode for the root. 929 */ 930 *zpp = dzp; 931 932 (*zpp)->z_sa_hdl = sa_hdl; 933 } 934 935 (*zpp)->z_pflags = pflags; 936 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; 937 (*zpp)->z_dnodesize = dnodesize; 938 (*zpp)->z_projid = projid; 939 940 if (obj_type == DMU_OT_ZNODE || 941 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { 942 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); 943 } 944 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); 945 zfs_znode_hold_exit(zfsvfs, zh); 946} 947 948/* 949 * Update in-core attributes. It is assumed the caller will be doing an 950 * sa_bulk_update to push the changes out. 951 */ 952void 953zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) 954{ 955 xoptattr_t *xoap; 956 boolean_t update_inode = B_FALSE; 957 958 xoap = xva_getxoptattr(xvap); 959 ASSERT(xoap); 960 961 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 962 uint64_t times[2]; 963 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); 964 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), 965 ×, sizeof (times), tx); 966 XVA_SET_RTN(xvap, XAT_CREATETIME); 967 } 968 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 969 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, 970 zp->z_pflags, tx); 971 XVA_SET_RTN(xvap, XAT_READONLY); 972 } 973 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 974 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, 975 zp->z_pflags, tx); 976 XVA_SET_RTN(xvap, XAT_HIDDEN); 977 } 978 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 979 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, 980 zp->z_pflags, tx); 981 XVA_SET_RTN(xvap, XAT_SYSTEM); 982 } 983 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 984 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, 985 zp->z_pflags, tx); 986 XVA_SET_RTN(xvap, XAT_ARCHIVE); 987 } 988 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 989 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, 990 zp->z_pflags, tx); 991 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 992 993 update_inode = B_TRUE; 994 } 995 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 996 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, 997 zp->z_pflags, tx); 998 XVA_SET_RTN(xvap, XAT_NOUNLINK); 999 } 1000 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 1001 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, 1002 zp->z_pflags, tx); 1003 XVA_SET_RTN(xvap, XAT_APPENDONLY); 1004 1005 update_inode = B_TRUE; 1006 } 1007 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 1008 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, 1009 zp->z_pflags, tx); 1010 XVA_SET_RTN(xvap, XAT_NODUMP); 1011 } 1012 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 1013 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, 1014 zp->z_pflags, tx); 1015 XVA_SET_RTN(xvap, XAT_OPAQUE); 1016 } 1017 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 1018 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 1019 xoap->xoa_av_quarantined, zp->z_pflags, tx); 1020 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 1021 } 1022 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 1023 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, 1024 zp->z_pflags, tx); 1025 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 1026 } 1027 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 1028 zfs_sa_set_scanstamp(zp, xvap, tx); 1029 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 1030 } 1031 
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 1032 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, 1033 zp->z_pflags, tx); 1034 XVA_SET_RTN(xvap, XAT_REPARSE); 1035 } 1036 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 1037 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, 1038 zp->z_pflags, tx); 1039 XVA_SET_RTN(xvap, XAT_OFFLINE); 1040 } 1041 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 1042 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, 1043 zp->z_pflags, tx); 1044 XVA_SET_RTN(xvap, XAT_SPARSE); 1045 } 1046 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 1047 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, 1048 zp->z_pflags, tx); 1049 XVA_SET_RTN(xvap, XAT_PROJINHERIT); 1050 } 1051 1052 if (update_inode) 1053 zfs_set_inode_flags(zp, ZTOI(zp)); 1054} 1055 1056int 1057zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 1058{ 1059 dmu_object_info_t doi; 1060 dmu_buf_t *db; 1061 znode_t *zp; 1062 znode_hold_t *zh; 1063 int err; 1064 sa_handle_t *hdl; 1065 1066 *zpp = NULL; 1067 1068again: 1069 zh = zfs_znode_hold_enter(zfsvfs, obj_num); 1070 1071 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); 1072 if (err) { 1073 zfs_znode_hold_exit(zfsvfs, zh); 1074 return (err); 1075 } 1076 1077 dmu_object_info_from_db(db, &doi); 1078 if (doi.doi_bonus_type != DMU_OT_SA && 1079 (doi.doi_bonus_type != DMU_OT_ZNODE || 1080 (doi.doi_bonus_type == DMU_OT_ZNODE && 1081 doi.doi_bonus_size < sizeof (znode_phys_t)))) { 1082 sa_buf_rele(db, NULL); 1083 zfs_znode_hold_exit(zfsvfs, zh); 1084 return (SET_ERROR(EINVAL)); 1085 } 1086 1087 hdl = dmu_buf_get_user(db); 1088 if (hdl != NULL) { 1089 zp = sa_get_userdata(hdl); 1090 1091 1092 /* 1093 * Since "SA" does immediate eviction we 1094 * should never find a sa handle that doesn't 1095 * know about the znode. 1096 */ 1097 1098 ASSERT3P(zp, !=, NULL); 1099 1100 mutex_enter(&zp->z_lock); 1101 ASSERT3U(zp->z_id, ==, obj_num); 1102 /* 1103 * If zp->z_unlinked is set, the znode is already marked 1104 * for deletion and should not be discovered. Check this 1105 * after checking igrab() due to fsetxattr() & O_TMPFILE. 1106 * 1107 * If igrab() returns NULL the VFS has independently 1108 * determined the inode should be evicted and has 1109 * called iput_final() to start the eviction process. 1110 * The SA handle is still valid but because the VFS 1111 * requires that the eviction succeed we must drop 1112 * our locks and references to allow the eviction to 1113 * complete. The zfs_zget() may then be retried. 1114 * 1115 * This unlikely case could be optimized by registering 1116 * a sops->drop_inode() callback. The callback would 1117 * need to detect the active SA hold thereby informing 1118 * the VFS that this inode should not be evicted. 1119 */ 1120 if (igrab(ZTOI(zp)) == NULL) { 1121 if (zp->z_unlinked) 1122 err = SET_ERROR(ENOENT); 1123 else 1124 err = SET_ERROR(EAGAIN); 1125 } else { 1126 *zpp = zp; 1127 err = 0; 1128 } 1129 1130 mutex_exit(&zp->z_lock); 1131 sa_buf_rele(db, NULL); 1132 zfs_znode_hold_exit(zfsvfs, zh); 1133 1134 if (err == EAGAIN) { 1135 /* inode might need this to finish evict */ 1136 cond_resched(); 1137 goto again; 1138 } 1139 return (err); 1140 } 1141 1142 /* 1143 * Not found create new znode/vnode but only if file exists. 1144 * 1145 * There is a small window where zfs_vget() could 1146 * find this object while a file create is still in 1147 * progress. This is checked for in zfs_znode_alloc() 1148 * 1149 * if zfs_znode_alloc() fails it will drop the hold on the 1150 * bonus buffer. 
1151 */ 1152 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, 1153 doi.doi_bonus_type, NULL); 1154 if (zp == NULL) { 1155 err = SET_ERROR(ENOENT); 1156 } else { 1157 *zpp = zp; 1158 } 1159 zfs_znode_hold_exit(zfsvfs, zh); 1160 return (err); 1161} 1162 1163int 1164zfs_rezget(znode_t *zp) 1165{ 1166 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1167 dmu_object_info_t doi; 1168 dmu_buf_t *db; 1169 uint64_t obj_num = zp->z_id; 1170 uint64_t mode; 1171 uint64_t links; 1172 sa_bulk_attr_t bulk[10]; 1173 int err; 1174 int count = 0; 1175 uint64_t gen; 1176 uint64_t z_uid, z_gid; 1177 uint64_t atime[2], mtime[2], ctime[2]; 1178 uint64_t projid = ZFS_DEFAULT_PROJID; 1179 znode_hold_t *zh; 1180 1181 /* 1182 * skip ctldir, otherwise they will always get invalidated. This will 1183 * cause funny behaviour for the mounted snapdirs. Especially for 1184 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent 1185 * anyone automount it again as long as someone is still using the 1186 * detached mount. 1187 */ 1188 if (zp->z_is_ctldir) 1189 return (0); 1190 1191 zh = zfs_znode_hold_enter(zfsvfs, obj_num); 1192 1193 mutex_enter(&zp->z_acl_lock); 1194 if (zp->z_acl_cached) { 1195 zfs_acl_free(zp->z_acl_cached); 1196 zp->z_acl_cached = NULL; 1197 } 1198 mutex_exit(&zp->z_acl_lock); 1199 1200 rw_enter(&zp->z_xattr_lock, RW_WRITER); 1201 if (zp->z_xattr_cached) { 1202 nvlist_free(zp->z_xattr_cached); 1203 zp->z_xattr_cached = NULL; 1204 } 1205 rw_exit(&zp->z_xattr_lock); 1206 1207 ASSERT(zp->z_sa_hdl == NULL); 1208 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); 1209 if (err) { 1210 zfs_znode_hold_exit(zfsvfs, zh); 1211 return (err); 1212 } 1213 1214 dmu_object_info_from_db(db, &doi); 1215 if (doi.doi_bonus_type != DMU_OT_SA && 1216 (doi.doi_bonus_type != DMU_OT_ZNODE || 1217 (doi.doi_bonus_type == DMU_OT_ZNODE && 1218 doi.doi_bonus_size < sizeof (znode_phys_t)))) { 1219 sa_buf_rele(db, NULL); 1220 zfs_znode_hold_exit(zfsvfs, zh); 1221 return (SET_ERROR(EINVAL)); 1222 } 1223 1224 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); 1225 1226 /* reload cached values */ 1227 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, 1228 &gen, sizeof (gen)); 1229 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 1230 &zp->z_size, sizeof (zp->z_size)); 1231 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, 1232 &links, sizeof (links)); 1233 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 1234 &zp->z_pflags, sizeof (zp->z_pflags)); 1235 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1236 &z_uid, sizeof (z_uid)); 1237 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1238 &z_gid, sizeof (z_gid)); 1239 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 1240 &mode, sizeof (mode)); 1241 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 1242 &atime, 16); 1243 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 1244 &mtime, 16); 1245 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 1246 &ctime, 16); 1247 1248 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { 1249 zfs_znode_dmu_fini(zp); 1250 zfs_znode_hold_exit(zfsvfs, zh); 1251 return (SET_ERROR(EIO)); 1252 } 1253 1254 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { 1255 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), 1256 &projid, 8); 1257 if (err != 0 && err != ENOENT) { 1258 zfs_znode_dmu_fini(zp); 1259 zfs_znode_hold_exit(zfsvfs, zh); 1260 return (SET_ERROR(err)); 1261 } 1262 } 1263 1264 zp->z_projid = projid; 1265 zp->z_mode = ZTOI(zp)->i_mode = mode; 1266 
zfs_uid_write(ZTOI(zp), z_uid); 1267 zfs_gid_write(ZTOI(zp), z_gid); 1268 1269 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); 1270 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); 1271 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); 1272 1273 if ((uint32_t)gen != ZTOI(zp)->i_generation) { 1274 zfs_znode_dmu_fini(zp); 1275 zfs_znode_hold_exit(zfsvfs, zh); 1276 return (SET_ERROR(EIO)); 1277 } 1278 1279 set_nlink(ZTOI(zp), (uint32_t)links); 1280 zfs_set_inode_flags(zp, ZTOI(zp)); 1281 1282 zp->z_blksz = doi.doi_data_block_size; 1283 zp->z_atime_dirty = B_FALSE; 1284 zfs_znode_update_vfs(zp); 1285 1286 /* 1287 * If the file has zero links, then it has been unlinked on the send 1288 * side and it must be in the received unlinked set. 1289 * We call zfs_znode_dmu_fini() now to prevent any accesses to the 1290 * stale data and to prevent automatic removal of the file in 1291 * zfs_zinactive(). The file will be removed either when it is removed 1292 * on the send side and the next incremental stream is received or 1293 * when the unlinked set gets processed. 1294 */ 1295 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); 1296 if (zp->z_unlinked) 1297 zfs_znode_dmu_fini(zp); 1298 1299 zfs_znode_hold_exit(zfsvfs, zh); 1300 1301 return (0); 1302} 1303 1304void 1305zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 1306{ 1307 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1308 objset_t *os = zfsvfs->z_os; 1309 uint64_t obj = zp->z_id; 1310 uint64_t acl_obj = zfs_external_acl(zp); 1311 znode_hold_t *zh; 1312 1313 zh = zfs_znode_hold_enter(zfsvfs, obj); 1314 if (acl_obj) { 1315 VERIFY(!zp->z_is_sa); 1316 VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1317 } 1318 VERIFY(0 == dmu_object_free(os, obj, tx)); 1319 zfs_znode_dmu_fini(zp); 1320 zfs_znode_hold_exit(zfsvfs, zh); 1321} 1322 1323void 1324zfs_zinactive(znode_t *zp) 1325{ 1326 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1327 uint64_t z_id = zp->z_id; 1328 znode_hold_t *zh; 1329 1330 ASSERT(zp->z_sa_hdl); 1331 1332 /* 1333 * Don't allow a zfs_zget() while were trying to release this znode. 1334 */ 1335 zh = zfs_znode_hold_enter(zfsvfs, z_id); 1336 1337 mutex_enter(&zp->z_lock); 1338 1339 /* 1340 * If this was the last reference to a file with no links, remove 1341 * the file from the file system unless the file system is mounted 1342 * read-only. That can happen, for example, if the file system was 1343 * originally read-write, the file was opened, then unlinked and 1344 * the file system was made read-only before the file was finally 1345 * closed. The file will remain in the unlinked set. 1346 */ 1347 if (zp->z_unlinked) { 1348 ASSERT(!zfsvfs->z_issnap); 1349 if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { 1350 mutex_exit(&zp->z_lock); 1351 zfs_znode_hold_exit(zfsvfs, zh); 1352 zfs_rmnode(zp); 1353 return; 1354 } 1355 } 1356 1357 mutex_exit(&zp->z_lock); 1358 zfs_znode_dmu_fini(zp); 1359 1360 zfs_znode_hold_exit(zfsvfs, zh); 1361} 1362 1363#if defined(HAVE_INODE_TIMESPEC64_TIMES) 1364#define zfs_compare_timespec timespec64_compare 1365#else 1366#define zfs_compare_timespec timespec_compare 1367#endif 1368 1369/* 1370 * Determine whether the znode's atime must be updated. The logic mostly 1371 * duplicates the Linux kernel's relatime_need_update() functionality. 1372 * This function is only called if the underlying filesystem actually has 1373 * atime updates enabled. 
1374 */ 1375boolean_t 1376zfs_relatime_need_update(const struct inode *ip) 1377{ 1378 inode_timespec_t now; 1379 1380 gethrestime(&now); 1381 /* 1382 * In relatime mode, only update the atime if the previous atime 1383 * is earlier than either the ctime or mtime or if at least a day 1384 * has passed since the last update of atime. 1385 */ 1386 if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) 1387 return (B_TRUE); 1388 1389 if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) 1390 return (B_TRUE); 1391 1392 if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) 1393 return (B_TRUE); 1394 1395 return (B_FALSE); 1396} 1397 1398/* 1399 * Prepare to update znode time stamps. 1400 * 1401 * IN: zp - znode requiring timestamp update 1402 * flag - ATTR_MTIME, ATTR_CTIME flags 1403 * 1404 * OUT: zp - z_seq 1405 * mtime - new mtime 1406 * ctime - new ctime 1407 * 1408 * Note: We don't update atime here, because we rely on Linux VFS to do 1409 * atime updating. 1410 */ 1411void 1412zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], 1413 uint64_t ctime[2]) 1414{ 1415 inode_timespec_t now; 1416 1417 gethrestime(&now); 1418 1419 zp->z_seq++; 1420 1421 if (flag & ATTR_MTIME) { 1422 ZFS_TIME_ENCODE(&now, mtime); 1423 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); 1424 if (ZTOZSB(zp)->z_use_fuids) { 1425 zp->z_pflags |= (ZFS_ARCHIVE | 1426 ZFS_AV_MODIFIED); 1427 } 1428 } 1429 1430 if (flag & ATTR_CTIME) { 1431 ZFS_TIME_ENCODE(&now, ctime); 1432 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); 1433 if (ZTOZSB(zp)->z_use_fuids) 1434 zp->z_pflags |= ZFS_ARCHIVE; 1435 } 1436} 1437 1438/* 1439 * Grow the block size for a file. 1440 * 1441 * IN: zp - znode of file to free data in. 1442 * size - requested block size 1443 * tx - open transaction. 1444 * 1445 * NOTE: this function assumes that the znode is write locked. 1446 */ 1447void 1448zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1449{ 1450 int error; 1451 u_longlong_t dummy; 1452 1453 if (size <= zp->z_blksz) 1454 return; 1455 /* 1456 * If the file size is already greater than the current blocksize, 1457 * we will not grow. If there is more than one block in a file, 1458 * the blocksize cannot change. 1459 */ 1460 if (zp->z_blksz && zp->z_size > zp->z_blksz) 1461 return; 1462 1463 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, 1464 size, 0, tx); 1465 1466 if (error == ENOTSUP) 1467 return; 1468 ASSERT0(error); 1469 1470 /* What blocksize did we actually get? */ 1471 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); 1472} 1473 1474/* 1475 * Increase the file length 1476 * 1477 * IN: zp - znode of file to free data in. 1478 * end - new end-of-file 1479 * 1480 * RETURN: 0 on success, error code on failure 1481 */ 1482static int 1483zfs_extend(znode_t *zp, uint64_t end) 1484{ 1485 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1486 dmu_tx_t *tx; 1487 zfs_locked_range_t *lr; 1488 uint64_t newblksz; 1489 int error; 1490 1491 /* 1492 * We will change zp_size, lock the whole file. 1493 */ 1494 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); 1495 1496 /* 1497 * Nothing to do if file already at desired length. 
1498 */ 1499 if (end <= zp->z_size) { 1500 zfs_rangelock_exit(lr); 1501 return (0); 1502 } 1503 tx = dmu_tx_create(zfsvfs->z_os); 1504 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1505 zfs_sa_upgrade_txholds(tx, zp); 1506 if (end > zp->z_blksz && 1507 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1508 /* 1509 * We are growing the file past the current block size. 1510 */ 1511 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { 1512 /* 1513 * File's blocksize is already larger than the 1514 * "recordsize" property. Only let it grow to 1515 * the next power of 2. 1516 */ 1517 ASSERT(!ISP2(zp->z_blksz)); 1518 newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); 1519 } else { 1520 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); 1521 } 1522 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1523 } else { 1524 newblksz = 0; 1525 } 1526 1527 error = dmu_tx_assign(tx, TXG_WAIT); 1528 if (error) { 1529 dmu_tx_abort(tx); 1530 zfs_rangelock_exit(lr); 1531 return (error); 1532 } 1533 1534 if (newblksz) 1535 zfs_grow_blocksize(zp, newblksz, tx); 1536 1537 zp->z_size = end; 1538 1539 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), 1540 &zp->z_size, sizeof (zp->z_size), tx)); 1541 1542 zfs_rangelock_exit(lr); 1543 1544 dmu_tx_commit(tx); 1545 1546 return (0); 1547} 1548 1549/* 1550 * zfs_zero_partial_page - Modeled after update_pages() but 1551 * with different arguments and semantics for use by zfs_freesp(). 1552 * 1553 * Zeroes a piece of a single page cache entry for zp at offset 1554 * start and length len. 1555 * 1556 * Caller must acquire a range lock on the file for the region 1557 * being zeroed in order that the ARC and page cache stay in sync. 1558 */ 1559static void 1560zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) 1561{ 1562 struct address_space *mp = ZTOI(zp)->i_mapping; 1563 struct page *pp; 1564 int64_t off; 1565 void *pb; 1566 1567 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); 1568 1569 off = start & (PAGE_SIZE - 1); 1570 start &= PAGE_MASK; 1571 1572 pp = find_lock_page(mp, start >> PAGE_SHIFT); 1573 if (pp) { 1574 if (mapping_writably_mapped(mp)) 1575 flush_dcache_page(pp); 1576 1577 pb = kmap(pp); 1578 bzero(pb + off, len); 1579 kunmap(pp); 1580 1581 if (mapping_writably_mapped(mp)) 1582 flush_dcache_page(pp); 1583 1584 mark_page_accessed(pp); 1585 SetPageUptodate(pp); 1586 ClearPageError(pp); 1587 unlock_page(pp); 1588 put_page(pp); 1589 } 1590} 1591 1592/* 1593 * Free space in a file. 1594 * 1595 * IN: zp - znode of file to free data in. 1596 * off - start of section to free. 1597 * len - length of section to free. 1598 * 1599 * RETURN: 0 on success, error code on failure 1600 */ 1601static int 1602zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1603{ 1604 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1605 zfs_locked_range_t *lr; 1606 int error; 1607 1608 /* 1609 * Lock the range being freed. 1610 */ 1611 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); 1612 1613 /* 1614 * Nothing to do if file already at desired length. 1615 */ 1616 if (off >= zp->z_size) { 1617 zfs_rangelock_exit(lr); 1618 return (0); 1619 } 1620 1621 if (off + len > zp->z_size) 1622 len = zp->z_size - off; 1623 1624 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1625 1626 /* 1627 * Zero partial page cache entries. This must be done under a 1628 * range lock in order to keep the ARC and page cache in sync. 
1629 */ 1630 if (zp->z_is_mapped) { 1631 loff_t first_page, last_page, page_len; 1632 loff_t first_page_offset, last_page_offset; 1633 1634 /* first possible full page in hole */ 1635 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; 1636 /* last page of hole */ 1637 last_page = (off + len) >> PAGE_SHIFT; 1638 1639 /* offset of first_page */ 1640 first_page_offset = first_page << PAGE_SHIFT; 1641 /* offset of last_page */ 1642 last_page_offset = last_page << PAGE_SHIFT; 1643 1644 /* truncate whole pages */ 1645 if (last_page_offset > first_page_offset) { 1646 truncate_inode_pages_range(ZTOI(zp)->i_mapping, 1647 first_page_offset, last_page_offset - 1); 1648 } 1649 1650 /* truncate sub-page ranges */ 1651 if (first_page > last_page) { 1652 /* entire punched area within a single page */ 1653 zfs_zero_partial_page(zp, off, len); 1654 } else { 1655 /* beginning of punched area at the end of a page */ 1656 page_len = first_page_offset - off; 1657 if (page_len > 0) 1658 zfs_zero_partial_page(zp, off, page_len); 1659 1660 /* end of punched area at the beginning of a page */ 1661 page_len = off + len - last_page_offset; 1662 if (page_len > 0) 1663 zfs_zero_partial_page(zp, last_page_offset, 1664 page_len); 1665 } 1666 } 1667 zfs_rangelock_exit(lr); 1668 1669 return (error); 1670} 1671 1672/* 1673 * Truncate a file 1674 * 1675 * IN: zp - znode of file to free data in. 1676 * end - new end-of-file. 1677 * 1678 * RETURN: 0 on success, error code on failure 1679 */ 1680static int 1681zfs_trunc(znode_t *zp, uint64_t end) 1682{ 1683 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1684 dmu_tx_t *tx; 1685 zfs_locked_range_t *lr; 1686 int error; 1687 sa_bulk_attr_t bulk[2]; 1688 int count = 0; 1689 1690 /* 1691 * We will change zp_size, lock the whole file. 1692 */ 1693 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); 1694 1695 /* 1696 * Nothing to do if file already at desired length. 1697 */ 1698 if (end >= zp->z_size) { 1699 zfs_rangelock_exit(lr); 1700 return (0); 1701 } 1702 1703 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, 1704 DMU_OBJECT_END); 1705 if (error) { 1706 zfs_rangelock_exit(lr); 1707 return (error); 1708 } 1709 tx = dmu_tx_create(zfsvfs->z_os); 1710 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1711 zfs_sa_upgrade_txholds(tx, zp); 1712 dmu_tx_mark_netfree(tx); 1713 error = dmu_tx_assign(tx, TXG_WAIT); 1714 if (error) { 1715 dmu_tx_abort(tx); 1716 zfs_rangelock_exit(lr); 1717 return (error); 1718 } 1719 1720 zp->z_size = end; 1721 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), 1722 NULL, &zp->z_size, sizeof (zp->z_size)); 1723 1724 if (end == 0) { 1725 zp->z_pflags &= ~ZFS_SPARSE; 1726 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1727 NULL, &zp->z_pflags, 8); 1728 } 1729 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); 1730 1731 dmu_tx_commit(tx); 1732 zfs_rangelock_exit(lr); 1733 1734 return (0); 1735} 1736 1737/* 1738 * Free space in a file 1739 * 1740 * IN: zp - znode of file to free data in. 1741 * off - start of range 1742 * len - end of range (0 => EOF) 1743 * flag - current file open mode flags. 
1744 * log - TRUE if this action should be logged 1745 * 1746 * RETURN: 0 on success, error code on failure 1747 */ 1748int 1749zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1750{ 1751 dmu_tx_t *tx; 1752 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1753 zilog_t *zilog = zfsvfs->z_log; 1754 uint64_t mode; 1755 uint64_t mtime[2], ctime[2]; 1756 sa_bulk_attr_t bulk[3]; 1757 int count = 0; 1758 int error; 1759 1760 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, 1761 sizeof (mode))) != 0) 1762 return (error); 1763 1764 if (off > zp->z_size) { 1765 error = zfs_extend(zp, off+len); 1766 if (error == 0 && log) 1767 goto log; 1768 goto out; 1769 } 1770 1771 if (len == 0) { 1772 error = zfs_trunc(zp, off); 1773 } else { 1774 if ((error = zfs_free_range(zp, off, len)) == 0 && 1775 off + len > zp->z_size) 1776 error = zfs_extend(zp, off+len); 1777 } 1778 if (error || !log) 1779 goto out; 1780log: 1781 tx = dmu_tx_create(zfsvfs->z_os); 1782 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1783 zfs_sa_upgrade_txholds(tx, zp); 1784 error = dmu_tx_assign(tx, TXG_WAIT); 1785 if (error) { 1786 dmu_tx_abort(tx); 1787 goto out; 1788 } 1789 1790 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); 1791 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); 1792 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1793 NULL, &zp->z_pflags, 8); 1794 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 1795 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1796 ASSERT(error == 0); 1797 1798 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1799 1800 dmu_tx_commit(tx); 1801 1802 zfs_znode_update_vfs(zp); 1803 error = 0; 1804 1805out: 1806 /* 1807 * Truncate the page cache - for file truncate operations, use 1808 * the purpose-built API for truncations. For punching operations, 1809 * the truncation is handled under a range lock in zfs_free_range. 1810 */ 1811 if (len == 0) 1812 truncate_setsize(ZTOI(zp), off); 1813 return (error); 1814} 1815 1816void 1817zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1818{ 1819 struct super_block *sb; 1820 zfsvfs_t *zfsvfs; 1821 uint64_t moid, obj, sa_obj, version; 1822 uint64_t sense = ZFS_CASE_SENSITIVE; 1823 uint64_t norm = 0; 1824 nvpair_t *elem; 1825 int size; 1826 int error; 1827 int i; 1828 znode_t *rootzp = NULL; 1829 vattr_t vattr; 1830 znode_t *zp; 1831 zfs_acl_ids_t acl_ids; 1832 1833 /* 1834 * First attempt to create master node. 1835 */ 1836 /* 1837 * In an empty objset, there are no blocks to read and thus 1838 * there can be no i/o errors (which we assert below). 1839 */ 1840 moid = MASTER_NODE_OBJ; 1841 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1842 DMU_OT_NONE, 0, tx); 1843 ASSERT(error == 0); 1844 1845 /* 1846 * Set starting attributes. 
1847 */ 1848 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); 1849 elem = NULL; 1850 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1851 /* For the moment we expect all zpl props to be uint64_ts */ 1852 uint64_t val; 1853 char *name; 1854 1855 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1856 VERIFY(nvpair_value_uint64(elem, &val) == 0); 1857 name = nvpair_name(elem); 1858 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1859 if (val < version) 1860 version = val; 1861 } else { 1862 error = zap_update(os, moid, name, 8, 1, &val, tx); 1863 } 1864 ASSERT(error == 0); 1865 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1866 norm = val; 1867 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1868 sense = val; 1869 } 1870 ASSERT(version != 0); 1871 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); 1872 1873 /* 1874 * Create zap object used for SA attribute registration 1875 */ 1876 1877 if (version >= ZPL_VERSION_SA) { 1878 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 1879 DMU_OT_NONE, 0, tx); 1880 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 1881 ASSERT(error == 0); 1882 } else { 1883 sa_obj = 0; 1884 } 1885 /* 1886 * Create a delete queue. 1887 */ 1888 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1889 1890 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); 1891 ASSERT(error == 0); 1892 1893 /* 1894 * Create root znode. Create minimal znode/inode/zfsvfs/sb 1895 * to allow zfs_mknode to work. 1896 */ 1897 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; 1898 vattr.va_mode = S_IFDIR|0755; 1899 vattr.va_uid = crgetuid(cr); 1900 vattr.va_gid = crgetgid(cr); 1901 1902 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1903 rootzp->z_unlinked = B_FALSE; 1904 rootzp->z_atime_dirty = B_FALSE; 1905 rootzp->z_is_sa = USE_SA(version, os); 1906 rootzp->z_pflags = 0; 1907 1908 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1909 zfsvfs->z_os = os; 1910 zfsvfs->z_parent = zfsvfs; 1911 zfsvfs->z_version = version; 1912 zfsvfs->z_use_fuids = USE_FUIDS(version, os); 1913 zfsvfs->z_use_sa = USE_SA(version, os); 1914 zfsvfs->z_norm = norm; 1915 1916 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); 1917 sb->s_fs_info = zfsvfs; 1918 1919 ZTOI(rootzp)->i_sb = sb; 1920 1921 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 1922 &zfsvfs->z_attr_table); 1923 1924 ASSERT(error == 0); 1925 1926 /* 1927 * Fold case on file systems that are always or sometimes case 1928 * insensitive. 
1929 */ 1930 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1931 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 1932 1933 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1934 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1935 offsetof(znode_t, z_link_node)); 1936 1937 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); 1938 zfsvfs->z_hold_size = size; 1939 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, 1940 KM_SLEEP); 1941 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); 1942 for (i = 0; i != size; i++) { 1943 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, 1944 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); 1945 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); 1946 } 1947 1948 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1949 cr, NULL, &acl_ids)); 1950 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); 1951 ASSERT3P(zp, ==, rootzp); 1952 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1953 ASSERT(error == 0); 1954 zfs_acl_ids_free(&acl_ids); 1955 1956 atomic_set(&ZTOI(rootzp)->i_count, 0); 1957 sa_handle_destroy(rootzp->z_sa_hdl); 1958 kmem_cache_free(znode_cache, rootzp); 1959 1960 for (i = 0; i != size; i++) { 1961 avl_destroy(&zfsvfs->z_hold_trees[i]); 1962 mutex_destroy(&zfsvfs->z_hold_locks[i]); 1963 } 1964 1965 mutex_destroy(&zfsvfs->z_znodes_lock); 1966 1967 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); 1968 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); 1969 kmem_free(sb, sizeof (struct super_block)); 1970 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1971} 1972#endif /* _KERNEL */ 1973 1974static int 1975zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) 1976{ 1977 uint64_t sa_obj = 0; 1978 int error; 1979 1980 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); 1981 if (error != 0 && error != ENOENT) 1982 return (error); 1983 1984 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); 1985 return (error); 1986} 1987 1988static int 1989zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, 1990 dmu_buf_t **db, void *tag) 1991{ 1992 dmu_object_info_t doi; 1993 int error; 1994 1995 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) 1996 return (error); 1997 1998 dmu_object_info_from_db(*db, &doi); 1999 if ((doi.doi_bonus_type != DMU_OT_SA && 2000 doi.doi_bonus_type != DMU_OT_ZNODE) || 2001 (doi.doi_bonus_type == DMU_OT_ZNODE && 2002 doi.doi_bonus_size < sizeof (znode_phys_t))) { 2003 sa_buf_rele(*db, tag); 2004 return (SET_ERROR(ENOTSUP)); 2005 } 2006 2007 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); 2008 if (error != 0) { 2009 sa_buf_rele(*db, tag); 2010 return (error); 2011 } 2012 2013 return (0); 2014} 2015 2016static void 2017zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) 2018{ 2019 sa_handle_destroy(hdl); 2020 sa_buf_rele(db, tag); 2021} 2022 2023/* 2024 * Given an object number, return its parent object number and whether 2025 * or not the object is an extended attribute directory. 
	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
#endif /* _KERNEL */

static int
zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
{
	uint64_t sa_obj = 0;
	int error;

	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
	if (error != 0 && error != ENOENT)
		return (error);

	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
	return (error);
}

static int
zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
    dmu_buf_t **db, void *tag)
{
	dmu_object_info_t doi;
	int error;

	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
		return (error);

	dmu_object_info_from_db(*db, &doi);
	if ((doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
		sa_buf_rele(*db, tag);
		return (SET_ERROR(ENOTSUP));
	}

	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
	if (error != 0) {
		sa_buf_rele(*db, tag);
		return (error);
	}

	return (0);
}

static void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}

/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
    uint64_t *pobjp, int *is_xattrdir)
{
	uint64_t parent;
	uint64_t pflags;
	uint64_t mode;
	uint64_t parent_mode;
	sa_bulk_attr_t bulk[3];
	sa_handle_t *sa_hdl;
	dmu_buf_t *sa_db;
	int count = 0;
	int error;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
	    &parent, sizeof (parent));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
	    &pflags, sizeof (pflags));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &mode, sizeof (mode));

	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
		return (error);

	/*
	 * When a link is removed its parent pointer is not updated and may
	 * become invalid.  There are two cases where a link is removed but
	 * the file stays around: when it goes to the delete queue and when
	 * additional links remain.
	 */
	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
	if (error != 0)
		return (error);

	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode,
	    sizeof (parent_mode));
	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	if (error != 0)
		return (error);

	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);

	/*
	 * An extended attribute directory may be attached to a file, a
	 * directory, etc., so its parent can be of any type.  Otherwise
	 * the parent must be a directory.
	 */
	if (!*is_xattrdir && !S_ISDIR(parent_mode))
		return (SET_ERROR(EINVAL));

	*pobjp = parent;

	return (0);
}

/*
 * Given an object number, return some zpl-level statistics
 */
static int
zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
    zfs_stat_t *sb)
{
	sa_bulk_attr_t bulk[4];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &sb->zs_mode, sizeof (sb->zs_mode));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
	    &sb->zs_gen, sizeof (sb->zs_gen));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
	    &sb->zs_links, sizeof (sb->zs_links));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
	    &sb->zs_ctime, sizeof (sb->zs_ctime));

	return (sa_bulk_lookup(hdl, bulk, count));
}

static int
zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
    sa_attr_type_t *sa_table, char *buf, int len)
{
	sa_handle_t *sa_hdl;
	sa_handle_t *prevhdl = NULL;
	dmu_buf_t *prevdb = NULL;
	dmu_buf_t *sa_db = NULL;
	char *path = buf + len - 1;
	int error;

	*path = '\0';
	sa_hdl = hdl;

	/*
	 * Objects on the unlinked (delete) set no longer have a name;
	 * report them as stale.
	 */
	uint64_t deleteq_obj;
	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
	error = zap_lookup_int(osp, deleteq_obj, obj);
	if (error == 0) {
		return (ESTALE);
	} else if (error != ENOENT) {
		return (error);
	}
	error = 0;

	/*
	 * Walk up the chain of parent pointers, prepending one
	 * "/component" per step.  The path is assembled right to left
	 * against the end of the buffer: for an object at "a/b/c" it grows
	 * as "/c", then "/b/c", then "/a/b/c", and is finally shifted to
	 * the front of the buffer by the memmove() below.  The loop
	 * terminates at the root, which is its own parent.
	 */
	for (;;) {
		uint64_t pobj = 0;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir = 0;

		/* Release the handle held by the previous iteration. */
		if (prevdb) {
			ASSERT(prevhdl != NULL);
			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
		}

		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
		    &is_xattrdir)) != 0)
			break;

		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			(void) sprintf(component + 1, "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		bcopy(component, path, complen);
		obj = pobj;

		if (sa_hdl != hdl) {
			prevhdl = sa_hdl;
			prevdb = sa_db;
		}
		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
		if (error != 0) {
			sa_hdl = prevhdl;
			sa_db = prevdb;
			break;
		}
	}

	if (sa_hdl != NULL && sa_hdl != hdl) {
		ASSERT(sa_db != NULL);
		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	}

	if (error == 0)
		(void) memmove(buf, path, buf + len - path);

	return (error);
}

/*
 * Given an object number, fill in the caller-supplied buffer with the
 * object's path relative to the dataset root.  On success the
 * NUL-terminated path begins at buf[0]; ESTALE is returned for objects
 * on the delete queue, which no longer have a name.  This backs the
 * object-to-path ioctl used by "zfs diff".
 */
int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}

/*
 * As above, but additionally fill in zpl-level statistics (mode,
 * generation, link count, ctime) for the object.
 */
int
zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
    char *buf, int len)
{
	char *path = buf + len - 1;
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	*path = '\0';

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
	if (error != 0) {
		zfs_release_sa_handle(hdl, db, FTAG);
		return (error);
	}

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");

module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
	"(debug - leaks space into the unlinked set)");
#endif