zfs_znode.c revision 211932
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22209962Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25168404Spjd 26169195Spjd/* Portions Copyright 2007 Jeremy Teo */ 27169195Spjd 28168404Spjd#ifdef _KERNEL 29168404Spjd#include <sys/types.h> 30168404Spjd#include <sys/param.h> 31168404Spjd#include <sys/time.h> 32168404Spjd#include <sys/systm.h> 33168404Spjd#include <sys/sysmacros.h> 34168404Spjd#include <sys/resource.h> 35168404Spjd#include <sys/mntent.h> 36185029Spjd#include <sys/u8_textprep.h> 37185029Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/vfs.h> 39168404Spjd#include <sys/vnode.h> 40168404Spjd#include <sys/file.h> 41168404Spjd#include <sys/kmem.h> 42168404Spjd#include <sys/errno.h> 43168404Spjd#include <sys/unistd.h> 44168404Spjd#include <sys/atomic.h> 45168404Spjd#include <sys/zfs_dir.h> 46168404Spjd#include <sys/zfs_acl.h> 47168404Spjd#include <sys/zfs_ioctl.h> 48168404Spjd#include <sys/zfs_rlock.h> 49185029Spjd#include <sys/zfs_fuid.h> 50168404Spjd#include <sys/fs/zfs.h> 51185029Spjd#include <sys/kidmap.h> 52168404Spjd#endif /* _KERNEL */ 53168404Spjd 54168404Spjd#include <sys/dmu.h> 55168404Spjd#include <sys/refcount.h> 56168404Spjd#include <sys/stat.h> 57168404Spjd#include <sys/zap.h> 58168404Spjd#include <sys/zfs_znode.h> 59168404Spjd#include <sys/refcount.h> 60168404Spjd 61185029Spjd#include "zfs_prop.h" 62185029Spjd 63173268Slulf/* Used by fstat(1). */ 64173268SlulfSYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), 65173268Slulf "sizeof(znode_t)"); 66173268Slulf 67168404Spjd/* 68185029Spjd * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 69185029Spjd * turned on when DEBUG is also defined. 70185029Spjd */ 71185029Spjd#ifdef DEBUG 72185029Spjd#define ZNODE_STATS 73185029Spjd#endif /* DEBUG */ 74185029Spjd 75185029Spjd#ifdef ZNODE_STATS 76185029Spjd#define ZNODE_STAT_ADD(stat) ((stat)++) 77185029Spjd#else 78185029Spjd#define ZNODE_STAT_ADD(stat) /* nothing */ 79185029Spjd#endif /* ZNODE_STATS */ 80185029Spjd 81185029Spjd#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) 82185029Spjd#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) 83185029Spjd 84185029Spjd/* 85168404Spjd * Functions needed for userland (ie: libzpool) are not put under 86168404Spjd * #ifdef_KERNEL; the rest of the functions have dependencies 87168404Spjd * (such as VFS logic) that will not compile easily in userland. 88168404Spjd */ 89168404Spjd#ifdef _KERNEL 90210470Smm/* 91210470Smm * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to 92210470Smm * be freed before it can be safely accessed. 93210470Smm */ 94210470Smmkrwlock_t zfsvfs_lock; 95210470Smm 96185029Spjdstatic kmem_cache_t *znode_cache = NULL; 97168404Spjd 98168404Spjd/*ARGSUSED*/ 99168404Spjdstatic void 100185029Spjdznode_evict_error(dmu_buf_t *dbuf, void *user_ptr) 101168404Spjd{ 102185029Spjd#if 1 /* XXXPJD: From OpenSolaris. */ 103185029Spjd /* 104185029Spjd * We should never drop all dbuf refs without first clearing 105185029Spjd * the eviction callback. 106185029Spjd */ 107185029Spjd panic("evicting znode %p\n", user_ptr); 108185029Spjd#else /* XXXPJD */ 109168404Spjd znode_t *zp = user_ptr; 110168488Spjd vnode_t *vp; 111168404Spjd 112168404Spjd mutex_enter(&zp->z_lock); 113185029Spjd zp->z_dbuf = NULL; 114168488Spjd vp = ZTOV(zp); 115168404Spjd if (vp == NULL) { 116168404Spjd mutex_exit(&zp->z_lock); 117168404Spjd zfs_znode_free(zp); 118168404Spjd } else if (vp->v_count == 0) { 119197153Spjd zp->z_vnode = NULL; 120168488Spjd vhold(vp); 121168404Spjd mutex_exit(&zp->z_lock); 122185029Spjd vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); 123168404Spjd vrecycle(vp, curthread); 124175294Sattilio VOP_UNLOCK(vp, 0); 125168404Spjd vdrop(vp); 126168404Spjd zfs_znode_free(zp); 127168404Spjd } else { 128168404Spjd mutex_exit(&zp->z_lock); 129168404Spjd } 130185029Spjd#endif 131168404Spjd} 132168404Spjd 133168404Spjdextern struct vop_vector zfs_vnodeops; 134168404Spjdextern struct vop_vector zfs_fifoops; 135209962Smmextern struct vop_vector zfs_shareops; 136168404Spjd 137168404Spjd/* 138168404Spjd * XXX: We cannot use this function as a cache constructor, because 139168404Spjd * there is one global cache for all file systems and we need 140168404Spjd * to pass vfsp here, which is not possible, because argument 141168404Spjd * 'cdrarg' is defined at kmem_cache_create() time. 142168404Spjd */ 143168404Spjdstatic int 144185029Spjdzfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 145168404Spjd{ 146168404Spjd znode_t *zp = buf; 147169196Spjd vnode_t *vp; 148185029Spjd vfs_t *vfsp = arg; 149168404Spjd int error; 150168404Spjd 151185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 152185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 153185029Spjd 154199156Spjd if (vfsp != NULL) { 155199156Spjd error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); 156199156Spjd if (error != 0 && (kmflags & KM_NOSLEEP)) 157199156Spjd return (-1); 158199156Spjd ASSERT(error == 0); 159199156Spjd vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 160199156Spjd zp->z_vnode = vp; 161199156Spjd vp->v_data = (caddr_t)zp; 162199156Spjd VN_LOCK_AREC(vp); 163199156Spjd } else { 164199156Spjd zp->z_vnode = NULL; 165199156Spjd } 166185029Spjd 167185029Spjd list_link_init(&zp->z_link_node); 168185029Spjd 169168404Spjd mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 170168404Spjd rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 171168404Spjd rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); 172168404Spjd mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 173168404Spjd 174168404Spjd mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); 175168404Spjd avl_create(&zp->z_range_avl, zfs_range_compare, 176168404Spjd sizeof (rl_t), offsetof(rl_t, r_node)); 177168404Spjd 178185029Spjd zp->z_dbuf = NULL; 179185029Spjd zp->z_dirlocks = NULL; 180211932Smm zp->z_acl_cached = NULL; 181168404Spjd return (0); 182168404Spjd} 183168404Spjd 184168404Spjd/*ARGSUSED*/ 185168404Spjdstatic void 186185029Spjdzfs_znode_cache_destructor(void *buf, void *arg) 187168404Spjd{ 188168404Spjd znode_t *zp = buf; 189168404Spjd 190185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 191185029Spjd ASSERT(ZTOV(zp) == NULL); 192185029Spjd vn_free(ZTOV(zp)); 193185029Spjd ASSERT(!list_link_active(&zp->z_link_node)); 194168404Spjd mutex_destroy(&zp->z_lock); 195168404Spjd rw_destroy(&zp->z_parent_lock); 196168404Spjd rw_destroy(&zp->z_name_lock); 197168404Spjd mutex_destroy(&zp->z_acl_lock); 198185029Spjd avl_destroy(&zp->z_range_avl); 199168404Spjd mutex_destroy(&zp->z_range_lock); 200168404Spjd 201185029Spjd ASSERT(zp->z_dbuf == NULL); 202185029Spjd ASSERT(zp->z_dirlocks == NULL); 203211932Smm ASSERT(zp->z_acl_cached == NULL); 204168404Spjd} 205168404Spjd 206185029Spjd#ifdef ZNODE_STATS 207185029Spjdstatic struct { 208185029Spjd uint64_t zms_zfsvfs_invalid; 209210470Smm uint64_t zms_zfsvfs_recheck1; 210185029Spjd uint64_t zms_zfsvfs_unmounted; 211210470Smm uint64_t zms_zfsvfs_recheck2; 212185029Spjd uint64_t zms_obj_held; 213185029Spjd uint64_t zms_vnode_locked; 214185029Spjd uint64_t zms_not_only_dnlc; 215185029Spjd} znode_move_stats; 216185029Spjd#endif /* ZNODE_STATS */ 217185029Spjd 218185029Spjd#if defined(sun) 219185029Spjdstatic void 220185029Spjdzfs_znode_move_impl(znode_t *ozp, znode_t *nzp) 221185029Spjd{ 222185029Spjd vnode_t *vp; 223185029Spjd 224185029Spjd /* Copy fields. */ 225185029Spjd nzp->z_zfsvfs = ozp->z_zfsvfs; 226185029Spjd 227185029Spjd /* Swap vnodes. */ 228185029Spjd vp = nzp->z_vnode; 229185029Spjd nzp->z_vnode = ozp->z_vnode; 230185029Spjd ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ 231185029Spjd ZTOV(ozp)->v_data = ozp; 232185029Spjd ZTOV(nzp)->v_data = nzp; 233185029Spjd 234185029Spjd nzp->z_id = ozp->z_id; 235185029Spjd ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ 236185029Spjd ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); 237185029Spjd nzp->z_unlinked = ozp->z_unlinked; 238185029Spjd nzp->z_atime_dirty = ozp->z_atime_dirty; 239185029Spjd nzp->z_zn_prefetch = ozp->z_zn_prefetch; 240185029Spjd nzp->z_blksz = ozp->z_blksz; 241185029Spjd nzp->z_seq = ozp->z_seq; 242185029Spjd nzp->z_mapcnt = ozp->z_mapcnt; 243185029Spjd nzp->z_last_itx = ozp->z_last_itx; 244185029Spjd nzp->z_gen = ozp->z_gen; 245185029Spjd nzp->z_sync_cnt = ozp->z_sync_cnt; 246185029Spjd nzp->z_phys = ozp->z_phys; 247185029Spjd nzp->z_dbuf = ozp->z_dbuf; 248185029Spjd 249211932Smm /* 250211932Smm * Since this is just an idle znode and kmem is already dealing with 251211932Smm * memory pressure, release any cached ACL. 252211932Smm */ 253211932Smm if (ozp->z_acl_cached) { 254211932Smm zfs_acl_free(ozp->z_acl_cached); 255211932Smm ozp->z_acl_cached = NULL; 256211932Smm } 257211932Smm 258185029Spjd /* Update back pointers. */ 259185029Spjd (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, 260185029Spjd znode_evict_error); 261185029Spjd 262185029Spjd /* 263185029Spjd * Invalidate the original znode by clearing fields that provide a 264185029Spjd * pointer back to the znode. Set the low bit of the vfs pointer to 265185029Spjd * ensure that zfs_znode_move() recognizes the znode as invalid in any 266185029Spjd * subsequent callback. 267185029Spjd */ 268185029Spjd ozp->z_dbuf = NULL; 269185029Spjd POINTER_INVALIDATE(&ozp->z_zfsvfs); 270185029Spjd} 271185029Spjd 272185029Spjd/*ARGSUSED*/ 273185029Spjdstatic kmem_cbrc_t 274185029Spjdzfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) 275185029Spjd{ 276185029Spjd znode_t *ozp = buf, *nzp = newbuf; 277185029Spjd zfsvfs_t *zfsvfs; 278185029Spjd vnode_t *vp; 279185029Spjd 280185029Spjd /* 281185029Spjd * The znode is on the file system's list of known znodes if the vfs 282185029Spjd * pointer is valid. We set the low bit of the vfs pointer when freeing 283185029Spjd * the znode to invalidate it, and the memory patterns written by kmem 284185029Spjd * (baddcafe and deadbeef) set at least one of the two low bits. A newly 285185029Spjd * created znode sets the vfs pointer last of all to indicate that the 286185029Spjd * znode is known and in a valid state to be moved by this function. 287185029Spjd */ 288185029Spjd zfsvfs = ozp->z_zfsvfs; 289185029Spjd if (!POINTER_IS_VALID(zfsvfs)) { 290185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); 291185029Spjd return (KMEM_CBRC_DONT_KNOW); 292185029Spjd } 293185029Spjd 294185029Spjd /* 295210470Smm * Close a small window in which it's possible that the filesystem could 296210470Smm * be unmounted and freed, and zfsvfs, though valid in the previous 297210470Smm * statement, could point to unrelated memory by the time we try to 298210470Smm * prevent the filesystem from being unmounted. 299185029Spjd */ 300210470Smm rw_enter(&zfsvfs_lock, RW_WRITER); 301210470Smm if (zfsvfs != ozp->z_zfsvfs) { 302210470Smm rw_exit(&zfsvfs_lock); 303210470Smm ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); 304210470Smm return (KMEM_CBRC_DONT_KNOW); 305210470Smm } 306210470Smm 307210470Smm /* 308210470Smm * If the znode is still valid, then so is the file system. We know that 309210470Smm * no valid file system can be freed while we hold zfsvfs_lock, so we 310210470Smm * can safely ensure that the filesystem is not and will not be 311210470Smm * unmounted. The next statement is equivalent to ZFS_ENTER(). 312210470Smm */ 313209962Smm rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); 314209962Smm if (zfsvfs->z_unmounted) { 315209962Smm ZFS_EXIT(zfsvfs); 316210470Smm rw_exit(&zfsvfs_lock); 317185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); 318185029Spjd return (KMEM_CBRC_DONT_KNOW); 319185029Spjd } 320210470Smm rw_exit(&zfsvfs_lock); 321185029Spjd 322185029Spjd mutex_enter(&zfsvfs->z_znodes_lock); 323185029Spjd /* 324185029Spjd * Recheck the vfs pointer in case the znode was removed just before 325185029Spjd * acquiring the lock. 326185029Spjd */ 327185029Spjd if (zfsvfs != ozp->z_zfsvfs) { 328185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 329185029Spjd ZFS_EXIT(zfsvfs); 330210470Smm ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); 331185029Spjd return (KMEM_CBRC_DONT_KNOW); 332185029Spjd } 333185029Spjd 334185029Spjd /* 335185029Spjd * At this point we know that as long as we hold z_znodes_lock, the 336185029Spjd * znode cannot be freed and fields within the znode can be safely 337185029Spjd * accessed. Now, prevent a race with zfs_zget(). 338185029Spjd */ 339185029Spjd if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { 340185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 341185029Spjd ZFS_EXIT(zfsvfs); 342185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); 343185029Spjd return (KMEM_CBRC_LATER); 344185029Spjd } 345185029Spjd 346185029Spjd vp = ZTOV(ozp); 347185029Spjd if (mutex_tryenter(&vp->v_lock) == 0) { 348185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 349185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 350185029Spjd ZFS_EXIT(zfsvfs); 351185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); 352185029Spjd return (KMEM_CBRC_LATER); 353185029Spjd } 354185029Spjd 355185029Spjd /* Only move znodes that are referenced _only_ by the DNLC. */ 356185029Spjd if (vp->v_count != 1 || !vn_in_dnlc(vp)) { 357185029Spjd mutex_exit(&vp->v_lock); 358185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 359185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 360185029Spjd ZFS_EXIT(zfsvfs); 361185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); 362185029Spjd return (KMEM_CBRC_LATER); 363185029Spjd } 364185029Spjd 365185029Spjd /* 366185029Spjd * The znode is known and in a valid state to move. We're holding the 367185029Spjd * locks needed to execute the critical section. 368185029Spjd */ 369185029Spjd zfs_znode_move_impl(ozp, nzp); 370185029Spjd mutex_exit(&vp->v_lock); 371185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 372185029Spjd 373185029Spjd list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 374185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 375185029Spjd ZFS_EXIT(zfsvfs); 376185029Spjd 377185029Spjd return (KMEM_CBRC_YES); 378185029Spjd} 379185029Spjd#endif /* sun */ 380185029Spjd 381168404Spjdvoid 382168404Spjdzfs_znode_init(void) 383168404Spjd{ 384168404Spjd /* 385168404Spjd * Initialize zcache 386168404Spjd */ 387210470Smm rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); 388168404Spjd ASSERT(znode_cache == NULL); 389168404Spjd znode_cache = kmem_cache_create("zfs_znode_cache", 390168404Spjd sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, 391168404Spjd zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 392185029Spjd#if defined(sun) 393185029Spjd kmem_cache_set_move(znode_cache, zfs_znode_move); 394185029Spjd#endif 395168404Spjd} 396168404Spjd 397168404Spjdvoid 398168404Spjdzfs_znode_fini(void) 399168404Spjd{ 400168404Spjd /* 401168404Spjd * Cleanup zcache 402168404Spjd */ 403168404Spjd if (znode_cache) 404168404Spjd kmem_cache_destroy(znode_cache); 405168404Spjd znode_cache = NULL; 406210470Smm rw_destroy(&zfsvfs_lock); 407168404Spjd} 408168404Spjd 409168404Spjdint 410209962Smmzfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) 411168404Spjd{ 412209962Smm zfs_acl_ids_t acl_ids; 413209962Smm vattr_t vattr; 414209962Smm znode_t *sharezp; 415209962Smm vnode_t *vp, vnode; 416209962Smm znode_t *zp; 417209962Smm int error; 418168404Spjd 419209962Smm vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 420209962Smm vattr.va_type = VDIR; 421209962Smm vattr.va_mode = S_IFDIR|0555; 422209962Smm vattr.va_uid = crgetuid(kcred); 423209962Smm vattr.va_gid = crgetgid(kcred); 424168404Spjd 425209962Smm sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); 426209962Smm zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0); 427209962Smm sharezp->z_unlinked = 0; 428209962Smm sharezp->z_atime_dirty = 0; 429209962Smm sharezp->z_zfsvfs = zfsvfs; 430168404Spjd 431209962Smm sharezp->z_vnode = &vnode; 432209962Smm vnode.v_data = sharezp; 433185029Spjd 434209962Smm vp = ZTOV(sharezp); 435209962Smm vp->v_type = VDIR; 436168404Spjd 437209962Smm VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, 438209962Smm kcred, NULL, &acl_ids)); 439209962Smm zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, 440209962Smm &zp, 0, &acl_ids); 441209962Smm ASSERT3P(zp, ==, sharezp); 442209962Smm POINTER_INVALIDATE(&sharezp->z_zfsvfs); 443209962Smm error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 444209962Smm ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); 445209962Smm zfsvfs->z_shares_dir = sharezp->z_id; 446168404Spjd 447209962Smm zfs_acl_ids_free(&acl_ids); 448209962Smm ZTOV(sharezp)->v_data = NULL; 449209962Smm ZTOV(sharezp)->v_count = 0; 450209962Smm ZTOV(sharezp)->v_holdcnt = 0; 451209962Smm zp->z_vnode = NULL; 452209962Smm sharezp->z_vnode = NULL; 453209962Smm dmu_buf_rele(sharezp->z_dbuf, NULL); 454209962Smm sharezp->z_dbuf = NULL; 455209962Smm kmem_cache_free(znode_cache, sharezp); 456168404Spjd 457209962Smm return (error); 458168404Spjd} 459168404Spjd 460168404Spjd/* 461168404Spjd * define a couple of values we need available 462168404Spjd * for both 64 and 32 bit environments. 463168404Spjd */ 464168404Spjd#ifndef NBITSMINOR64 465168404Spjd#define NBITSMINOR64 32 466168404Spjd#endif 467168404Spjd#ifndef MAXMAJ64 468168404Spjd#define MAXMAJ64 0xffffffffUL 469168404Spjd#endif 470168404Spjd#ifndef MAXMIN64 471168404Spjd#define MAXMIN64 0xffffffffUL 472168404Spjd#endif 473168404Spjd 474168404Spjd/* 475168404Spjd * Create special expldev for ZFS private use. 476168404Spjd * Can't use standard expldev since it doesn't do 477168404Spjd * what we want. The standard expldev() takes a 478168404Spjd * dev32_t in LP64 and expands it to a long dev_t. 479168404Spjd * We need an interface that takes a dev32_t in ILP32 480168404Spjd * and expands it to a long dev_t. 481168404Spjd */ 482168404Spjdstatic uint64_t 483168404Spjdzfs_expldev(dev_t dev) 484168404Spjd{ 485187830Sed return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); 486168404Spjd} 487168404Spjd/* 488168404Spjd * Special cmpldev for ZFS private use. 489168404Spjd * Can't use standard cmpldev since it takes 490168404Spjd * a long dev_t and compresses it to dev32_t in 491168404Spjd * LP64. We need to do a compaction of a long dev_t 492168404Spjd * to a dev32_t in ILP32. 493168404Spjd */ 494168404Spjddev_t 495168404Spjdzfs_cmpldev(uint64_t dev) 496168404Spjd{ 497168958Spjd return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); 498168404Spjd} 499168404Spjd 500185029Spjdstatic void 501185029Spjdzfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) 502185029Spjd{ 503185029Spjd znode_t *nzp; 504185029Spjd 505185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 506185029Spjd ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 507185029Spjd 508185029Spjd mutex_enter(&zp->z_lock); 509185029Spjd 510185029Spjd ASSERT(zp->z_dbuf == NULL); 511211932Smm ASSERT(zp->z_acl_cached == NULL); 512185029Spjd zp->z_dbuf = db; 513185029Spjd nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); 514185029Spjd 515185029Spjd /* 516185029Spjd * there should be no 517185029Spjd * concurrent zgets on this object. 518185029Spjd */ 519185029Spjd if (nzp != NULL) 520185029Spjd panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); 521185029Spjd 522185029Spjd /* 523185029Spjd * Slap on VROOT if we are the root znode 524185029Spjd */ 525185029Spjd if (zp->z_id == zfsvfs->z_root) 526185029Spjd ZTOV(zp)->v_flag |= VROOT; 527185029Spjd 528185029Spjd mutex_exit(&zp->z_lock); 529185029Spjd vn_exists(ZTOV(zp)); 530185029Spjd} 531185029Spjd 532185029Spjdvoid 533185029Spjdzfs_znode_dmu_fini(znode_t *zp) 534185029Spjd{ 535185029Spjd dmu_buf_t *db = zp->z_dbuf; 536185029Spjd ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 537185029Spjd zp->z_unlinked || 538185029Spjd RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 539185029Spjd ASSERT(zp->z_dbuf != NULL); 540185029Spjd zp->z_dbuf = NULL; 541185029Spjd VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); 542185029Spjd dmu_buf_rele(db, NULL); 543185029Spjd} 544185029Spjd 545168404Spjd/* 546168404Spjd * Construct a new znode/vnode and intialize. 547168404Spjd * 548168404Spjd * This does not do a call to dmu_set_user() that is 549168404Spjd * up to the caller to do, in case you don't want to 550168404Spjd * return the znode 551168404Spjd */ 552168404Spjdstatic znode_t * 553185029Spjdzfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) 554168404Spjd{ 555168404Spjd znode_t *zp; 556168404Spjd vnode_t *vp; 557168404Spjd 558168404Spjd zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 559185029Spjd zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0); 560168404Spjd 561168404Spjd ASSERT(zp->z_dirlocks == NULL); 562185029Spjd ASSERT(zp->z_dbuf == NULL); 563185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 564168404Spjd 565185029Spjd /* 566185029Spjd * Defer setting z_zfsvfs until the znode is ready to be a candidate for 567185029Spjd * the zfs_znode_move() callback. 568185029Spjd */ 569185029Spjd zp->z_phys = NULL; 570168404Spjd zp->z_unlinked = 0; 571168404Spjd zp->z_atime_dirty = 0; 572168404Spjd zp->z_mapcnt = 0; 573168404Spjd zp->z_last_itx = 0; 574185029Spjd zp->z_id = db->db_object; 575168404Spjd zp->z_blksz = blksz; 576168404Spjd zp->z_seq = 0x7A4653; 577168404Spjd zp->z_sync_cnt = 0; 578168404Spjd 579185029Spjd vp = ZTOV(zp); 580185029Spjd#ifdef TODO 581185029Spjd vn_reinit(vp); 582185029Spjd#endif 583168404Spjd 584185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 585185029Spjd 586185029Spjd zp->z_gen = zp->z_phys->zp_gen; 587185029Spjd 588185029Spjd#if 0 589168404Spjd if (vp == NULL) 590168404Spjd return (zp); 591185029Spjd#endif 592168404Spjd 593168404Spjd vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); 594168404Spjd switch (vp->v_type) { 595168404Spjd case VDIR: 596168404Spjd zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 597168404Spjd break; 598168404Spjd case VFIFO: 599168404Spjd vp->v_op = &zfs_fifoops; 600168404Spjd break; 601209962Smm case VREG: 602209962Smm if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) { 603209962Smm vp->v_op = &zfs_shareops; 604209962Smm } 605209962Smm break; 606168404Spjd } 607210172Sjhb if (vp->v_type != VFIFO) 608189696Sjhb VN_LOCK_ASHARE(vp); 609168404Spjd 610185029Spjd mutex_enter(&zfsvfs->z_znodes_lock); 611185029Spjd list_insert_tail(&zfsvfs->z_all_znodes, zp); 612185029Spjd membar_producer(); 613168404Spjd /* 614185029Spjd * Everything else must be valid before assigning z_zfsvfs makes the 615185029Spjd * znode eligible for zfs_znode_move(). 616168404Spjd */ 617185029Spjd zp->z_zfsvfs = zfsvfs; 618185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 619168404Spjd 620168404Spjd VFS_HOLD(zfsvfs->z_vfs); 621185029Spjd return (zp); 622168404Spjd} 623168404Spjd 624168404Spjd/* 625168404Spjd * Create a new DMU object to hold a zfs znode. 626168404Spjd * 627168404Spjd * IN: dzp - parent directory for new znode 628168404Spjd * vap - file attributes for new znode 629168404Spjd * tx - dmu transaction id for zap operations 630168404Spjd * cr - credentials of caller 631168404Spjd * flag - flags: 632168404Spjd * IS_ROOT_NODE - new object will be root 633168404Spjd * IS_XATTR - new object is an attribute 634185029Spjd * bonuslen - length of bonus buffer 635185029Spjd * setaclp - File/Dir initial ACL 636185029Spjd * fuidp - Tracks fuid allocation. 637168404Spjd * 638185029Spjd * OUT: zpp - allocated znode 639168404Spjd * 640168404Spjd */ 641168404Spjdvoid 642185029Spjdzfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 643209962Smm uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) 644168404Spjd{ 645185029Spjd dmu_buf_t *db; 646168404Spjd znode_phys_t *pzp; 647168404Spjd zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 648168404Spjd timestruc_t now; 649185029Spjd uint64_t gen, obj; 650168404Spjd int err; 651168404Spjd 652168404Spjd ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 653168404Spjd 654209962Smm if (zfsvfs->z_replay) { 655185029Spjd obj = vap->va_nodeid; 656168404Spjd now = vap->va_ctime; /* see zfs_replay_create() */ 657168404Spjd gen = vap->va_nblocks; /* ditto */ 658168404Spjd } else { 659185029Spjd obj = 0; 660168404Spjd gethrestime(&now); 661168404Spjd gen = dmu_tx_get_txg(tx); 662168404Spjd } 663168404Spjd 664168404Spjd /* 665168404Spjd * Create a new DMU object. 666168404Spjd */ 667168404Spjd /* 668168404Spjd * There's currently no mechanism for pre-reading the blocks that will 669168404Spjd * be to needed allocate a new object, so we accept the small chance 670168404Spjd * that there will be an i/o error and we will fail one of the 671168404Spjd * assertions below. 672168404Spjd */ 673168404Spjd if (vap->va_type == VDIR) { 674209962Smm if (zfsvfs->z_replay) { 675185029Spjd err = zap_create_claim_norm(zfsvfs->z_os, obj, 676185029Spjd zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 677168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 678168404Spjd ASSERT3U(err, ==, 0); 679168404Spjd } else { 680185029Spjd obj = zap_create_norm(zfsvfs->z_os, 681185029Spjd zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 682168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 683168404Spjd } 684168404Spjd } else { 685209962Smm if (zfsvfs->z_replay) { 686185029Spjd err = dmu_object_claim(zfsvfs->z_os, obj, 687168404Spjd DMU_OT_PLAIN_FILE_CONTENTS, 0, 688168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 689168404Spjd ASSERT3U(err, ==, 0); 690168404Spjd } else { 691185029Spjd obj = dmu_object_alloc(zfsvfs->z_os, 692168404Spjd DMU_OT_PLAIN_FILE_CONTENTS, 0, 693168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 694168404Spjd } 695168404Spjd } 696207334Spjd 697207334Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 698185029Spjd VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 699185029Spjd dmu_buf_will_dirty(db, tx); 700168404Spjd 701168404Spjd /* 702168404Spjd * Initialize the znode physical data to zero. 703168404Spjd */ 704185029Spjd ASSERT(db->db_size >= sizeof (znode_phys_t)); 705185029Spjd bzero(db->db_data, db->db_size); 706185029Spjd pzp = db->db_data; 707168404Spjd 708168404Spjd /* 709168404Spjd * If this is the root, fix up the half-initialized parent pointer 710168404Spjd * to reference the just-allocated physical data area. 711168404Spjd */ 712168404Spjd if (flag & IS_ROOT_NODE) { 713185029Spjd dzp->z_dbuf = db; 714168404Spjd dzp->z_phys = pzp; 715185029Spjd dzp->z_id = obj; 716168404Spjd } 717168404Spjd 718168404Spjd /* 719168404Spjd * If parent is an xattr, so am I. 720168404Spjd */ 721168404Spjd if (dzp->z_phys->zp_flags & ZFS_XATTR) 722168404Spjd flag |= IS_XATTR; 723168404Spjd 724168404Spjd if (vap->va_type == VBLK || vap->va_type == VCHR) { 725168404Spjd pzp->zp_rdev = zfs_expldev(vap->va_rdev); 726168404Spjd } 727168404Spjd 728185029Spjd if (zfsvfs->z_use_fuids) 729185029Spjd pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 730185029Spjd 731168404Spjd if (vap->va_type == VDIR) { 732168404Spjd pzp->zp_size = 2; /* contents ("." and "..") */ 733168404Spjd pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 734168404Spjd } 735168404Spjd 736168404Spjd pzp->zp_parent = dzp->z_id; 737168404Spjd if (flag & IS_XATTR) 738168404Spjd pzp->zp_flags |= ZFS_XATTR; 739168404Spjd 740168404Spjd pzp->zp_gen = gen; 741168404Spjd 742168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 743168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 744168404Spjd 745168404Spjd if (vap->va_mask & AT_ATIME) { 746168404Spjd ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 747168404Spjd } else { 748168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_atime); 749168404Spjd } 750168404Spjd 751168404Spjd if (vap->va_mask & AT_MTIME) { 752168404Spjd ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 753168404Spjd } else { 754168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 755168404Spjd } 756168404Spjd 757168404Spjd pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); 758185029Spjd if (!(flag & IS_ROOT_NODE)) { 759185029Spjd *zpp = zfs_znode_alloc(zfsvfs, db, 0); 760185029Spjd } else { 761185029Spjd /* 762185029Spjd * If we are creating the root node, the "parent" we 763185029Spjd * passed in is the znode for the root. 764185029Spjd */ 765185029Spjd *zpp = dzp; 766185029Spjd } 767209962Smm pzp->zp_uid = acl_ids->z_fuid; 768209962Smm pzp->zp_gid = acl_ids->z_fgid; 769209962Smm pzp->zp_mode = acl_ids->z_mode; 770209962Smm VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); 771209962Smm if (vap->va_mask & AT_XVATTR) 772209962Smm zfs_xvattr_set(*zpp, (xvattr_t *)vap); 773207334Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 774185029Spjd if (!(flag & IS_ROOT_NODE)) { 775185029Spjd vnode_t *vp; 776168404Spjd 777185029Spjd vp = ZTOV(*zpp); 778185029Spjd vp->v_vflag |= VV_FORCEINSMQ; 779185029Spjd err = insmntque(vp, zfsvfs->z_vfs); 780185029Spjd vp->v_vflag &= ~VV_FORCEINSMQ; 781185029Spjd KASSERT(err == 0, ("insmntque() failed: error %d", err)); 782185029Spjd } 783185029Spjd} 784168404Spjd 785185029Spjdvoid 786185029Spjdzfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 787185029Spjd{ 788185029Spjd xoptattr_t *xoap; 789168404Spjd 790185029Spjd xoap = xva_getxoptattr(xvap); 791185029Spjd ASSERT(xoap); 792168404Spjd 793185029Spjd if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 794185029Spjd ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 795185029Spjd XVA_SET_RTN(xvap, XAT_CREATETIME); 796168404Spjd } 797185029Spjd if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 798185029Spjd ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 799185029Spjd XVA_SET_RTN(xvap, XAT_READONLY); 800185029Spjd } 801185029Spjd if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 802185029Spjd ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 803185029Spjd XVA_SET_RTN(xvap, XAT_HIDDEN); 804185029Spjd } 805185029Spjd if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 806185029Spjd ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 807185029Spjd XVA_SET_RTN(xvap, XAT_SYSTEM); 808185029Spjd } 809185029Spjd if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 810185029Spjd ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 811185029Spjd XVA_SET_RTN(xvap, XAT_ARCHIVE); 812185029Spjd } 813185029Spjd if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 814185029Spjd ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 815185029Spjd XVA_SET_RTN(xvap, XAT_IMMUTABLE); 816185029Spjd } 817185029Spjd if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 818185029Spjd ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 819185029Spjd XVA_SET_RTN(xvap, XAT_NOUNLINK); 820185029Spjd } 821185029Spjd if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 822185029Spjd ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 823185029Spjd XVA_SET_RTN(xvap, XAT_APPENDONLY); 824185029Spjd } 825185029Spjd if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 826185029Spjd ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 827185029Spjd XVA_SET_RTN(xvap, XAT_NODUMP); 828185029Spjd } 829185029Spjd if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 830185029Spjd ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); 831185029Spjd XVA_SET_RTN(xvap, XAT_OPAQUE); 832185029Spjd } 833185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 834185029Spjd ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 835185029Spjd xoap->xoa_av_quarantined); 836185029Spjd XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 837185029Spjd } 838185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 839185029Spjd ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); 840185029Spjd XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 841185029Spjd } 842185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 843185029Spjd (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, 844185029Spjd sizeof (xoap->xoa_av_scanstamp)); 845185029Spjd zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; 846185029Spjd XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 847185029Spjd } 848168404Spjd} 849168404Spjd 850168404Spjdint 851168404Spjdzfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 852168404Spjd{ 853168404Spjd dmu_object_info_t doi; 854168404Spjd dmu_buf_t *db; 855168404Spjd znode_t *zp; 856168404Spjd vnode_t *vp; 857185029Spjd int err, first = 1; 858168404Spjd 859168404Spjd *zpp = NULL; 860185029Spjdagain: 861168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 862168404Spjd 863168404Spjd err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 864168404Spjd if (err) { 865168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 866168404Spjd return (err); 867168404Spjd } 868168404Spjd 869168404Spjd dmu_object_info_from_db(db, &doi); 870168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 871168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 872168404Spjd dmu_buf_rele(db, NULL); 873168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 874168404Spjd return (EINVAL); 875168404Spjd } 876168404Spjd 877168404Spjd zp = dmu_buf_get_user(db); 878168404Spjd if (zp != NULL) { 879168404Spjd mutex_enter(&zp->z_lock); 880168404Spjd 881185029Spjd /* 882185029Spjd * Since we do immediate eviction of the z_dbuf, we 883185029Spjd * should never find a dbuf with a znode that doesn't 884185029Spjd * know about the dbuf. 885185029Spjd */ 886185029Spjd ASSERT3P(zp->z_dbuf, ==, db); 887168404Spjd ASSERT3U(zp->z_id, ==, obj_num); 888168404Spjd if (zp->z_unlinked) { 889185029Spjd err = ENOENT; 890168404Spjd } else { 891197458Spjd int dying = 0; 892197458Spjd 893197458Spjd vp = ZTOV(zp); 894197458Spjd if (vp == NULL) 895197458Spjd dying = 1; 896197458Spjd else { 897197458Spjd VN_HOLD(vp); 898197131Spjd if ((vp->v_iflag & VI_DOOMED) != 0) { 899197458Spjd dying = 1; 900197458Spjd /* 901197458Spjd * Don't VN_RELE() vnode here, because 902197458Spjd * it can call vn_lock() which creates 903197458Spjd * LOR between vnode lock and znode 904197458Spjd * lock. We will VN_RELE() the vnode 905197458Spjd * after droping znode lock. 906197458Spjd */ 907197458Spjd } 908197131Spjd } 909197458Spjd if (dying) { 910185029Spjd if (first) { 911185029Spjd ZFS_LOG(1, "dying znode detected (zp=%p)", zp); 912185029Spjd first = 0; 913185029Spjd } 914185029Spjd /* 915185029Spjd * znode is dying so we can't reuse it, we must 916185029Spjd * wait until destruction is completed. 917185029Spjd */ 918185029Spjd dmu_buf_rele(db, NULL); 919185029Spjd mutex_exit(&zp->z_lock); 920185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 921197458Spjd if (vp != NULL) 922197458Spjd VN_RELE(vp); 923185029Spjd tsleep(zp, 0, "zcollide", 1); 924185029Spjd goto again; 925185029Spjd } 926185029Spjd *zpp = zp; 927185029Spjd err = 0; 928168404Spjd } 929185029Spjd dmu_buf_rele(db, NULL); 930168404Spjd mutex_exit(&zp->z_lock); 931168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 932185029Spjd return (err); 933168404Spjd } 934168404Spjd 935168404Spjd /* 936168404Spjd * Not found create new znode/vnode 937207334Spjd * but only if file exists. 938207334Spjd * 939207334Spjd * There is a small window where zfs_vget() could 940207334Spjd * find this object while a file create is still in 941207334Spjd * progress. Since a gen number can never be zero 942207334Spjd * we will check that to determine if its an allocated 943207334Spjd * file. 944168404Spjd */ 945185029Spjd 946207334Spjd if (((znode_phys_t *)db->db_data)->zp_gen != 0) { 947207334Spjd zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); 948207334Spjd *zpp = zp; 949207334Spjd vp = ZTOV(zp); 950207334Spjd vp->v_vflag |= VV_FORCEINSMQ; 951207334Spjd err = insmntque(vp, zfsvfs->z_vfs); 952207334Spjd vp->v_vflag &= ~VV_FORCEINSMQ; 953207334Spjd KASSERT(err == 0, ("insmntque() failed: error %d", err)); 954207334Spjd VOP_UNLOCK(vp, 0); 955207334Spjd err = 0; 956207334Spjd } else { 957207334Spjd dmu_buf_rele(db, NULL); 958207334Spjd err = ENOENT; 959207334Spjd } 960168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 961207334Spjd return (err); 962168404Spjd} 963168404Spjd 964185029Spjdint 965185029Spjdzfs_rezget(znode_t *zp) 966185029Spjd{ 967185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 968185029Spjd dmu_object_info_t doi; 969185029Spjd dmu_buf_t *db; 970185029Spjd uint64_t obj_num = zp->z_id; 971185029Spjd int err; 972185029Spjd 973185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 974185029Spjd 975185029Spjd err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 976185029Spjd if (err) { 977185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 978185029Spjd return (err); 979185029Spjd } 980185029Spjd 981185029Spjd dmu_object_info_from_db(db, &doi); 982185029Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 983185029Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 984185029Spjd dmu_buf_rele(db, NULL); 985185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 986185029Spjd return (EINVAL); 987185029Spjd } 988185029Spjd 989185029Spjd if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 990185029Spjd dmu_buf_rele(db, NULL); 991185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 992185029Spjd return (EIO); 993185029Spjd } 994185029Spjd 995211932Smm mutex_enter(&zp->z_acl_lock); 996211932Smm if (zp->z_acl_cached) { 997211932Smm zfs_acl_free(zp->z_acl_cached); 998211932Smm zp->z_acl_cached = NULL; 999211932Smm } 1000211932Smm mutex_exit(&zp->z_acl_lock); 1001211932Smm 1002185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 1003185029Spjd zp->z_unlinked = (zp->z_phys->zp_links == 0); 1004185029Spjd zp->z_blksz = doi.doi_data_block_size; 1005185029Spjd 1006185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1007185029Spjd 1008185029Spjd return (0); 1009185029Spjd} 1010185029Spjd 1011168404Spjdvoid 1012168404Spjdzfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 1013168404Spjd{ 1014168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1015185029Spjd objset_t *os = zfsvfs->z_os; 1016185029Spjd uint64_t obj = zp->z_id; 1017185029Spjd uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 1018168404Spjd 1019185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 1020185029Spjd if (acl_obj) 1021185029Spjd VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1022185029Spjd VERIFY(0 == dmu_object_free(os, obj, tx)); 1023185029Spjd zfs_znode_dmu_fini(zp); 1024185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 1025185029Spjd zfs_znode_free(zp); 1026168404Spjd} 1027168404Spjd 1028168404Spjdvoid 1029168404Spjdzfs_zinactive(znode_t *zp) 1030168404Spjd{ 1031168404Spjd vnode_t *vp = ZTOV(zp); 1032168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1033168404Spjd uint64_t z_id = zp->z_id; 1034201406Sdelphij int vfslocked; 1035168404Spjd 1036185029Spjd ASSERT(zp->z_dbuf && zp->z_phys); 1037168404Spjd 1038168404Spjd /* 1039168404Spjd * Don't allow a zfs_zget() while were trying to release this znode 1040168404Spjd */ 1041168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1042168404Spjd 1043168404Spjd mutex_enter(&zp->z_lock); 1044168404Spjd VI_LOCK(vp); 1045168404Spjd if (vp->v_count > 0) { 1046168404Spjd /* 1047168404Spjd * If the hold count is greater than zero, somebody has 1048168404Spjd * obtained a new reference on this znode while we were 1049168404Spjd * processing it here, so we are done. 1050168404Spjd */ 1051168404Spjd VI_UNLOCK(vp); 1052168404Spjd mutex_exit(&zp->z_lock); 1053168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1054168404Spjd return; 1055168404Spjd } 1056168404Spjd VI_UNLOCK(vp); 1057168404Spjd 1058168404Spjd /* 1059168404Spjd * If this was the last reference to a file with no links, 1060168404Spjd * remove the file from the file system. 1061168404Spjd */ 1062168404Spjd if (zp->z_unlinked) { 1063168404Spjd mutex_exit(&zp->z_lock); 1064168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1065168404Spjd ASSERT(vp->v_count == 0); 1066168404Spjd vrecycle(vp, curthread); 1067201406Sdelphij vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs); 1068168404Spjd zfs_rmnode(zp); 1069201406Sdelphij VFS_UNLOCK_GIANT(vfslocked); 1070168404Spjd return; 1071168404Spjd } 1072168404Spjd mutex_exit(&zp->z_lock); 1073168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1074168404Spjd} 1075168404Spjd 1076168404Spjdvoid 1077168404Spjdzfs_znode_free(znode_t *zp) 1078168404Spjd{ 1079168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1080168404Spjd 1081185029Spjd ASSERT(ZTOV(zp) == NULL); 1082168404Spjd mutex_enter(&zfsvfs->z_znodes_lock); 1083185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 1084168404Spjd list_remove(&zfsvfs->z_all_znodes, zp); 1085168404Spjd mutex_exit(&zfsvfs->z_znodes_lock); 1086168404Spjd 1087211932Smm if (zp->z_acl_cached) { 1088211932Smm zfs_acl_free(zp->z_acl_cached); 1089211932Smm zp->z_acl_cached = NULL; 1090211932Smm } 1091211932Smm 1092168404Spjd kmem_cache_free(znode_cache, zp); 1093185029Spjd 1094185029Spjd VFS_RELE(zfsvfs->z_vfs); 1095168404Spjd} 1096168404Spjd 1097168404Spjdvoid 1098168404Spjdzfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1099168404Spjd{ 1100168404Spjd timestruc_t now; 1101168404Spjd 1102168404Spjd ASSERT(MUTEX_HELD(&zp->z_lock)); 1103168404Spjd 1104168404Spjd gethrestime(&now); 1105168404Spjd 1106168404Spjd if (tx) { 1107168404Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1108168404Spjd zp->z_atime_dirty = 0; 1109168404Spjd zp->z_seq++; 1110168404Spjd } else { 1111168404Spjd zp->z_atime_dirty = 1; 1112168404Spjd } 1113168404Spjd 1114168404Spjd if (flag & AT_ATIME) 1115168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1116168404Spjd 1117185029Spjd if (flag & AT_MTIME) { 1118168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1119185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1120185029Spjd zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1121185029Spjd } 1122168404Spjd 1123185029Spjd if (flag & AT_CTIME) { 1124168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1125185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1126185029Spjd zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1127185029Spjd } 1128168404Spjd} 1129168404Spjd 1130168404Spjd/* 1131168404Spjd * Update the requested znode timestamps with the current time. 1132168404Spjd * If we are in a transaction, then go ahead and mark the znode 1133168404Spjd * dirty in the transaction so the timestamps will go to disk. 1134168404Spjd * Otherwise, we will get pushed next time the znode is updated 1135168404Spjd * in a transaction, or when this znode eventually goes inactive. 1136168404Spjd * 1137168404Spjd * Why is this OK? 1138168404Spjd * 1 - Only the ACCESS time is ever updated outside of a transaction. 1139168404Spjd * 2 - Multiple consecutive updates will be collapsed into a single 1140168404Spjd * znode update by the transaction grouping semantics of the DMU. 1141168404Spjd */ 1142168404Spjdvoid 1143168404Spjdzfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1144168404Spjd{ 1145168404Spjd mutex_enter(&zp->z_lock); 1146168404Spjd zfs_time_stamper_locked(zp, flag, tx); 1147168404Spjd mutex_exit(&zp->z_lock); 1148168404Spjd} 1149168404Spjd 1150168404Spjd/* 1151168404Spjd * Grow the block size for a file. 1152168404Spjd * 1153168404Spjd * IN: zp - znode of file to free data in. 1154168404Spjd * size - requested block size 1155168404Spjd * tx - open transaction. 1156168404Spjd * 1157168404Spjd * NOTE: this function assumes that the znode is write locked. 1158168404Spjd */ 1159168404Spjdvoid 1160168404Spjdzfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1161168404Spjd{ 1162168404Spjd int error; 1163168404Spjd u_longlong_t dummy; 1164168404Spjd 1165168404Spjd if (size <= zp->z_blksz) 1166168404Spjd return; 1167168404Spjd /* 1168168404Spjd * If the file size is already greater than the current blocksize, 1169168404Spjd * we will not grow. If there is more than one block in a file, 1170168404Spjd * the blocksize cannot change. 1171168404Spjd */ 1172168404Spjd if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1173168404Spjd return; 1174168404Spjd 1175168404Spjd error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1176168404Spjd size, 0, tx); 1177168404Spjd if (error == ENOTSUP) 1178168404Spjd return; 1179168404Spjd ASSERT3U(error, ==, 0); 1180168404Spjd 1181168404Spjd /* What blocksize did we actually get? */ 1182168404Spjd dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1183168404Spjd} 1184168404Spjd 1185168404Spjd/* 1186185029Spjd * Increase the file length 1187168404Spjd * 1188168404Spjd * IN: zp - znode of file to free data in. 1189185029Spjd * end - new end-of-file 1190168404Spjd * 1191168404Spjd * RETURN: 0 if success 1192168404Spjd * error code if failure 1193168404Spjd */ 1194185029Spjdstatic int 1195185029Spjdzfs_extend(znode_t *zp, uint64_t end) 1196168404Spjd{ 1197185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1198168404Spjd dmu_tx_t *tx; 1199168404Spjd rl_t *rl; 1200185029Spjd uint64_t newblksz; 1201168404Spjd int error; 1202168404Spjd 1203168404Spjd /* 1204185029Spjd * We will change zp_size, lock the whole file. 1205168404Spjd */ 1206185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1207168404Spjd 1208168404Spjd /* 1209168404Spjd * Nothing to do if file already at desired length. 1210168404Spjd */ 1211185029Spjd if (end <= zp->z_phys->zp_size) { 1212168404Spjd zfs_range_unlock(rl); 1213168404Spjd return (0); 1214168404Spjd } 1215185029Spjdtop: 1216168404Spjd tx = dmu_tx_create(zfsvfs->z_os); 1217168404Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1218185029Spjd if (end > zp->z_blksz && 1219168404Spjd (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1220168404Spjd /* 1221168404Spjd * We are growing the file past the current block size. 1222168404Spjd */ 1223168404Spjd if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1224168404Spjd ASSERT(!ISP2(zp->z_blksz)); 1225185029Spjd newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1226168404Spjd } else { 1227185029Spjd newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1228168404Spjd } 1229185029Spjd dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1230185029Spjd } else { 1231185029Spjd newblksz = 0; 1232168404Spjd } 1233168404Spjd 1234209962Smm error = dmu_tx_assign(tx, TXG_NOWAIT); 1235168404Spjd if (error) { 1236209962Smm if (error == ERESTART) { 1237168404Spjd dmu_tx_wait(tx); 1238185029Spjd dmu_tx_abort(tx); 1239185029Spjd goto top; 1240185029Spjd } 1241168404Spjd dmu_tx_abort(tx); 1242168404Spjd zfs_range_unlock(rl); 1243168404Spjd return (error); 1244168404Spjd } 1245185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1246168404Spjd 1247185029Spjd if (newblksz) 1248185029Spjd zfs_grow_blocksize(zp, newblksz, tx); 1249168404Spjd 1250185029Spjd zp->z_phys->zp_size = end; 1251168404Spjd 1252185029Spjd zfs_range_unlock(rl); 1253168404Spjd 1254185029Spjd dmu_tx_commit(tx); 1255185029Spjd 1256185029Spjd vnode_pager_setsize(ZTOV(zp), end); 1257185029Spjd 1258185029Spjd return (0); 1259185029Spjd} 1260185029Spjd 1261185029Spjd/* 1262185029Spjd * Free space in a file. 1263185029Spjd * 1264185029Spjd * IN: zp - znode of file to free data in. 1265185029Spjd * off - start of section to free. 1266185029Spjd * len - length of section to free. 1267185029Spjd * 1268185029Spjd * RETURN: 0 if success 1269185029Spjd * error code if failure 1270185029Spjd */ 1271185029Spjdstatic int 1272185029Spjdzfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1273185029Spjd{ 1274185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1275185029Spjd rl_t *rl; 1276185029Spjd int error; 1277185029Spjd 1278185029Spjd /* 1279185029Spjd * Lock the range being freed. 1280185029Spjd */ 1281185029Spjd rl = zfs_range_lock(zp, off, len, RL_WRITER); 1282185029Spjd 1283185029Spjd /* 1284185029Spjd * Nothing to do if file already at desired length. 1285185029Spjd */ 1286185029Spjd if (off >= zp->z_phys->zp_size) { 1287185029Spjd zfs_range_unlock(rl); 1288185029Spjd return (0); 1289168404Spjd } 1290168404Spjd 1291185029Spjd if (off + len > zp->z_phys->zp_size) 1292185029Spjd len = zp->z_phys->zp_size - off; 1293185029Spjd 1294185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1295185029Spjd 1296185029Spjd if (error == 0) { 1297185029Spjd /* 1298185029Spjd * In FreeBSD we cannot free block in the middle of a file, 1299185029Spjd * but only at the end of a file. 1300185029Spjd */ 1301185029Spjd vnode_pager_setsize(ZTOV(zp), off); 1302168404Spjd } 1303168404Spjd 1304168404Spjd zfs_range_unlock(rl); 1305168404Spjd 1306185029Spjd return (error); 1307185029Spjd} 1308185029Spjd 1309185029Spjd/* 1310185029Spjd * Truncate a file 1311185029Spjd * 1312185029Spjd * IN: zp - znode of file to free data in. 1313185029Spjd * end - new end-of-file. 1314185029Spjd * 1315185029Spjd * RETURN: 0 if success 1316185029Spjd * error code if failure 1317185029Spjd */ 1318185029Spjdstatic int 1319185029Spjdzfs_trunc(znode_t *zp, uint64_t end) 1320185029Spjd{ 1321185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1322185029Spjd vnode_t *vp = ZTOV(zp); 1323185029Spjd dmu_tx_t *tx; 1324185029Spjd rl_t *rl; 1325185029Spjd int error; 1326185029Spjd 1327185029Spjd /* 1328185029Spjd * We will change zp_size, lock the whole file. 1329185029Spjd */ 1330185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1331185029Spjd 1332185029Spjd /* 1333185029Spjd * Nothing to do if file already at desired length. 1334185029Spjd */ 1335185029Spjd if (end >= zp->z_phys->zp_size) { 1336185029Spjd zfs_range_unlock(rl); 1337185029Spjd return (0); 1338185029Spjd } 1339185029Spjd 1340185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1341185029Spjd if (error) { 1342185029Spjd zfs_range_unlock(rl); 1343185029Spjd return (error); 1344185029Spjd } 1345185029Spjdtop: 1346185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1347185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1348209962Smm error = dmu_tx_assign(tx, TXG_NOWAIT); 1349185029Spjd if (error) { 1350209962Smm if (error == ERESTART) { 1351185029Spjd dmu_tx_wait(tx); 1352185029Spjd dmu_tx_abort(tx); 1353185029Spjd goto top; 1354185029Spjd } 1355185029Spjd dmu_tx_abort(tx); 1356185029Spjd zfs_range_unlock(rl); 1357185029Spjd return (error); 1358185029Spjd } 1359185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1360185029Spjd 1361185029Spjd zp->z_phys->zp_size = end; 1362185029Spjd 1363168404Spjd dmu_tx_commit(tx); 1364168404Spjd 1365168404Spjd /* 1366168404Spjd * Clear any mapped pages in the truncated region. This has to 1367168404Spjd * happen outside of the transaction to avoid the possibility of 1368168404Spjd * a deadlock with someone trying to push a page that we are 1369168404Spjd * about to invalidate. 1370168404Spjd */ 1371185029Spjd vnode_pager_setsize(vp, end); 1372168404Spjd 1373209962Smm zfs_range_unlock(rl); 1374209962Smm 1375168404Spjd return (0); 1376168404Spjd} 1377168404Spjd 1378185029Spjd/* 1379185029Spjd * Free space in a file 1380185029Spjd * 1381185029Spjd * IN: zp - znode of file to free data in. 1382185029Spjd * off - start of range 1383185029Spjd * len - end of range (0 => EOF) 1384185029Spjd * flag - current file open mode flags. 1385185029Spjd * log - TRUE if this action should be logged 1386185029Spjd * 1387185029Spjd * RETURN: 0 if success 1388185029Spjd * error code if failure 1389185029Spjd */ 1390185029Spjdint 1391185029Spjdzfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1392185029Spjd{ 1393185029Spjd vnode_t *vp = ZTOV(zp); 1394185029Spjd dmu_tx_t *tx; 1395185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1396185029Spjd zilog_t *zilog = zfsvfs->z_log; 1397185029Spjd int error; 1398185029Spjd 1399185029Spjd if (off > zp->z_phys->zp_size) { 1400185029Spjd error = zfs_extend(zp, off+len); 1401185029Spjd if (error == 0 && log) 1402185029Spjd goto log; 1403185029Spjd else 1404185029Spjd return (error); 1405185029Spjd } 1406185029Spjd 1407185029Spjd if (len == 0) { 1408185029Spjd error = zfs_trunc(zp, off); 1409185029Spjd } else { 1410185029Spjd if ((error = zfs_free_range(zp, off, len)) == 0 && 1411185029Spjd off + len > zp->z_phys->zp_size) 1412185029Spjd error = zfs_extend(zp, off+len); 1413185029Spjd } 1414185029Spjd if (error || !log) 1415185029Spjd return (error); 1416185029Spjdlog: 1417185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1418185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1419209962Smm error = dmu_tx_assign(tx, TXG_NOWAIT); 1420185029Spjd if (error) { 1421209962Smm if (error == ERESTART) { 1422185029Spjd dmu_tx_wait(tx); 1423185029Spjd dmu_tx_abort(tx); 1424185029Spjd goto log; 1425185029Spjd } 1426185029Spjd dmu_tx_abort(tx); 1427185029Spjd return (error); 1428185029Spjd } 1429185029Spjd 1430185029Spjd zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1431185029Spjd zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1432185029Spjd 1433185029Spjd dmu_tx_commit(tx); 1434185029Spjd return (0); 1435185029Spjd} 1436185029Spjd 1437168404Spjdvoid 1438185029Spjdzfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1439168404Spjd{ 1440168404Spjd zfsvfs_t zfsvfs; 1441209962Smm uint64_t moid, obj, version; 1442185029Spjd uint64_t sense = ZFS_CASE_SENSITIVE; 1443185029Spjd uint64_t norm = 0; 1444185029Spjd nvpair_t *elem; 1445168404Spjd int error; 1446207334Spjd int i; 1447168404Spjd znode_t *rootzp = NULL; 1448199156Spjd vnode_t vnode; 1449168404Spjd vattr_t vattr; 1450185029Spjd znode_t *zp; 1451209962Smm zfs_acl_ids_t acl_ids; 1452168404Spjd 1453168404Spjd /* 1454168404Spjd * First attempt to create master node. 1455168404Spjd */ 1456168404Spjd /* 1457168404Spjd * In an empty objset, there are no blocks to read and thus 1458168404Spjd * there can be no i/o errors (which we assert below). 1459168404Spjd */ 1460168404Spjd moid = MASTER_NODE_OBJ; 1461168404Spjd error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1462168404Spjd DMU_OT_NONE, 0, tx); 1463168404Spjd ASSERT(error == 0); 1464168404Spjd 1465168404Spjd /* 1466168404Spjd * Set starting attributes. 1467168404Spjd */ 1468209962Smm if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) 1469185029Spjd version = ZPL_VERSION; 1470209962Smm else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1471209962Smm version = ZPL_VERSION_USERSPACE - 1; 1472185029Spjd else 1473185029Spjd version = ZPL_VERSION_FUID - 1; 1474185029Spjd elem = NULL; 1475185029Spjd while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1476185029Spjd /* For the moment we expect all zpl props to be uint64_ts */ 1477185029Spjd uint64_t val; 1478185029Spjd char *name; 1479168404Spjd 1480185029Spjd ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1481185029Spjd VERIFY(nvpair_value_uint64(elem, &val) == 0); 1482185029Spjd name = nvpair_name(elem); 1483185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1484209962Smm if (val < version) 1485209962Smm version = val; 1486185029Spjd } else { 1487185029Spjd error = zap_update(os, moid, name, 8, 1, &val, tx); 1488185029Spjd } 1489185029Spjd ASSERT(error == 0); 1490185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1491185029Spjd norm = val; 1492185029Spjd else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1493185029Spjd sense = val; 1494185029Spjd } 1495185029Spjd ASSERT(version != 0); 1496209962Smm error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); 1497168404Spjd 1498168404Spjd /* 1499168404Spjd * Create a delete queue. 1500168404Spjd */ 1501209962Smm obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1502168404Spjd 1503209962Smm error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); 1504168404Spjd ASSERT(error == 0); 1505168404Spjd 1506168404Spjd /* 1507168404Spjd * Create root znode. Create minimal znode/vnode/zfsvfs 1508168404Spjd * to allow zfs_mknode to work. 1509168404Spjd */ 1510185029Spjd VATTR_NULL(&vattr); 1511168404Spjd vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1512168404Spjd vattr.va_type = VDIR; 1513168404Spjd vattr.va_mode = S_IFDIR|0755; 1514185029Spjd vattr.va_uid = crgetuid(cr); 1515185029Spjd vattr.va_gid = crgetgid(cr); 1516168404Spjd 1517168404Spjd rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1518199156Spjd zfs_znode_cache_constructor(rootzp, NULL, 0); 1519168404Spjd rootzp->z_unlinked = 0; 1520168404Spjd rootzp->z_atime_dirty = 0; 1521168404Spjd 1522199156Spjd vnode.v_type = VDIR; 1523199156Spjd vnode.v_data = rootzp; 1524199156Spjd rootzp->z_vnode = &vnode; 1525185029Spjd 1526168404Spjd bzero(&zfsvfs, sizeof (zfsvfs_t)); 1527168404Spjd 1528168404Spjd zfsvfs.z_os = os; 1529168404Spjd zfsvfs.z_parent = &zfsvfs; 1530185029Spjd zfsvfs.z_version = version; 1531185029Spjd zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1532185029Spjd zfsvfs.z_norm = norm; 1533185029Spjd /* 1534185029Spjd * Fold case on file systems that are always or sometimes case 1535185029Spjd * insensitive. 1536185029Spjd */ 1537185029Spjd if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1538185029Spjd zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1539168404Spjd 1540168404Spjd mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1541168404Spjd list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1542168404Spjd offsetof(znode_t, z_link_node)); 1543168404Spjd 1544207334Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1545207334Spjd mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1546207334Spjd 1547185029Spjd ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1548185029Spjd rootzp->z_zfsvfs = &zfsvfs; 1549209962Smm VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1550209962Smm cr, NULL, &acl_ids)); 1551209962Smm zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); 1552185029Spjd ASSERT3P(zp, ==, rootzp); 1553185029Spjd error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1554168404Spjd ASSERT(error == 0); 1555209962Smm zfs_acl_ids_free(&acl_ids); 1556185029Spjd POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1557168404Spjd 1558185029Spjd dmu_buf_rele(rootzp->z_dbuf, NULL); 1559185029Spjd rootzp->z_dbuf = NULL; 1560209962Smm rootzp->z_vnode = NULL; 1561209962Smm kmem_cache_free(znode_cache, rootzp); 1562209962Smm 1563209962Smm /* 1564209962Smm * Create shares directory 1565209962Smm */ 1566209962Smm 1567209962Smm error = zfs_create_share_dir(&zfsvfs, tx); 1568209962Smm 1569209962Smm ASSERT(error == 0); 1570209962Smm 1571207334Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1572207334Spjd mutex_destroy(&zfsvfs.z_hold_mtx[i]); 1573168404Spjd} 1574185029Spjd 1575168404Spjd#endif /* _KERNEL */ 1576168404Spjd/* 1577168404Spjd * Given an object number, return its parent object number and whether 1578168404Spjd * or not the object is an extended attribute directory. 1579168404Spjd */ 1580168404Spjdstatic int 1581168404Spjdzfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1582168404Spjd{ 1583168404Spjd dmu_buf_t *db; 1584168404Spjd dmu_object_info_t doi; 1585168404Spjd znode_phys_t *zp; 1586168404Spjd int error; 1587168404Spjd 1588168404Spjd if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1589168404Spjd return (error); 1590168404Spjd 1591168404Spjd dmu_object_info_from_db(db, &doi); 1592168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 1593168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 1594168404Spjd dmu_buf_rele(db, FTAG); 1595168404Spjd return (EINVAL); 1596168404Spjd } 1597168404Spjd 1598168404Spjd zp = db->db_data; 1599168404Spjd *pobjp = zp->zp_parent; 1600168404Spjd *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1601168404Spjd S_ISDIR(zp->zp_mode); 1602168404Spjd dmu_buf_rele(db, FTAG); 1603168404Spjd 1604168404Spjd return (0); 1605168404Spjd} 1606168404Spjd 1607168404Spjdint 1608168404Spjdzfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1609168404Spjd{ 1610168404Spjd char *path = buf + len - 1; 1611168404Spjd int error; 1612168404Spjd 1613168404Spjd *path = '\0'; 1614168404Spjd 1615168404Spjd for (;;) { 1616168404Spjd uint64_t pobj; 1617168404Spjd char component[MAXNAMELEN + 2]; 1618168404Spjd size_t complen; 1619168404Spjd int is_xattrdir; 1620168404Spjd 1621168404Spjd if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1622168404Spjd &is_xattrdir)) != 0) 1623168404Spjd break; 1624168404Spjd 1625168404Spjd if (pobj == obj) { 1626168404Spjd if (path[0] != '/') 1627168404Spjd *--path = '/'; 1628168404Spjd break; 1629168404Spjd } 1630168404Spjd 1631168404Spjd component[0] = '/'; 1632168404Spjd if (is_xattrdir) { 1633168404Spjd (void) sprintf(component + 1, "<xattrdir>"); 1634168404Spjd } else { 1635185029Spjd error = zap_value_search(osp, pobj, obj, 1636185029Spjd ZFS_DIRENT_OBJ(-1ULL), component + 1); 1637168404Spjd if (error != 0) 1638168404Spjd break; 1639168404Spjd } 1640168404Spjd 1641168404Spjd complen = strlen(component); 1642168404Spjd path -= complen; 1643168404Spjd ASSERT(path >= buf); 1644168404Spjd bcopy(component, path, complen); 1645168404Spjd obj = pobj; 1646168404Spjd } 1647168404Spjd 1648168404Spjd if (error == 0) 1649168404Spjd (void) memmove(buf, path, buf + len - path); 1650168404Spjd return (error); 1651168404Spjd} 1652