zfs_znode.c revision 210470
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22209962Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 
24168404Spjd */ 25168404Spjd 26169195Spjd/* Portions Copyright 2007 Jeremy Teo */ 27169195Spjd 28168404Spjd#ifdef _KERNEL 29168404Spjd#include <sys/types.h> 30168404Spjd#include <sys/param.h> 31168404Spjd#include <sys/time.h> 32168404Spjd#include <sys/systm.h> 33168404Spjd#include <sys/sysmacros.h> 34168404Spjd#include <sys/resource.h> 35168404Spjd#include <sys/mntent.h> 36185029Spjd#include <sys/u8_textprep.h> 37185029Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/vfs.h> 39168404Spjd#include <sys/vnode.h> 40168404Spjd#include <sys/file.h> 41168404Spjd#include <sys/kmem.h> 42168404Spjd#include <sys/errno.h> 43168404Spjd#include <sys/unistd.h> 44168404Spjd#include <sys/atomic.h> 45168404Spjd#include <sys/zfs_dir.h> 46168404Spjd#include <sys/zfs_acl.h> 47168404Spjd#include <sys/zfs_ioctl.h> 48168404Spjd#include <sys/zfs_rlock.h> 49185029Spjd#include <sys/zfs_fuid.h> 50168404Spjd#include <sys/fs/zfs.h> 51185029Spjd#include <sys/kidmap.h> 52168404Spjd#endif /* _KERNEL */ 53168404Spjd 54168404Spjd#include <sys/dmu.h> 55168404Spjd#include <sys/refcount.h> 56168404Spjd#include <sys/stat.h> 57168404Spjd#include <sys/zap.h> 58168404Spjd#include <sys/zfs_znode.h> 59168404Spjd#include <sys/refcount.h> 60168404Spjd 61185029Spjd#include "zfs_prop.h" 62185029Spjd 63173268Slulf/* Used by fstat(1). */ 64173268SlulfSYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), 65173268Slulf "sizeof(znode_t)"); 66173268Slulf 67168404Spjd/* 68185029Spjd * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 69185029Spjd * turned on when DEBUG is also defined. 
/*
 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef DEBUG
#define	ZNODE_STATS
#endif	/* DEBUG */

/*
 * ZNODE_STAT_ADD() post-increments the given counter when ZNODE_STATS is
 * defined and expands to nothing otherwise.
 */
#ifdef	ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif	/* ZNODE_STATS */

/*
 * A pointer counts as valid only while its two low-order bits are clear.
 * POINTER_INVALIDATE() takes the address of a pointer and sets the lowest
 * bit of the stored value, so a later POINTER_IS_VALID() on it fails.
 */
#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))

/*
 * Functions needed for userland (ie: libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL
/*
 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
 * be freed before it can be safely accessed.
 */
krwlock_t zfsvfs_lock;

/* Cache from which all znodes are allocated; created in zfs_znode_init(). */
static kmem_cache_t *znode_cache = NULL;

/*
 * dbuf eviction callback for znodes (it is the callback handed to
 * dmu_buf_set_user_ie() and dmu_buf_update_user() elsewhere in this file).
 *
 *	dbuf     - unused
 *	user_ptr - the znode registered as the dbuf's user
 *
 * With the "#if 1" branch active the callback only panics.  The "#else"
 * branch is compiled out: it would clear z_dbuf and free the znode,
 * recycling the vnode first when its v_count is 0.
 */
/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
#if 1	/* XXXPJD: From OpenSolaris. */
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
#else	/* XXXPJD */
	znode_t *zp = user_ptr;
	vnode_t *vp;

	mutex_enter(&zp->z_lock);
	zp->z_dbuf = NULL;
	vp = ZTOV(zp);
	if (vp == NULL) {
		/* No vnode attached: the znode can be freed directly. */
		mutex_exit(&zp->z_lock);
		zfs_znode_free(zp);
	} else if (vp->v_count == 0) {
		/* Detach the vnode, recycle it, then free the znode. */
		zp->z_vnode = NULL;
		vhold(vp);
		mutex_exit(&zp->z_lock);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
		vrecycle(vp, curthread);
		VOP_UNLOCK(vp, 0);
		vdrop(vp);
		zfs_znode_free(zp);
	} else {
		/* The vnode still has users; leave the znode alone. */
		mutex_exit(&zp->z_lock);
	}
#endif
}

/* Vnode operation vectors defined outside this file. */
extern struct vop_vector zfs_vnodeops;
extern struct vop_vector zfs_fifoops;
extern struct vop_vector zfs_shareops;
/*
 * XXX: We cannot use this function as a cache constructor, because
 *      there is one global cache for all file systems and we need
 *      to pass vfsp here, which is not possible, because argument
 *      'cdrarg' is defined at kmem_cache_create() time.
 *
 * Initializes a freshly allocated znode.
 *
 *	buf     - znode memory to initialize
 *	arg     - vfs_t * to create a vnode on, or NULL for a vnode-less znode
 *	kmflags - only KM_NOSLEEP is examined
 *
 * Returns 0 on success, or -1 when getnewvnode() fails and KM_NOSLEEP was
 * requested.  When a vnode is created it is returned exclusively locked
 * and linked to the znode in both directions (z_vnode / v_data).
 */
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;
	vnode_t *vp;
	vfs_t *vfsp = arg;
	int error;

	/* Mark the znode as not yet known to any file system. */
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	if (vfsp != NULL) {
		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
		if (error != 0 && (kmflags & KM_NOSLEEP))
			return (-1);
		/*
		 * NOTE(review): if ASSERT() compiles away, a getnewvnode()
		 * failure without KM_NOSLEEP falls through with vp unset;
		 * presumably getnewvnode() cannot fail here - confirm.
		 */
		ASSERT(error == 0);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		zp->z_vnode = vp;
		vp->v_data = (caddr_t)zp;
		VN_LOCK_AREC(vp);
	} else {
		zp->z_vnode = NULL;
	}

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	zp->z_dbuf = NULL;
	zp->z_dirlocks = NULL;
	return (0);
}

/*
 * Cache destructor for znodes (registered in zfs_znode_init()): tears down
 * the locks and the range AVL tree set up by the constructor.  The znode
 * must already be invalidated, detached from its vnode and dbuf, and off
 * the znode list.
 *
 *	buf - znode being destroyed
 *	arg - unused
 */
/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp) == NULL);
	/*
	 * NOTE(review): ZTOV(zp) was just asserted to be NULL, so vn_free()
	 * is presumably a no-op or NULL-tolerant on this platform - confirm.
	 */
	vn_free(ZTOV(zp));
	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	avl_destroy(&zp->z_range_avl);
	mutex_destroy(&zp->z_range_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_dirlocks == NULL);
}

/*
 * One counter per early-exit path of zfs_znode_move(); each is bumped
 * through ZNODE_STAT_ADD() just before the corresponding return.
 */
#ifdef	ZNODE_STATS
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_recheck1;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck2;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */

#if defined(sun)
/*
 * Transfer the identity of znode ozp to znode nzp: copy the scalar fields,
 * swap the vnodes, repoint the dbuf user at nzp, then invalidate ozp.
 * Called from zfs_znode_move() with the locks it acquired still held.
 */
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_last_itx = ozp->z_last_itx;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_phys = ozp->z_phys;
	nzp->z_dbuf = ozp->z_dbuf;

	/* Update back pointers. */
	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
	    znode_evict_error);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_dbuf = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);
}

/*
 * kmem move callback for the znode cache (installed with
 * kmem_cache_set_move() in zfs_znode_init()).
 *
 *	buf    - znode the allocator wants to relocate
 *	newbuf - destination znode
 *	size   - unused
 *	arg    - unused
 *
 * Returns KMEM_CBRC_DONT_KNOW when the znode cannot be tied to a live file
 * system, KMEM_CBRC_LATER when a needed lock is busy or the vnode is
 * referenced by more than the DNLC, and KMEM_CBRC_YES after a successful
 * move.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem could
	 * be unmounted and freed, and zfsvfs, though valid in the previous
	 * statement, could point to unrelated memory by the time we try to
	 * prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know that
	 * no valid file system can be freed while we hold zfsvfs_lock, so we
	 * can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Put the new znode in the old one's place on the znode list. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
#endif /* sun */

/*
 * Module setup: initialize zfsvfs_lock and create the znode cache.  The
 * cache is created without a constructor (NULL is passed in its place);
 * callers in this file invoke zfs_znode_cache_constructor() by hand after
 * kmem_cache_alloc().
 */
void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
#if defined(sun)
	kmem_cache_set_move(znode_cache, zfs_znode_move);
#endif
}

/*
 * Module teardown: destroy the znode cache (if it was created) and
 * zfsvfs_lock.
 */
void
zfs_znode_fini(void)
{
	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
	rw_destroy(&zfsvfs_lock);
}

/*
 * Create the on-disk shares directory object and record it in the master
 * node under ZFS_SHARES_DIR.
 *
 *	zfsvfs - file system to create the directory in
 *	tx     - transaction the creation is part of
 *
 * A temporary znode backed by an on-stack vnode acts as its own parent
 * (IS_ROOT_NODE) for zfs_mknode(); it is torn down again before returning.
 * zfsvfs->z_shares_dir is set to the new object id regardless of the
 * zap_add() result.  Returns the zap_add() result.
 */
int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	vnode_t *vp, vnode;
	znode_t *zp;
	int error;

	/* Directory, mode 0555, owned by the kernel credential. */
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;

	/*
	 * NOTE(review): when z_vfs is non-NULL the constructor call above
	 * stores a vnode from getnewvnode() in z_vnode; that pointer is
	 * overwritten here and no release of it is visible in this
	 * function - confirm it is not leaked.
	 */
	sharezp->z_vnode = &vnode;
	vnode.v_data = sharezp;

	vp = ZTOV(sharezp);
	vp->v_type = VDIR;

	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
	    &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	/* Undo the temporary znode/vnode linkage and free the znode. */
	zfs_acl_ids_free(&acl_ids);
	ZTOV(sharezp)->v_data = NULL;
	ZTOV(sharezp)->v_count = 0;
	ZTOV(sharezp)->v_holdcnt = 0;
	zp->z_vnode = NULL;
	sharezp->z_vnode = NULL;
	dmu_buf_rele(sharezp->z_dbuf, NULL);
	sharezp->z_dbuf = NULL;
	kmem_cache_free(znode_cache, sharezp);

	return (error);
}
/*
 * Fallback values for the 64-bit device-number layout, needed in both
 * 64 and 32 bit environments when the system headers do not supply them.
 */
#ifndef	NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef	MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef	MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif

/*
 * ZFS-private expldev.
 *
 * The standard expldev() takes a dev32_t in LP64 and expands it to a
 * long dev_t, which is not what is wanted here: we need an interface
 * that takes a dev32_t in ILP32 and expands it to a long dev_t.
 *
 * The result carries major(dev) above bit NBITSMINOR64 and minor(dev)
 * in the low bits.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
	uint64_t major_bits = (uint64_t)major(dev);
	uint64_t minor_bits = minor(dev);

	return ((major_bits << NBITSMINOR64) | minor_bits);
}
/*
 * Special cmpldev for ZFS private use.
 * Can't use standard cmpldev since it takes
 * a long dev_t and compresses it to dev32_t in
 * LP64.  We need to do a compaction of a long dev_t
 * to a dev32_t in ILP32.
 *
 * The major number is taken from the bits above NBITSMINOR64 and the
 * minor number from the bits selected by MAXMIN64.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
}

/*
 * Attach bonus buffer db to znode zp and register zp as the dbuf's user,
 * with znode_evict_error() as the eviction callback.
 *
 *	zfsvfs - file system the znode belongs to
 *	zp     - znode being set up; z_dbuf must still be NULL
 *	db     - held bonus buffer of the object
 *
 * The caller must hold the object mutex for zp->z_id.  Panics if the dbuf
 * already has a user.  Sets VROOT on the vnode when zp is the root znode.
 */
static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t *nzp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	zp->z_dbuf = db;
	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);

	/*
	 * there should be no
	 * concurrent zgets on this object.
	 */
	if (nzp != NULL)
		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}

/*
 * Detach znode zp from its bonus buffer: clear z_dbuf, remove zp as the
 * dbuf's user and drop the hold on the buffer.
 *
 * The caller must hold the object mutex, or the znode must be unlinked,
 * or z_teardown_inactive_lock must be write-held.
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	dmu_buf_t *db = zp->z_dbuf;
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
	ASSERT(zp->z_dbuf != NULL);
	zp->z_dbuf = NULL;
	/* The previous user of the dbuf must have been this znode. */
	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
	dmu_buf_rele(db, NULL);
}
/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 * NOTE(review): the sentence above looks stale - this function calls
 * zfs_znode_dmu_init(), which registers the znode as the dbuf's user via
 * dmu_buf_set_user_ie().  Confirm and reword.
 *
 *	zfsvfs - file system the znode will belong to
 *	db     - held bonus buffer of the object
 *	blksz  - value stored in z_blksz
 *
 * Returns the new znode, already on zfsvfs->z_all_znodes, with a hold
 * taken on zfsvfs->z_vfs.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
{
	znode_t *zp;
	vnode_t *vp;

	/* The cache has no constructor, so run it by hand. */
	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_dbuf == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
	 * the zfs_znode_move() callback.
	 */
	zp->z_phys = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;	/* bytes are ASCII 'z', 'F', 'S' */
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);
#ifdef TODO
	vn_reinit(vp);
#endif

	zfs_znode_dmu_init(zfsvfs, zp, db);

	zp->z_gen = zp->z_phys->zp_gen;

#if 0
	if (vp == NULL)
		return (zp);
#endif

	/* Derive the vnode type from the on-disk mode, then pick the ops. */
	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VFIFO:
		vp->v_op = &zfs_fifoops;
		break;
	case VREG:
		/* Regular files directly under the shares directory. */
		if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
			vp->v_op = &zfs_shareops;
		}
		break;
	}
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);
	return (zp);
}
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		bonuslen - length of bonus buffer
 *		acl_ids	- supplies the new znode's uid, gid and mode
 *			  (z_fuid, z_fgid, z_mode) and its ACL (z_aclp)
 *
 *	OUT:	zpp	- allocated znode
 *
 * During replay (zfsvfs->z_replay) the object number, creation time and
 * generation are taken from vap instead of being freshly allocated.
 * For IS_ROOT_NODE no znode is allocated; *zpp is set to dzp.
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
{
	dmu_buf_t *db;
	znode_phys_t *pzp;
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	timestruc_t now;
	uint64_t gen, obj;
	int err;

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (zfsvfs->z_replay) {
			err = zap_create_claim_norm(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = zap_create_norm(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			err = dmu_object_claim(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	}

	/* The object hold is kept until the znode is fully set up below. */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
	dmu_buf_will_dirty(db, tx);

	/*
	 * Initialize the znode physical data to zero.
	 */
	ASSERT(db->db_size >= sizeof (znode_phys_t));
	bzero(db->db_data, db->db_size);
	pzp = db->db_data;

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_dbuf = db;
		dzp->z_phys = pzp;
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_phys->zp_flags & ZFS_XATTR)
		flag |= IS_XATTR;

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
	}

	if (zfsvfs->z_use_fuids)
		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;

	if (vap->va_type == VDIR) {
		pzp->zp_size = 2;		/* contents ("." and "..") */
		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	}

	pzp->zp_parent = dzp->z_id;
	if (flag & IS_XATTR)
		pzp->zp_flags |= ZFS_XATTR;

	pzp->zp_gen = gen;

	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

	/* atime/mtime come from vap when supplied, otherwise from 'now'. */
	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
	}

	/*
	 * The mode is set from vap before zfs_znode_alloc() runs (which
	 * reads zp_mode to pick the vnode type) and is then replaced by
	 * acl_ids->z_mode below.
	 */
	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
	if (!(flag & IS_ROOT_NODE)) {
		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;
	}
	pzp->zp_uid = acl_ids->z_fuid;
	pzp->zp_gid = acl_ids->z_fgid;
	pzp->zp_mode = acl_ids->z_mode;
	VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	if (vap->va_mask & AT_XVATTR)
		zfs_xvattr_set(*zpp, (xvattr_t *)vap);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	if (!(flag & IS_ROOT_NODE)) {
		vnode_t *vp;

		/* Put the new vnode on the mount's vnode list. */
		vp = ZTOV(*zpp);
		vp->v_vflag |= VV_FORCEINSMQ;
		err = insmntque(vp, zfsvfs->z_vfs);
		vp->v_vflag &= ~VV_FORCEINSMQ;
		KASSERT(err == 0, ("insmntque() failed: error %d", err));
	}
}

/*
 * Apply the optional attributes requested in xvap to znode zp.
 *
 * For every attribute flagged as requested (XVA_ISSET_REQ) the value from
 * the xoptattr_t is stored into the znode's physical data and the
 * attribute is marked as returned (XVA_SET_RTN).
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
{
	xoptattr_t *xoap;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		/*
		 * The scanstamp is copied to the bytes immediately after
		 * the znode_phys_t (z_phys + 1).
		 */
		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
		    sizeof (xoap->xoa_av_scanstamp));
		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
}

/*
 * Look up the znode for object obj_num, creating one if none is attached
 * to the object's bonus buffer yet.
 *
 *	zfsvfs  - file system to look in
 *	obj_num - object number
 *	zpp     - OUT: the znode; set to NULL on entry and left NULL on error
 *
 * Returns 0 on success, the dmu_bonus_hold() error if the bonus buffer
 * cannot be held, EINVAL if the bonus buffer is not a znode, or ENOENT if
 * the znode is unlinked or the object's generation number is zero.
 *
 * When an existing znode is found whose vnode is missing or doomed, all
 * holds are dropped and the lookup is retried after a short sleep.
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	vnode_t *vp;
	int err, first = 1;

	*zpp = NULL;
again:
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	zp = dmu_buf_get_user(db);
	if (zp != NULL) {
		mutex_enter(&zp->z_lock);

		/*
		 * Since we do immediate eviction of the z_dbuf, we
		 * should never find a dbuf with a znode that doesn't
		 * know about the dbuf.
		 */
		ASSERT3P(zp->z_dbuf, ==, db);
		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_unlinked) {
			err = ENOENT;
		} else {
			int dying = 0;

			vp = ZTOV(zp);
			if (vp == NULL)
				dying = 1;
			else {
				VN_HOLD(vp);
				if ((vp->v_iflag & VI_DOOMED) != 0) {
					dying = 1;
					/*
					 * Don't VN_RELE() vnode here, because
					 * it can call vn_lock() which creates
					 * LOR between vnode lock and znode
					 * lock. We will VN_RELE() the vnode
					 * after dropping znode lock.
					 */
				}
			}
			if (dying) {
				/* Log only on the first retry. */
				if (first) {
					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
					first = 0;
				}
				/*
				 * znode is dying so we can't reuse it, we must
				 * wait until destruction is completed.
				 */
				dmu_buf_rele(db, NULL);
				mutex_exit(&zp->z_lock);
				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
				if (vp != NULL)
					VN_RELE(vp);
				tsleep(zp, 0, "zcollide", 1);
				goto again;
			}
			*zpp = zp;
			err = 0;
		}
		/* The existing znode has its own hold on the dbuf. */
		dmu_buf_rele(db, NULL);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Not found create new znode/vnode
	 * but only if file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  Since a gen number can never be zero
	 * we will check that to determine if it's an allocated
	 * file.
	 */

	if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
		zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
		*zpp = zp;
		vp = ZTOV(zp);
		vp->v_vflag |= VV_FORCEINSMQ;
		err = insmntque(vp, zfsvfs->z_vfs);
		vp->v_vflag &= ~VV_FORCEINSMQ;
		KASSERT(err == 0, ("insmntque() failed: error %d", err));
		VOP_UNLOCK(vp, 0);
		err = 0;
	} else {
		dmu_buf_rele(db, NULL);
		err = ENOENT;
	}
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
	return (err);
}

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	int err;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err =
dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 964185029Spjd if (err) { 965185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 966185029Spjd return (err); 967185029Spjd } 968185029Spjd 969185029Spjd dmu_object_info_from_db(db, &doi); 970185029Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 971185029Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 972185029Spjd dmu_buf_rele(db, NULL); 973185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 974185029Spjd return (EINVAL); 975185029Spjd } 976185029Spjd 977185029Spjd if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 978185029Spjd dmu_buf_rele(db, NULL); 979185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 980185029Spjd return (EIO); 981185029Spjd } 982185029Spjd 983185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 984185029Spjd zp->z_unlinked = (zp->z_phys->zp_links == 0); 985185029Spjd zp->z_blksz = doi.doi_data_block_size; 986185029Spjd 987185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 988185029Spjd 989185029Spjd return (0); 990185029Spjd} 991185029Spjd 992168404Spjdvoid 993168404Spjdzfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 994168404Spjd{ 995168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 996185029Spjd objset_t *os = zfsvfs->z_os; 997185029Spjd uint64_t obj = zp->z_id; 998185029Spjd uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 999168404Spjd 1000185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 1001185029Spjd if (acl_obj) 1002185029Spjd VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1003185029Spjd VERIFY(0 == dmu_object_free(os, obj, tx)); 1004185029Spjd zfs_znode_dmu_fini(zp); 1005185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 1006185029Spjd zfs_znode_free(zp); 1007168404Spjd} 1008168404Spjd 1009168404Spjdvoid 1010168404Spjdzfs_zinactive(znode_t *zp) 1011168404Spjd{ 1012168404Spjd vnode_t *vp = ZTOV(zp); 1013168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1014168404Spjd uint64_t z_id = zp->z_id; 1015201406Sdelphij int vfslocked; 1016168404Spjd 1017185029Spjd ASSERT(zp->z_dbuf && zp->z_phys); 1018168404Spjd 1019168404Spjd /* 
1020168404Spjd * Don't allow a zfs_zget() while were trying to release this znode 1021168404Spjd */ 1022168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1023168404Spjd 1024168404Spjd mutex_enter(&zp->z_lock); 1025168404Spjd VI_LOCK(vp); 1026168404Spjd if (vp->v_count > 0) { 1027168404Spjd /* 1028168404Spjd * If the hold count is greater than zero, somebody has 1029168404Spjd * obtained a new reference on this znode while we were 1030168404Spjd * processing it here, so we are done. 1031168404Spjd */ 1032168404Spjd VI_UNLOCK(vp); 1033168404Spjd mutex_exit(&zp->z_lock); 1034168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1035168404Spjd return; 1036168404Spjd } 1037168404Spjd VI_UNLOCK(vp); 1038168404Spjd 1039168404Spjd /* 1040168404Spjd * If this was the last reference to a file with no links, 1041168404Spjd * remove the file from the file system. 1042168404Spjd */ 1043168404Spjd if (zp->z_unlinked) { 1044168404Spjd mutex_exit(&zp->z_lock); 1045168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1046168404Spjd ASSERT(vp->v_count == 0); 1047168404Spjd vrecycle(vp, curthread); 1048201406Sdelphij vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs); 1049168404Spjd zfs_rmnode(zp); 1050201406Sdelphij VFS_UNLOCK_GIANT(vfslocked); 1051168404Spjd return; 1052168404Spjd } 1053168404Spjd mutex_exit(&zp->z_lock); 1054168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1055168404Spjd} 1056168404Spjd 1057168404Spjdvoid 1058168404Spjdzfs_znode_free(znode_t *zp) 1059168404Spjd{ 1060168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1061168404Spjd 1062185029Spjd ASSERT(ZTOV(zp) == NULL); 1063168404Spjd mutex_enter(&zfsvfs->z_znodes_lock); 1064185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 1065168404Spjd list_remove(&zfsvfs->z_all_znodes, zp); 1066168404Spjd mutex_exit(&zfsvfs->z_znodes_lock); 1067168404Spjd 1068168404Spjd kmem_cache_free(znode_cache, zp); 1069185029Spjd 1070185029Spjd VFS_RELE(zfsvfs->z_vfs); 1071168404Spjd} 1072168404Spjd 1073168404Spjdvoid 1074168404Spjdzfs_time_stamper_locked(znode_t *zp, uint_t flag, 
dmu_tx_t *tx) 1075168404Spjd{ 1076168404Spjd timestruc_t now; 1077168404Spjd 1078168404Spjd ASSERT(MUTEX_HELD(&zp->z_lock)); 1079168404Spjd 1080168404Spjd gethrestime(&now); 1081168404Spjd 1082168404Spjd if (tx) { 1083168404Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1084168404Spjd zp->z_atime_dirty = 0; 1085168404Spjd zp->z_seq++; 1086168404Spjd } else { 1087168404Spjd zp->z_atime_dirty = 1; 1088168404Spjd } 1089168404Spjd 1090168404Spjd if (flag & AT_ATIME) 1091168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1092168404Spjd 1093185029Spjd if (flag & AT_MTIME) { 1094168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1095185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1096185029Spjd zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1097185029Spjd } 1098168404Spjd 1099185029Spjd if (flag & AT_CTIME) { 1100168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1101185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1102185029Spjd zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1103185029Spjd } 1104168404Spjd} 1105168404Spjd 1106168404Spjd/* 1107168404Spjd * Update the requested znode timestamps with the current time. 1108168404Spjd * If we are in a transaction, then go ahead and mark the znode 1109168404Spjd * dirty in the transaction so the timestamps will go to disk. 1110168404Spjd * Otherwise, we will get pushed next time the znode is updated 1111168404Spjd * in a transaction, or when this znode eventually goes inactive. 1112168404Spjd * 1113168404Spjd * Why is this OK? 1114168404Spjd * 1 - Only the ACCESS time is ever updated outside of a transaction. 1115168404Spjd * 2 - Multiple consecutive updates will be collapsed into a single 1116168404Spjd * znode update by the transaction grouping semantics of the DMU. 
1117168404Spjd */ 1118168404Spjdvoid 1119168404Spjdzfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1120168404Spjd{ 1121168404Spjd mutex_enter(&zp->z_lock); 1122168404Spjd zfs_time_stamper_locked(zp, flag, tx); 1123168404Spjd mutex_exit(&zp->z_lock); 1124168404Spjd} 1125168404Spjd 1126168404Spjd/* 1127168404Spjd * Grow the block size for a file. 1128168404Spjd * 1129168404Spjd * IN: zp - znode of file to free data in. 1130168404Spjd * size - requested block size 1131168404Spjd * tx - open transaction. 1132168404Spjd * 1133168404Spjd * NOTE: this function assumes that the znode is write locked. 1134168404Spjd */ 1135168404Spjdvoid 1136168404Spjdzfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1137168404Spjd{ 1138168404Spjd int error; 1139168404Spjd u_longlong_t dummy; 1140168404Spjd 1141168404Spjd if (size <= zp->z_blksz) 1142168404Spjd return; 1143168404Spjd /* 1144168404Spjd * If the file size is already greater than the current blocksize, 1145168404Spjd * we will not grow. If there is more than one block in a file, 1146168404Spjd * the blocksize cannot change. 1147168404Spjd */ 1148168404Spjd if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1149168404Spjd return; 1150168404Spjd 1151168404Spjd error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1152168404Spjd size, 0, tx); 1153168404Spjd if (error == ENOTSUP) 1154168404Spjd return; 1155168404Spjd ASSERT3U(error, ==, 0); 1156168404Spjd 1157168404Spjd /* What blocksize did we actually get? */ 1158168404Spjd dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1159168404Spjd} 1160168404Spjd 1161168404Spjd/* 1162185029Spjd * Increase the file length 1163168404Spjd * 1164168404Spjd * IN: zp - znode of file to free data in. 
1165185029Spjd * end - new end-of-file 1166168404Spjd * 1167168404Spjd * RETURN: 0 if success 1168168404Spjd * error code if failure 1169168404Spjd */ 1170185029Spjdstatic int 1171185029Spjdzfs_extend(znode_t *zp, uint64_t end) 1172168404Spjd{ 1173185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1174168404Spjd dmu_tx_t *tx; 1175168404Spjd rl_t *rl; 1176185029Spjd uint64_t newblksz; 1177168404Spjd int error; 1178168404Spjd 1179168404Spjd /* 1180185029Spjd * We will change zp_size, lock the whole file. 1181168404Spjd */ 1182185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1183168404Spjd 1184168404Spjd /* 1185168404Spjd * Nothing to do if file already at desired length. 1186168404Spjd */ 1187185029Spjd if (end <= zp->z_phys->zp_size) { 1188168404Spjd zfs_range_unlock(rl); 1189168404Spjd return (0); 1190168404Spjd } 1191185029Spjdtop: 1192168404Spjd tx = dmu_tx_create(zfsvfs->z_os); 1193168404Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1194185029Spjd if (end > zp->z_blksz && 1195168404Spjd (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1196168404Spjd /* 1197168404Spjd * We are growing the file past the current block size. 
1198168404Spjd */ 1199168404Spjd if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1200168404Spjd ASSERT(!ISP2(zp->z_blksz)); 1201185029Spjd newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1202168404Spjd } else { 1203185029Spjd newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1204168404Spjd } 1205185029Spjd dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1206185029Spjd } else { 1207185029Spjd newblksz = 0; 1208168404Spjd } 1209168404Spjd 1210209962Smm error = dmu_tx_assign(tx, TXG_NOWAIT); 1211168404Spjd if (error) { 1212209962Smm if (error == ERESTART) { 1213168404Spjd dmu_tx_wait(tx); 1214185029Spjd dmu_tx_abort(tx); 1215185029Spjd goto top; 1216185029Spjd } 1217168404Spjd dmu_tx_abort(tx); 1218168404Spjd zfs_range_unlock(rl); 1219168404Spjd return (error); 1220168404Spjd } 1221185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1222168404Spjd 1223185029Spjd if (newblksz) 1224185029Spjd zfs_grow_blocksize(zp, newblksz, tx); 1225168404Spjd 1226185029Spjd zp->z_phys->zp_size = end; 1227168404Spjd 1228185029Spjd zfs_range_unlock(rl); 1229168404Spjd 1230185029Spjd dmu_tx_commit(tx); 1231185029Spjd 1232185029Spjd vnode_pager_setsize(ZTOV(zp), end); 1233185029Spjd 1234185029Spjd return (0); 1235185029Spjd} 1236185029Spjd 1237185029Spjd/* 1238185029Spjd * Free space in a file. 1239185029Spjd * 1240185029Spjd * IN: zp - znode of file to free data in. 1241185029Spjd * off - start of section to free. 1242185029Spjd * len - length of section to free. 1243185029Spjd * 1244185029Spjd * RETURN: 0 if success 1245185029Spjd * error code if failure 1246185029Spjd */ 1247185029Spjdstatic int 1248185029Spjdzfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1249185029Spjd{ 1250185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1251185029Spjd rl_t *rl; 1252185029Spjd int error; 1253185029Spjd 1254185029Spjd /* 1255185029Spjd * Lock the range being freed. 
1256185029Spjd */ 1257185029Spjd rl = zfs_range_lock(zp, off, len, RL_WRITER); 1258185029Spjd 1259185029Spjd /* 1260185029Spjd * Nothing to do if file already at desired length. 1261185029Spjd */ 1262185029Spjd if (off >= zp->z_phys->zp_size) { 1263185029Spjd zfs_range_unlock(rl); 1264185029Spjd return (0); 1265168404Spjd } 1266168404Spjd 1267185029Spjd if (off + len > zp->z_phys->zp_size) 1268185029Spjd len = zp->z_phys->zp_size - off; 1269185029Spjd 1270185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1271185029Spjd 1272185029Spjd if (error == 0) { 1273185029Spjd /* 1274185029Spjd * In FreeBSD we cannot free block in the middle of a file, 1275185029Spjd * but only at the end of a file. 1276185029Spjd */ 1277185029Spjd vnode_pager_setsize(ZTOV(zp), off); 1278168404Spjd } 1279168404Spjd 1280168404Spjd zfs_range_unlock(rl); 1281168404Spjd 1282185029Spjd return (error); 1283185029Spjd} 1284185029Spjd 1285185029Spjd/* 1286185029Spjd * Truncate a file 1287185029Spjd * 1288185029Spjd * IN: zp - znode of file to free data in. 1289185029Spjd * end - new end-of-file. 1290185029Spjd * 1291185029Spjd * RETURN: 0 if success 1292185029Spjd * error code if failure 1293185029Spjd */ 1294185029Spjdstatic int 1295185029Spjdzfs_trunc(znode_t *zp, uint64_t end) 1296185029Spjd{ 1297185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1298185029Spjd vnode_t *vp = ZTOV(zp); 1299185029Spjd dmu_tx_t *tx; 1300185029Spjd rl_t *rl; 1301185029Spjd int error; 1302185029Spjd 1303185029Spjd /* 1304185029Spjd * We will change zp_size, lock the whole file. 1305185029Spjd */ 1306185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1307185029Spjd 1308185029Spjd /* 1309185029Spjd * Nothing to do if file already at desired length. 
1310185029Spjd */ 1311185029Spjd if (end >= zp->z_phys->zp_size) { 1312185029Spjd zfs_range_unlock(rl); 1313185029Spjd return (0); 1314185029Spjd } 1315185029Spjd 1316185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1317185029Spjd if (error) { 1318185029Spjd zfs_range_unlock(rl); 1319185029Spjd return (error); 1320185029Spjd } 1321185029Spjdtop: 1322185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1323185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1324209962Smm error = dmu_tx_assign(tx, TXG_NOWAIT); 1325185029Spjd if (error) { 1326209962Smm if (error == ERESTART) { 1327185029Spjd dmu_tx_wait(tx); 1328185029Spjd dmu_tx_abort(tx); 1329185029Spjd goto top; 1330185029Spjd } 1331185029Spjd dmu_tx_abort(tx); 1332185029Spjd zfs_range_unlock(rl); 1333185029Spjd return (error); 1334185029Spjd } 1335185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1336185029Spjd 1337185029Spjd zp->z_phys->zp_size = end; 1338185029Spjd 1339168404Spjd dmu_tx_commit(tx); 1340168404Spjd 1341168404Spjd /* 1342168404Spjd * Clear any mapped pages in the truncated region. This has to 1343168404Spjd * happen outside of the transaction to avoid the possibility of 1344168404Spjd * a deadlock with someone trying to push a page that we are 1345168404Spjd * about to invalidate. 1346168404Spjd */ 1347185029Spjd vnode_pager_setsize(vp, end); 1348168404Spjd 1349209962Smm zfs_range_unlock(rl); 1350209962Smm 1351168404Spjd return (0); 1352168404Spjd} 1353168404Spjd 1354185029Spjd/* 1355185029Spjd * Free space in a file 1356185029Spjd * 1357185029Spjd * IN: zp - znode of file to free data in. 1358185029Spjd * off - start of range 1359185029Spjd * len - end of range (0 => EOF) 1360185029Spjd * flag - current file open mode flags. 
1361185029Spjd * log - TRUE if this action should be logged 1362185029Spjd * 1363185029Spjd * RETURN: 0 if success 1364185029Spjd * error code if failure 1365185029Spjd */ 1366185029Spjdint 1367185029Spjdzfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1368185029Spjd{ 1369185029Spjd vnode_t *vp = ZTOV(zp); 1370185029Spjd dmu_tx_t *tx; 1371185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1372185029Spjd zilog_t *zilog = zfsvfs->z_log; 1373185029Spjd int error; 1374185029Spjd 1375185029Spjd if (off > zp->z_phys->zp_size) { 1376185029Spjd error = zfs_extend(zp, off+len); 1377185029Spjd if (error == 0 && log) 1378185029Spjd goto log; 1379185029Spjd else 1380185029Spjd return (error); 1381185029Spjd } 1382185029Spjd 1383185029Spjd if (len == 0) { 1384185029Spjd error = zfs_trunc(zp, off); 1385185029Spjd } else { 1386185029Spjd if ((error = zfs_free_range(zp, off, len)) == 0 && 1387185029Spjd off + len > zp->z_phys->zp_size) 1388185029Spjd error = zfs_extend(zp, off+len); 1389185029Spjd } 1390185029Spjd if (error || !log) 1391185029Spjd return (error); 1392185029Spjdlog: 1393185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1394185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1395209962Smm error = dmu_tx_assign(tx, TXG_NOWAIT); 1396185029Spjd if (error) { 1397209962Smm if (error == ERESTART) { 1398185029Spjd dmu_tx_wait(tx); 1399185029Spjd dmu_tx_abort(tx); 1400185029Spjd goto log; 1401185029Spjd } 1402185029Spjd dmu_tx_abort(tx); 1403185029Spjd return (error); 1404185029Spjd } 1405185029Spjd 1406185029Spjd zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1407185029Spjd zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1408185029Spjd 1409185029Spjd dmu_tx_commit(tx); 1410185029Spjd return (0); 1411185029Spjd} 1412185029Spjd 1413168404Spjdvoid 1414185029Spjdzfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1415168404Spjd{ 1416168404Spjd zfsvfs_t zfsvfs; 1417209962Smm uint64_t moid, obj, version; 1418185029Spjd uint64_t sense = 
ZFS_CASE_SENSITIVE; 1419185029Spjd uint64_t norm = 0; 1420185029Spjd nvpair_t *elem; 1421168404Spjd int error; 1422207334Spjd int i; 1423168404Spjd znode_t *rootzp = NULL; 1424199156Spjd vnode_t vnode; 1425168404Spjd vattr_t vattr; 1426185029Spjd znode_t *zp; 1427209962Smm zfs_acl_ids_t acl_ids; 1428168404Spjd 1429168404Spjd /* 1430168404Spjd * First attempt to create master node. 1431168404Spjd */ 1432168404Spjd /* 1433168404Spjd * In an empty objset, there are no blocks to read and thus 1434168404Spjd * there can be no i/o errors (which we assert below). 1435168404Spjd */ 1436168404Spjd moid = MASTER_NODE_OBJ; 1437168404Spjd error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1438168404Spjd DMU_OT_NONE, 0, tx); 1439168404Spjd ASSERT(error == 0); 1440168404Spjd 1441168404Spjd /* 1442168404Spjd * Set starting attributes. 1443168404Spjd */ 1444209962Smm if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) 1445185029Spjd version = ZPL_VERSION; 1446209962Smm else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1447209962Smm version = ZPL_VERSION_USERSPACE - 1; 1448185029Spjd else 1449185029Spjd version = ZPL_VERSION_FUID - 1; 1450185029Spjd elem = NULL; 1451185029Spjd while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1452185029Spjd /* For the moment we expect all zpl props to be uint64_ts */ 1453185029Spjd uint64_t val; 1454185029Spjd char *name; 1455168404Spjd 1456185029Spjd ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1457185029Spjd VERIFY(nvpair_value_uint64(elem, &val) == 0); 1458185029Spjd name = nvpair_name(elem); 1459185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1460209962Smm if (val < version) 1461209962Smm version = val; 1462185029Spjd } else { 1463185029Spjd error = zap_update(os, moid, name, 8, 1, &val, tx); 1464185029Spjd } 1465185029Spjd ASSERT(error == 0); 1466185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1467185029Spjd norm = val; 1468185029Spjd else if (strcmp(name, 
zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1469185029Spjd sense = val; 1470185029Spjd } 1471185029Spjd ASSERT(version != 0); 1472209962Smm error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); 1473168404Spjd 1474168404Spjd /* 1475168404Spjd * Create a delete queue. 1476168404Spjd */ 1477209962Smm obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1478168404Spjd 1479209962Smm error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); 1480168404Spjd ASSERT(error == 0); 1481168404Spjd 1482168404Spjd /* 1483168404Spjd * Create root znode. Create minimal znode/vnode/zfsvfs 1484168404Spjd * to allow zfs_mknode to work. 1485168404Spjd */ 1486185029Spjd VATTR_NULL(&vattr); 1487168404Spjd vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1488168404Spjd vattr.va_type = VDIR; 1489168404Spjd vattr.va_mode = S_IFDIR|0755; 1490185029Spjd vattr.va_uid = crgetuid(cr); 1491185029Spjd vattr.va_gid = crgetgid(cr); 1492168404Spjd 1493168404Spjd rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1494199156Spjd zfs_znode_cache_constructor(rootzp, NULL, 0); 1495168404Spjd rootzp->z_unlinked = 0; 1496168404Spjd rootzp->z_atime_dirty = 0; 1497168404Spjd 1498199156Spjd vnode.v_type = VDIR; 1499199156Spjd vnode.v_data = rootzp; 1500199156Spjd rootzp->z_vnode = &vnode; 1501185029Spjd 1502168404Spjd bzero(&zfsvfs, sizeof (zfsvfs_t)); 1503168404Spjd 1504168404Spjd zfsvfs.z_os = os; 1505168404Spjd zfsvfs.z_parent = &zfsvfs; 1506185029Spjd zfsvfs.z_version = version; 1507185029Spjd zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1508185029Spjd zfsvfs.z_norm = norm; 1509185029Spjd /* 1510185029Spjd * Fold case on file systems that are always or sometimes case 1511185029Spjd * insensitive. 
1512185029Spjd */ 1513185029Spjd if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1514185029Spjd zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1515168404Spjd 1516168404Spjd mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1517168404Spjd list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1518168404Spjd offsetof(znode_t, z_link_node)); 1519168404Spjd 1520207334Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1521207334Spjd mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1522207334Spjd 1523185029Spjd ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1524185029Spjd rootzp->z_zfsvfs = &zfsvfs; 1525209962Smm VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1526209962Smm cr, NULL, &acl_ids)); 1527209962Smm zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); 1528185029Spjd ASSERT3P(zp, ==, rootzp); 1529185029Spjd error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1530168404Spjd ASSERT(error == 0); 1531209962Smm zfs_acl_ids_free(&acl_ids); 1532185029Spjd POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1533168404Spjd 1534185029Spjd dmu_buf_rele(rootzp->z_dbuf, NULL); 1535185029Spjd rootzp->z_dbuf = NULL; 1536209962Smm rootzp->z_vnode = NULL; 1537209962Smm kmem_cache_free(znode_cache, rootzp); 1538209962Smm 1539209962Smm /* 1540209962Smm * Create shares directory 1541209962Smm */ 1542209962Smm 1543209962Smm error = zfs_create_share_dir(&zfsvfs, tx); 1544209962Smm 1545209962Smm ASSERT(error == 0); 1546209962Smm 1547207334Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1548207334Spjd mutex_destroy(&zfsvfs.z_hold_mtx[i]); 1549168404Spjd} 1550185029Spjd 1551168404Spjd#endif /* _KERNEL */ 1552168404Spjd/* 1553168404Spjd * Given an object number, return its parent object number and whether 1554168404Spjd * or not the object is an extended attribute directory. 
1555168404Spjd */ 1556168404Spjdstatic int 1557168404Spjdzfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1558168404Spjd{ 1559168404Spjd dmu_buf_t *db; 1560168404Spjd dmu_object_info_t doi; 1561168404Spjd znode_phys_t *zp; 1562168404Spjd int error; 1563168404Spjd 1564168404Spjd if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1565168404Spjd return (error); 1566168404Spjd 1567168404Spjd dmu_object_info_from_db(db, &doi); 1568168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 1569168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 1570168404Spjd dmu_buf_rele(db, FTAG); 1571168404Spjd return (EINVAL); 1572168404Spjd } 1573168404Spjd 1574168404Spjd zp = db->db_data; 1575168404Spjd *pobjp = zp->zp_parent; 1576168404Spjd *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1577168404Spjd S_ISDIR(zp->zp_mode); 1578168404Spjd dmu_buf_rele(db, FTAG); 1579168404Spjd 1580168404Spjd return (0); 1581168404Spjd} 1582168404Spjd 1583168404Spjdint 1584168404Spjdzfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1585168404Spjd{ 1586168404Spjd char *path = buf + len - 1; 1587168404Spjd int error; 1588168404Spjd 1589168404Spjd *path = '\0'; 1590168404Spjd 1591168404Spjd for (;;) { 1592168404Spjd uint64_t pobj; 1593168404Spjd char component[MAXNAMELEN + 2]; 1594168404Spjd size_t complen; 1595168404Spjd int is_xattrdir; 1596168404Spjd 1597168404Spjd if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1598168404Spjd &is_xattrdir)) != 0) 1599168404Spjd break; 1600168404Spjd 1601168404Spjd if (pobj == obj) { 1602168404Spjd if (path[0] != '/') 1603168404Spjd *--path = '/'; 1604168404Spjd break; 1605168404Spjd } 1606168404Spjd 1607168404Spjd component[0] = '/'; 1608168404Spjd if (is_xattrdir) { 1609168404Spjd (void) sprintf(component + 1, "<xattrdir>"); 1610168404Spjd } else { 1611185029Spjd error = zap_value_search(osp, pobj, obj, 1612185029Spjd ZFS_DIRENT_OBJ(-1ULL), component + 1); 1613168404Spjd if (error != 0) 1614168404Spjd break; 
1615168404Spjd } 1616168404Spjd 1617168404Spjd complen = strlen(component); 1618168404Spjd path -= complen; 1619168404Spjd ASSERT(path >= buf); 1620168404Spjd bcopy(component, path, complen); 1621168404Spjd obj = pobj; 1622168404Spjd } 1623168404Spjd 1624168404Spjd if (error == 0) 1625168404Spjd (void) memmove(buf, path, buf + len - path); 1626168404Spjd return (error); 1627168404Spjd} 1628