zfs_znode.c revision 197458
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22185029Spjd * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 
24168404Spjd */ 25168404Spjd 26169195Spjd/* Portions Copyright 2007 Jeremy Teo */ 27169195Spjd 28168404Spjd#ifdef _KERNEL 29168404Spjd#include <sys/types.h> 30168404Spjd#include <sys/param.h> 31168404Spjd#include <sys/time.h> 32168404Spjd#include <sys/systm.h> 33168404Spjd#include <sys/sysmacros.h> 34168404Spjd#include <sys/resource.h> 35168404Spjd#include <sys/mntent.h> 36185029Spjd#include <sys/u8_textprep.h> 37185029Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/vfs.h> 39168404Spjd#include <sys/vnode.h> 40168404Spjd#include <sys/file.h> 41168404Spjd#include <sys/kmem.h> 42168404Spjd#include <sys/errno.h> 43168404Spjd#include <sys/unistd.h> 44168404Spjd#include <sys/atomic.h> 45168404Spjd#include <sys/zfs_dir.h> 46168404Spjd#include <sys/zfs_acl.h> 47168404Spjd#include <sys/zfs_ioctl.h> 48168404Spjd#include <sys/zfs_rlock.h> 49185029Spjd#include <sys/zfs_fuid.h> 50168404Spjd#include <sys/fs/zfs.h> 51185029Spjd#include <sys/kidmap.h> 52168404Spjd#endif /* _KERNEL */ 53168404Spjd 54168404Spjd#include <sys/dmu.h> 55168404Spjd#include <sys/refcount.h> 56168404Spjd#include <sys/stat.h> 57168404Spjd#include <sys/zap.h> 58168404Spjd#include <sys/zfs_znode.h> 59168404Spjd#include <sys/refcount.h> 60168404Spjd 61185029Spjd#include "zfs_prop.h" 62185029Spjd 63173268Slulf/* Used by fstat(1). */ 64173268SlulfSYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), 65173268Slulf "sizeof(znode_t)"); 66173268Slulf 67168404Spjd/* 68185029Spjd * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 69185029Spjd * turned on when DEBUG is also defined. 
 */
#ifdef DEBUG
#define	ZNODE_STATS
#endif	/* DEBUG */

/*
 * ZNODE_STAT_ADD() increments the given counter when ZNODE_STATS is defined
 * and expands to nothing otherwise.
 */
#ifdef	ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif	/* ZNODE_STATS */

/*
 * POINTER_IS_VALID() is true when the two low bits of the pointer value are
 * clear.  POINTER_INVALIDATE() sets the low bit of the pointer stored at
 * *pp in place, after which POINTER_IS_VALID() is false for that pointer.
 */
#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))

/*
 * Functions needed for userland (ie: libzpool) are not put under
 * #ifdef_KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL
/* The single znode cache; created in zfs_znode_init(). */
static kmem_cache_t *znode_cache = NULL;

/*
 * dbuf eviction callback; zfs_znode_dmu_init() registers it together with
 * the znode as the dbuf's user pointer.
 *
 *	dbuf     - the buffer being evicted (unused)
 *	user_ptr - the znode that was attached to the buffer
 *
 * Only the "#if 1" branch is compiled: the callback unconditionally panics.
 * The "#else" branch is never compiled.
 */
/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
#if 1	/* XXXPJD: From OpenSolaris. */
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
#else	/* XXXPJD */
	znode_t *zp = user_ptr;
	vnode_t *vp;

	mutex_enter(&zp->z_lock);
	zp->z_dbuf = NULL;
	vp = ZTOV(zp);
	if (vp == NULL) {
		mutex_exit(&zp->z_lock);
		zfs_znode_free(zp);
	} else if (vp->v_count == 0) {
		zp->z_vnode = NULL;
		vhold(vp);
		mutex_exit(&zp->z_lock);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
		vrecycle(vp, curthread);
		VOP_UNLOCK(vp, 0);
		vdrop(vp);
		zfs_znode_free(zp);
	} else {
		mutex_exit(&zp->z_lock);
	}
#endif
}

extern struct vop_vector zfs_vnodeops;
extern struct vop_vector zfs_fifoops;

/*
 * XXX: We cannot use this function as a cache constructor, because
 *      there is one global cache for all file systems and we need
 *      to pass vfsp here, which is not possible, because argument
 *      'cdrarg' is defined at kmem_cache_create() time.
 *
 * Prepare a freshly allocated znode: mark its z_zfsvfs pointer invalid,
 * obtain and exclusively lock a new vnode, cross-link vnode and znode, and
 * initialize the znode's locks, list link and range-lock AVL tree.
 *
 *	buf     - the znode to initialize
 *	arg     - the vfs_t the new vnode is created for (must not be NULL)
 *	kmflags - allocation flags; only KM_NOSLEEP is examined
 *
 * Returns 0 on success, or -1 when getnewvnode() fails and KM_NOSLEEP was
 * requested.
 *
 * NOTE(review): when getnewvnode() fails without KM_NOSLEEP the failure is
 * only caught by ASSERT(); presumably that compiles away in non-DEBUG
 * kernels, in which case 'vp' would be used uninitialized - confirm.
 */
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;
	vnode_t *vp;
	vfs_t *vfsp = arg;
	int error;

	/* Keep zfs_znode_move() away until z_zfsvfs is assigned for real. */
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(vfsp != NULL);

	error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
	if (error != 0 && (kmflags & KM_NOSLEEP))
		return (-1);
	ASSERT(error == 0);
	/* The vnode stays exclusively locked when this function returns. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	zp->z_vnode = vp;
	vp->v_data = (caddr_t)zp;
	VN_LOCK_AREC(vp);

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	zp->z_dbuf = NULL;
	zp->z_dirlocks = NULL;
	return (0);
}

/*
 * Cache destructor (registered in zfs_znode_init()): tears down the locks
 * and the range-lock AVL tree set up by zfs_znode_cache_constructor().
 * The znode must already be invalidated, detached from its vnode, off the
 * znode list, and have no dbuf or directory locks.
 *
 *	buf - the znode being destroyed
 *	arg - unused
 */
/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp) == NULL);
	/* NOTE(review): called with the pointer just asserted NULL - confirm vn_free() accepts NULL here. */
	vn_free(ZTOV(zp));
	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_map_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	avl_destroy(&zp->z_range_avl);
	mutex_destroy(&zp->z_range_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_dirlocks == NULL);
}

#ifdef	ZNODE_STATS
/* Counters for the early-exit paths of zfs_znode_move(). */
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck_invalid;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */

#if defined(sun)
/*
 * Transfer the state of znode 'ozp' into 'nzp' on behalf of
 * zfs_znode_move(): copy the scalar fields, swap the two vnodes (fixing up
 * their v_data back pointers), re-point the dbuf user pointer at 'nzp',
 * and finally invalidate 'ozp'.
 */
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_last_itx = ozp->z_last_itx;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_phys = ozp->z_phys;
	nzp->z_dbuf = ozp->z_dbuf;

	/* Update back pointers. */
	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
	    znode_evict_error);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_dbuf = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);
}

/*
 * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
 * returns a non-zero error code.
 *
 * NOTE(review): the non-zero return is not visible in this function body;
 * presumably ZFS_ENTER() itself executes a 'return' on failure - confirm
 * against the macro definition.
 */
static int
zfs_enter(zfsvfs_t *zfsvfs)
{
	ZFS_ENTER(zfsvfs);
	return (0);
}

/*
 * Move callback installed on znode_cache by zfs_znode_init() (sun builds
 * only).  Attempts to relocate the znode at 'buf' into 'newbuf'.
 *
 *	buf    - the existing znode
 *	newbuf - the destination znode
 *	size   - unused
 *	arg    - unused
 *
 * Returns:
 *	KMEM_CBRC_DONT_KNOW - the vfs pointer is invalid, zfs_enter() failed,
 *			      or the vfs pointer changed before z_znodes_lock
 *			      was taken
 *	KMEM_CBRC_LATER     - the object hold mutex or the vnode lock could
 *			      not be taken without blocking, or the vnode is
 *			      referenced by something other than the DNLC
 *	KMEM_CBRC_YES       - the znode was moved
 *
 * Locks are try-acquired in the order z_znodes_lock, object hold mutex,
 * vnode v_lock, and every early exit releases exactly what was taken.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the filesystem is not unmounted during the move.
	 */
	if (zfs_enter(zfsvfs) != 0) {		/* ZFS_ENTER */
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Put the new znode where the old one was on the znode list. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
#endif /* sun */

/*
 * Create the global znode cache.  No kmem constructor is registered (the
 * constructor argument is NULL); zfs_znode_alloc() calls
 * zfs_znode_cache_constructor() by hand instead.  On sun builds the move
 * callback is installed as well.
 */
void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
#if defined(sun)
	kmem_cache_set_move(znode_cache, zfs_znode_move);
#endif
}

/*
 * Destroy the global znode cache, if it exists, and reset the pointer so
 * that zfs_znode_init() may be called again.
 */
void
zfs_znode_fini(void)
{
	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
}

/*
 * zfs_init_fs - Initialize the zfsvfs struct and the file system
 *	incore "master" object.  Verify version compatibility.
381168404Spjd */ 382168404Spjdint 383185029Spjdzfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) 384168404Spjd{ 385168404Spjd objset_t *os = zfsvfs->z_os; 386168404Spjd int i, error; 387168404Spjd uint64_t fsid_guid; 388185029Spjd uint64_t zval; 389168404Spjd 390168404Spjd *zpp = NULL; 391168404Spjd 392185029Spjd error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 393168404Spjd if (error) { 394168404Spjd return (error); 395185029Spjd } else if (zfsvfs->z_version > ZPL_VERSION) { 396168404Spjd (void) printf("Mismatched versions: File system " 397185029Spjd "is version %llu on-disk format, which is " 398168404Spjd "incompatible with this software version %lld!", 399185029Spjd (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); 400168404Spjd return (ENOTSUP); 401168404Spjd } 402168404Spjd 403185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 404185029Spjd return (error); 405185029Spjd zfsvfs->z_norm = (int)zval; 406185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 407185029Spjd return (error); 408185029Spjd zfsvfs->z_utf8 = (zval != 0); 409185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 410185029Spjd return (error); 411185029Spjd zfsvfs->z_case = (uint_t)zval; 412168404Spjd /* 413185029Spjd * Fold case on file systems that are always or sometimes case 414185029Spjd * insensitive. 415185029Spjd */ 416185029Spjd if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 417185029Spjd zfsvfs->z_case == ZFS_CASE_MIXED) 418185029Spjd zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 419185029Spjd 420185029Spjd /* 421168404Spjd * The fsid is 64 bits, composed of an 8-bit fs type, which 422168404Spjd * separates our fsid from any other filesystem types, and a 423168404Spjd * 56-bit objset unique ID. The objset unique ID is unique to 424168404Spjd * all objsets open on this system, provided by unique_create(). 
425168404Spjd * The 8-bit fs type must be put in the low bits of fsid[1] 426168404Spjd * because that's where other Solaris filesystems put it. 427168404Spjd */ 428168404Spjd fsid_guid = dmu_objset_fsid_guid(os); 429168404Spjd ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 430168404Spjd zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; 431168404Spjd zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 432168404Spjd zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; 433168404Spjd 434168404Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 435168404Spjd &zfsvfs->z_root); 436168404Spjd if (error) 437168404Spjd return (error); 438168404Spjd ASSERT(zfsvfs->z_root != 0); 439168404Spjd 440185029Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 441185029Spjd &zfsvfs->z_unlinkedobj); 442185029Spjd if (error) 443185029Spjd return (error); 444168404Spjd 445168404Spjd /* 446168404Spjd * Initialize zget mutex's 447168404Spjd */ 448168404Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 449168404Spjd mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 450168404Spjd 451168404Spjd error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); 452185029Spjd if (error) { 453185029Spjd /* 454185029Spjd * On error, we destroy the mutexes here since it's not 455185029Spjd * possible for the caller to determine if the mutexes were 456185029Spjd * initialized properly. 457185029Spjd */ 458185029Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 459185029Spjd mutex_destroy(&zfsvfs->z_hold_mtx[i]); 460168404Spjd return (error); 461185029Spjd } 462168404Spjd ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); 463185029Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 464185029Spjd &zfsvfs->z_fuid_obj); 465185029Spjd if (error == ENOENT) 466185029Spjd error = 0; 467168404Spjd 468168404Spjd return (0); 469168404Spjd} 470168404Spjd 471168404Spjd/* 472168404Spjd * define a couple of values we need available 473168404Spjd * for both 64 and 32 bit environments. 
474168404Spjd */ 475168404Spjd#ifndef NBITSMINOR64 476168404Spjd#define NBITSMINOR64 32 477168404Spjd#endif 478168404Spjd#ifndef MAXMAJ64 479168404Spjd#define MAXMAJ64 0xffffffffUL 480168404Spjd#endif 481168404Spjd#ifndef MAXMIN64 482168404Spjd#define MAXMIN64 0xffffffffUL 483168404Spjd#endif 484168404Spjd 485168404Spjd/* 486168404Spjd * Create special expldev for ZFS private use. 487168404Spjd * Can't use standard expldev since it doesn't do 488168404Spjd * what we want. The standard expldev() takes a 489168404Spjd * dev32_t in LP64 and expands it to a long dev_t. 490168404Spjd * We need an interface that takes a dev32_t in ILP32 491168404Spjd * and expands it to a long dev_t. 492168404Spjd */ 493168404Spjdstatic uint64_t 494168404Spjdzfs_expldev(dev_t dev) 495168404Spjd{ 496187830Sed return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); 497168404Spjd} 498168404Spjd/* 499168404Spjd * Special cmpldev for ZFS private use. 500168404Spjd * Can't use standard cmpldev since it takes 501168404Spjd * a long dev_t and compresses it to dev32_t in 502168404Spjd * LP64. We need to do a compaction of a long dev_t 503168404Spjd * to a dev32_t in ILP32. 
 *
 * Returns makedev() of the bits above NBITSMINOR64 (major) and the bits
 * selected by MAXMIN64 (minor).
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
}

/*
 * Attach znode 'zp' to bonus buffer 'db': record the dbuf in the znode,
 * register the znode (and the address of its z_phys pointer) as the dbuf's
 * user data with znode_evict_error() as the eviction callback, and flag
 * the vnode VROOT when this is the file system's root object.
 *
 * The caller must hold the object hold mutex for zp->z_id.  Panics if the
 * dbuf already has a user attached.
 *
 * NOTE(review): callers in this file read zp->z_phys right after this call
 * although they set it to NULL beforehand, so dmu_buf_set_user_ie()
 * presumably stores the bonus data address through &zp->z_phys - confirm.
 */
static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t		*nzp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	zp->z_dbuf = db;
	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);

	/*
	 * there should be no
	 * concurrent zgets on this object.
	 */
	if (nzp != NULL)
		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}

/*
 * Detach znode 'zp' from its bonus buffer: clear z_dbuf, remove the znode
 * as the dbuf's user (verifying that it was the registered user) and
 * release the dbuf hold.
 *
 * The caller must hold the object hold mutex for zp->z_id, unless the
 * znode is unlinked or z_teardown_inactive_lock is held for writing.
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	dmu_buf_t *db = zp->z_dbuf;
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
	ASSERT(zp->z_dbuf != NULL);
	zp->z_dbuf = NULL;
	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
	dmu_buf_rele(db, NULL);
}

/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 * NOTE(review): the sentence above looks stale - the body below attaches
 * the znode to the dbuf itself through zfs_znode_dmu_init().
 *
 *	zfsvfs	- file system the znode belongs to
 *	db	- bonus buffer of the object; db->db_object becomes z_id
 *	blksz	- value stored in z_blksz
 *
 * Returns the new znode.  Its vnode was created and exclusively locked by
 * zfs_znode_cache_constructor() and is not unlocked here.  The znode has
 * been appended to zfsvfs->z_all_znodes and a VFS_HOLD() has been taken on
 * zfsvfs->z_vfs.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
{
	znode_t	*zp;
	vnode_t *vp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	/* The cache has no kmem constructor (see zfs_znode_init()). */
	zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_dbuf == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
	 * the zfs_znode_move() callback.
	 */
	zp->z_phys = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);
#ifdef TODO
	vn_reinit(vp);
#endif

	zfs_znode_dmu_init(zfsvfs, zp, db);

	zp->z_gen = zp->z_phys->zp_gen;

#if 0
	if (vp == NULL)
		return (zp);
#endif

	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VFIFO:
		/* FIFOs use their own vnode operations vector. */
		vp->v_op = &zfs_fifoops;
		break;
	}
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);
	return (zp);
}

/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *			  IS_REPLAY	- intent log replay
 *		bonuslen - length of bonus buffer
 *		setaclp  - File/Dir initial ACL
 *		fuidp	 - Tracks fuid allocation.
 *
 *	OUT:	zpp	- allocated znode
 *
 * There is no return value; failures while creating or claiming the DMU
 * object are caught only by the assertions in the body.
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
    zfs_fuid_info_t **fuidp)
{
	dmu_buf_t	*db;
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		err;

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
		obj = vap->va_nodeid;
		flag |= IS_REPLAY;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (flag & IS_REPLAY) {
			/* Replay: claim the object number recorded in the log. */
			err = zap_create_claim_norm(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = zap_create_norm(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	} else {
		if (flag & IS_REPLAY) {
			err = dmu_object_claim(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	}
	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
	dmu_buf_will_dirty(db, tx);

	/*
	 * Initialize the znode physical data to zero.
	 */
	ASSERT(db->db_size >= sizeof (znode_phys_t));
	bzero(db->db_data, db->db_size);
	pzp = db->db_data;

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_dbuf = db;
		dzp->z_phys = pzp;
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_phys->zp_flags & ZFS_XATTR)
		flag |= IS_XATTR;

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
	}

	if (zfsvfs->z_use_fuids)
		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;

	if (vap->va_type == VDIR) {
		pzp->zp_size = 2;		/* contents ("." and "..") */
		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	}

	pzp->zp_parent = dzp->z_id;
	if (flag & IS_XATTR)
		pzp->zp_flags |= ZFS_XATTR;

	pzp->zp_gen = gen;

	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

	/* Caller-supplied atime/mtime take precedence over 'now'. */
	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
	}

	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
	if (!(flag & IS_ROOT_NODE)) {
		ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;
	}
	zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
	if (!(flag & IS_ROOT_NODE)) {
		vnode_t *vp;

		/*
		 * Put the new vnode on the mount's vnode list; unlike
		 * zfs_zget(), the vnode is not unlocked here.
		 */
		vp = ZTOV(*zpp);
		vp->v_vflag |= VV_FORCEINSMQ;
		err = insmntque(vp, zfsvfs->z_vfs);
		vp->v_vflag &= ~VV_FORCEINSMQ;
		KASSERT(err == 0, ("insmntque() failed: error %d", err));
	}
}

/*
 * Apply the optional attributes requested in 'xvap' to znode 'zp'.  Each
 * attribute that is requested is stored into the znode (zp_crtime, a
 * ZFS_* flag through ZFS_ATTR_SET(), or the scanstamp area that follows
 * the znode_phys_t) and then marked as returned in 'xvap'.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
{
	xoptattr_t *xoap;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		/* The scanstamp is stored directly after the znode_phys_t. */
		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
		    sizeof (xoap->xoa_av_scanstamp));
		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
}

/*
 * Look up the znode for object 'obj_num', creating a new znode/vnode pair
 * when no znode is attached to the object's bonus buffer yet.
 *
 *	IN:	zfsvfs	- file system
 *		obj_num	- object number
 *
 *	OUT:	zpp	- the znode on success, NULL otherwise
 *
 *	RETURN:	0 on success
 *		EINVAL if the object's bonus buffer is not a znode
 *		ENOENT if an existing znode is marked unlinked
 *		otherwise the error returned by dmu_bonus_hold()
 *
 * When an existing znode is returned a VN_HOLD() has been taken on its
 * vnode.  When the existing znode has no vnode, or its vnode is doomed,
 * all locks are dropped and the lookup is retried after a short sleep.
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;
	vnode_t		*vp;
	int err, first = 1;

	*zpp = NULL;
again:
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	zp = dmu_buf_get_user(db);
	if (zp != NULL) {
		mutex_enter(&zp->z_lock);

		/*
		 * Since we do immediate eviction of the z_dbuf, we
		 * should never find a dbuf with a znode that doesn't
		 * know about the dbuf.
		 */
		ASSERT3P(zp->z_dbuf, ==, db);
		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_unlinked) {
			err = ENOENT;
		} else {
			int dying = 0;

			vp = ZTOV(zp);
			if (vp == NULL)
				dying = 1;
			else {
				VN_HOLD(vp);
				if ((vp->v_iflag & VI_DOOMED) != 0) {
					dying = 1;
					/*
					 * Don't VN_RELE() vnode here, because
					 * it can call vn_lock() which creates
					 * LOR between vnode lock and znode
					 * lock. We will VN_RELE() the vnode
					 * after dropping znode lock.
					 */
				}
			}
			if (dying) {
				/* Log only on the first retry. */
				if (first) {
					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
					first = 0;
				}
				/*
				 * znode is dying so we can't reuse it, we must
				 * wait until destruction is completed.
				 */
				dmu_buf_rele(db, NULL);
				mutex_exit(&zp->z_lock);
				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
				if (vp != NULL)
					VN_RELE(vp);
				tsleep(zp, 0, "zcollide", 1);
				goto again;
			}
			*zpp = zp;
			err = 0;
		}
		/* The existing znode already holds the dbuf; drop ours. */
		dmu_buf_rele(db, NULL);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Not found create new znode/vnode
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);

	vp = ZTOV(zp);
	vp->v_vflag |= VV_FORCEINSMQ;
	err = insmntque(vp, zfsvfs->z_vfs);
	vp->v_vflag &= ~VV_FORCEINSMQ;
	KASSERT(err == 0, ("insmntque() failed: error %d", err));
	/* Drop the vnode lock taken in zfs_znode_cache_constructor(). */
	VOP_UNLOCK(vp, 0);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
	*zpp = zp;
	return (0);
}

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	int err;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	if (((znode_phys_t
*)db->db_data)->zp_gen != zp->z_gen) { 980185029Spjd dmu_buf_rele(db, NULL); 981185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 982185029Spjd return (EIO); 983185029Spjd } 984185029Spjd 985185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 986185029Spjd zp->z_unlinked = (zp->z_phys->zp_links == 0); 987185029Spjd zp->z_blksz = doi.doi_data_block_size; 988185029Spjd 989185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 990185029Spjd 991185029Spjd return (0); 992185029Spjd} 993185029Spjd 994168404Spjdvoid 995168404Spjdzfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 996168404Spjd{ 997168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 998185029Spjd objset_t *os = zfsvfs->z_os; 999185029Spjd uint64_t obj = zp->z_id; 1000185029Spjd uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 1001168404Spjd 1002185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 1003185029Spjd if (acl_obj) 1004185029Spjd VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1005185029Spjd VERIFY(0 == dmu_object_free(os, obj, tx)); 1006185029Spjd zfs_znode_dmu_fini(zp); 1007185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 1008185029Spjd zfs_znode_free(zp); 1009168404Spjd} 1010168404Spjd 1011168404Spjdvoid 1012168404Spjdzfs_zinactive(znode_t *zp) 1013168404Spjd{ 1014168404Spjd vnode_t *vp = ZTOV(zp); 1015168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1016168404Spjd uint64_t z_id = zp->z_id; 1017168404Spjd 1018185029Spjd ASSERT(zp->z_dbuf && zp->z_phys); 1019168404Spjd 1020168404Spjd /* 1021168404Spjd * Don't allow a zfs_zget() while were trying to release this znode 1022168404Spjd */ 1023168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1024168404Spjd 1025168404Spjd mutex_enter(&zp->z_lock); 1026168404Spjd VI_LOCK(vp); 1027168404Spjd if (vp->v_count > 0) { 1028168404Spjd /* 1029168404Spjd * If the hold count is greater than zero, somebody has 1030168404Spjd * obtained a new reference on this znode while we were 1031168404Spjd * processing it here, so we are done. 
1032168404Spjd */ 1033168404Spjd VI_UNLOCK(vp); 1034168404Spjd mutex_exit(&zp->z_lock); 1035168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1036168404Spjd return; 1037168404Spjd } 1038168404Spjd VI_UNLOCK(vp); 1039168404Spjd 1040168404Spjd /* 1041168404Spjd * If this was the last reference to a file with no links, 1042168404Spjd * remove the file from the file system. 1043168404Spjd */ 1044168404Spjd if (zp->z_unlinked) { 1045168404Spjd mutex_exit(&zp->z_lock); 1046168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1047168404Spjd ASSERT(vp->v_count == 0); 1048168404Spjd vrecycle(vp, curthread); 1049168404Spjd zfs_rmnode(zp); 1050168404Spjd return; 1051168404Spjd } 1052168404Spjd mutex_exit(&zp->z_lock); 1053168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1054168404Spjd} 1055168404Spjd 1056168404Spjdvoid 1057168404Spjdzfs_znode_free(znode_t *zp) 1058168404Spjd{ 1059168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1060168404Spjd 1061185029Spjd ASSERT(ZTOV(zp) == NULL); 1062168404Spjd mutex_enter(&zfsvfs->z_znodes_lock); 1063185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 1064168404Spjd list_remove(&zfsvfs->z_all_znodes, zp); 1065168404Spjd mutex_exit(&zfsvfs->z_znodes_lock); 1066168404Spjd 1067168404Spjd kmem_cache_free(znode_cache, zp); 1068185029Spjd 1069185029Spjd VFS_RELE(zfsvfs->z_vfs); 1070168404Spjd} 1071168404Spjd 1072168404Spjdvoid 1073168404Spjdzfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1074168404Spjd{ 1075168404Spjd timestruc_t now; 1076168404Spjd 1077168404Spjd ASSERT(MUTEX_HELD(&zp->z_lock)); 1078168404Spjd 1079168404Spjd gethrestime(&now); 1080168404Spjd 1081168404Spjd if (tx) { 1082168404Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1083168404Spjd zp->z_atime_dirty = 0; 1084168404Spjd zp->z_seq++; 1085168404Spjd } else { 1086168404Spjd zp->z_atime_dirty = 1; 1087168404Spjd } 1088168404Spjd 1089168404Spjd if (flag & AT_ATIME) 1090168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1091168404Spjd 1092185029Spjd if (flag & AT_MTIME) { 1093168404Spjd 
ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1094185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1095185029Spjd zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1096185029Spjd } 1097168404Spjd 1098185029Spjd if (flag & AT_CTIME) { 1099168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1100185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1101185029Spjd zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1102185029Spjd } 1103168404Spjd} 1104168404Spjd 1105168404Spjd/* 1106168404Spjd * Update the requested znode timestamps with the current time. 1107168404Spjd * If we are in a transaction, then go ahead and mark the znode 1108168404Spjd * dirty in the transaction so the timestamps will go to disk. 1109168404Spjd * Otherwise, we will get pushed next time the znode is updated 1110168404Spjd * in a transaction, or when this znode eventually goes inactive. 1111168404Spjd * 1112168404Spjd * Why is this OK? 1113168404Spjd * 1 - Only the ACCESS time is ever updated outside of a transaction. 1114168404Spjd * 2 - Multiple consecutive updates will be collapsed into a single 1115168404Spjd * znode update by the transaction grouping semantics of the DMU. 1116168404Spjd */ 1117168404Spjdvoid 1118168404Spjdzfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1119168404Spjd{ 1120168404Spjd mutex_enter(&zp->z_lock); 1121168404Spjd zfs_time_stamper_locked(zp, flag, tx); 1122168404Spjd mutex_exit(&zp->z_lock); 1123168404Spjd} 1124168404Spjd 1125168404Spjd/* 1126168404Spjd * Grow the block size for a file. 1127168404Spjd * 1128168404Spjd * IN: zp - znode of file to free data in. 1129168404Spjd * size - requested block size 1130168404Spjd * tx - open transaction. 1131168404Spjd * 1132168404Spjd * NOTE: this function assumes that the znode is write locked. 
1133168404Spjd */ 1134168404Spjdvoid 1135168404Spjdzfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1136168404Spjd{ 1137168404Spjd int error; 1138168404Spjd u_longlong_t dummy; 1139168404Spjd 1140168404Spjd if (size <= zp->z_blksz) 1141168404Spjd return; 1142168404Spjd /* 1143168404Spjd * If the file size is already greater than the current blocksize, 1144168404Spjd * we will not grow. If there is more than one block in a file, 1145168404Spjd * the blocksize cannot change. 1146168404Spjd */ 1147168404Spjd if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1148168404Spjd return; 1149168404Spjd 1150168404Spjd error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1151168404Spjd size, 0, tx); 1152168404Spjd if (error == ENOTSUP) 1153168404Spjd return; 1154168404Spjd ASSERT3U(error, ==, 0); 1155168404Spjd 1156168404Spjd /* What blocksize did we actually get? */ 1157168404Spjd dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1158168404Spjd} 1159168404Spjd 1160168404Spjd/* 1161185029Spjd * Increase the file length 1162168404Spjd * 1163168404Spjd * IN: zp - znode of file to free data in. 1164185029Spjd * end - new end-of-file 1165168404Spjd * 1166168404Spjd * RETURN: 0 if success 1167168404Spjd * error code if failure 1168168404Spjd */ 1169185029Spjdstatic int 1170185029Spjdzfs_extend(znode_t *zp, uint64_t end) 1171168404Spjd{ 1172185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1173168404Spjd dmu_tx_t *tx; 1174168404Spjd rl_t *rl; 1175185029Spjd uint64_t newblksz; 1176168404Spjd int error; 1177168404Spjd 1178168404Spjd /* 1179185029Spjd * We will change zp_size, lock the whole file. 1180168404Spjd */ 1181185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1182168404Spjd 1183168404Spjd /* 1184168404Spjd * Nothing to do if file already at desired length. 
1185168404Spjd */ 1186185029Spjd if (end <= zp->z_phys->zp_size) { 1187168404Spjd zfs_range_unlock(rl); 1188168404Spjd return (0); 1189168404Spjd } 1190185029Spjdtop: 1191168404Spjd tx = dmu_tx_create(zfsvfs->z_os); 1192168404Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1193185029Spjd if (end > zp->z_blksz && 1194168404Spjd (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1195168404Spjd /* 1196168404Spjd * We are growing the file past the current block size. 1197168404Spjd */ 1198168404Spjd if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1199168404Spjd ASSERT(!ISP2(zp->z_blksz)); 1200185029Spjd newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1201168404Spjd } else { 1202185029Spjd newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1203168404Spjd } 1204185029Spjd dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1205185029Spjd } else { 1206185029Spjd newblksz = 0; 1207168404Spjd } 1208168404Spjd 1209168404Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1210168404Spjd if (error) { 1211185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1212168404Spjd dmu_tx_wait(tx); 1213185029Spjd dmu_tx_abort(tx); 1214185029Spjd goto top; 1215185029Spjd } 1216168404Spjd dmu_tx_abort(tx); 1217168404Spjd zfs_range_unlock(rl); 1218168404Spjd return (error); 1219168404Spjd } 1220185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1221168404Spjd 1222185029Spjd if (newblksz) 1223185029Spjd zfs_grow_blocksize(zp, newblksz, tx); 1224168404Spjd 1225185029Spjd zp->z_phys->zp_size = end; 1226168404Spjd 1227185029Spjd zfs_range_unlock(rl); 1228168404Spjd 1229185029Spjd dmu_tx_commit(tx); 1230185029Spjd 1231185029Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1232185029Spjd error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1233185029Spjd ASSERT(error == 0); 1234185029Spjd vnode_pager_setsize(ZTOV(zp), end); 1235185029Spjd rw_exit(&zp->z_map_lock); 1236185029Spjd 1237185029Spjd return (0); 1238185029Spjd} 1239185029Spjd 1240185029Spjd/* 1241185029Spjd * Free space in a file. 
1242185029Spjd * 1243185029Spjd * IN: zp - znode of file to free data in. 1244185029Spjd * off - start of section to free. 1245185029Spjd * len - length of section to free. 1246185029Spjd * 1247185029Spjd * RETURN: 0 if success 1248185029Spjd * error code if failure 1249185029Spjd */ 1250185029Spjdstatic int 1251185029Spjdzfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1252185029Spjd{ 1253185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1254185029Spjd rl_t *rl; 1255185029Spjd int error; 1256185029Spjd 1257185029Spjd /* 1258185029Spjd * Lock the range being freed. 1259185029Spjd */ 1260185029Spjd rl = zfs_range_lock(zp, off, len, RL_WRITER); 1261185029Spjd 1262185029Spjd /* 1263185029Spjd * Nothing to do if file already at desired length. 1264185029Spjd */ 1265185029Spjd if (off >= zp->z_phys->zp_size) { 1266185029Spjd zfs_range_unlock(rl); 1267185029Spjd return (0); 1268168404Spjd } 1269168404Spjd 1270185029Spjd if (off + len > zp->z_phys->zp_size) 1271185029Spjd len = zp->z_phys->zp_size - off; 1272185029Spjd 1273185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1274185029Spjd 1275185029Spjd if (error == 0) { 1276185029Spjd /* 1277185029Spjd * In FreeBSD we cannot free block in the middle of a file, 1278185029Spjd * but only at the end of a file. 1279185029Spjd */ 1280185029Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1281185029Spjd error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1282185029Spjd ASSERT(error == 0); 1283185029Spjd vnode_pager_setsize(ZTOV(zp), off); 1284185029Spjd rw_exit(&zp->z_map_lock); 1285168404Spjd } 1286168404Spjd 1287168404Spjd zfs_range_unlock(rl); 1288168404Spjd 1289185029Spjd return (error); 1290185029Spjd} 1291185029Spjd 1292185029Spjd/* 1293185029Spjd * Truncate a file 1294185029Spjd * 1295185029Spjd * IN: zp - znode of file to free data in. 1296185029Spjd * end - new end-of-file. 
1297185029Spjd * 1298185029Spjd * RETURN: 0 if success 1299185029Spjd * error code if failure 1300185029Spjd */ 1301185029Spjdstatic int 1302185029Spjdzfs_trunc(znode_t *zp, uint64_t end) 1303185029Spjd{ 1304185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1305185029Spjd vnode_t *vp = ZTOV(zp); 1306185029Spjd dmu_tx_t *tx; 1307185029Spjd rl_t *rl; 1308185029Spjd int error; 1309185029Spjd 1310185029Spjd /* 1311185029Spjd * We will change zp_size, lock the whole file. 1312185029Spjd */ 1313185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1314185029Spjd 1315185029Spjd /* 1316185029Spjd * Nothing to do if file already at desired length. 1317185029Spjd */ 1318185029Spjd if (end >= zp->z_phys->zp_size) { 1319185029Spjd zfs_range_unlock(rl); 1320185029Spjd return (0); 1321185029Spjd } 1322185029Spjd 1323185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1324185029Spjd if (error) { 1325185029Spjd zfs_range_unlock(rl); 1326185029Spjd return (error); 1327185029Spjd } 1328185029Spjdtop: 1329185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1330185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1331185029Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1332185029Spjd if (error) { 1333185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1334185029Spjd dmu_tx_wait(tx); 1335185029Spjd dmu_tx_abort(tx); 1336185029Spjd goto top; 1337185029Spjd } 1338185029Spjd dmu_tx_abort(tx); 1339185029Spjd zfs_range_unlock(rl); 1340185029Spjd return (error); 1341185029Spjd } 1342185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1343185029Spjd 1344185029Spjd zp->z_phys->zp_size = end; 1345185029Spjd 1346168404Spjd dmu_tx_commit(tx); 1347168404Spjd 1348185029Spjd zfs_range_unlock(rl); 1349185029Spjd 1350168404Spjd /* 1351168404Spjd * Clear any mapped pages in the truncated region. 
This has to 1352168404Spjd * happen outside of the transaction to avoid the possibility of 1353168404Spjd * a deadlock with someone trying to push a page that we are 1354168404Spjd * about to invalidate. 1355168404Spjd */ 1356168404Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1357168404Spjd#if 0 1358185029Spjd error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); 1359168404Spjd#else 1360185029Spjd error = vinvalbuf(vp, V_SAVE, 0, 0); 1361185029Spjd ASSERT(error == 0); 1362185029Spjd vnode_pager_setsize(vp, end); 1363168404Spjd#endif 1364168404Spjd rw_exit(&zp->z_map_lock); 1365168404Spjd 1366168404Spjd return (0); 1367168404Spjd} 1368168404Spjd 1369185029Spjd/* 1370185029Spjd * Free space in a file 1371185029Spjd * 1372185029Spjd * IN: zp - znode of file to free data in. 1373185029Spjd * off - start of range 1374185029Spjd * len - end of range (0 => EOF) 1375185029Spjd * flag - current file open mode flags. 1376185029Spjd * log - TRUE if this action should be logged 1377185029Spjd * 1378185029Spjd * RETURN: 0 if success 1379185029Spjd * error code if failure 1380185029Spjd */ 1381185029Spjdint 1382185029Spjdzfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1383185029Spjd{ 1384185029Spjd vnode_t *vp = ZTOV(zp); 1385185029Spjd dmu_tx_t *tx; 1386185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1387185029Spjd zilog_t *zilog = zfsvfs->z_log; 1388185029Spjd int error; 1389185029Spjd 1390185029Spjd if (off > zp->z_phys->zp_size) { 1391185029Spjd error = zfs_extend(zp, off+len); 1392185029Spjd if (error == 0 && log) 1393185029Spjd goto log; 1394185029Spjd else 1395185029Spjd return (error); 1396185029Spjd } 1397185029Spjd 1398185029Spjd if (len == 0) { 1399185029Spjd error = zfs_trunc(zp, off); 1400185029Spjd } else { 1401185029Spjd if ((error = zfs_free_range(zp, off, len)) == 0 && 1402185029Spjd off + len > zp->z_phys->zp_size) 1403185029Spjd error = zfs_extend(zp, off+len); 1404185029Spjd } 1405185029Spjd if (error || !log) 
1406185029Spjd return (error); 1407185029Spjdlog: 1408185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1409185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1410185029Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1411185029Spjd if (error) { 1412185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1413185029Spjd dmu_tx_wait(tx); 1414185029Spjd dmu_tx_abort(tx); 1415185029Spjd goto log; 1416185029Spjd } 1417185029Spjd dmu_tx_abort(tx); 1418185029Spjd return (error); 1419185029Spjd } 1420185029Spjd 1421185029Spjd zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1422185029Spjd zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1423185029Spjd 1424185029Spjd dmu_tx_commit(tx); 1425185029Spjd return (0); 1426185029Spjd} 1427185029Spjd 1428168404Spjdvoid 1429185029Spjdzfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1430168404Spjd{ 1431168404Spjd zfsvfs_t zfsvfs; 1432185029Spjd uint64_t moid, doid, version; 1433185029Spjd uint64_t sense = ZFS_CASE_SENSITIVE; 1434185029Spjd uint64_t norm = 0; 1435185029Spjd nvpair_t *elem; 1436168404Spjd int error; 1437168404Spjd znode_t *rootzp = NULL; 1438185029Spjd vnode_t *vp; 1439168404Spjd vattr_t vattr; 1440185029Spjd znode_t *zp; 1441168404Spjd 1442168404Spjd /* 1443168404Spjd * First attempt to create master node. 1444168404Spjd */ 1445168404Spjd /* 1446168404Spjd * In an empty objset, there are no blocks to read and thus 1447168404Spjd * there can be no i/o errors (which we assert below). 1448168404Spjd */ 1449168404Spjd moid = MASTER_NODE_OBJ; 1450168404Spjd error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1451168404Spjd DMU_OT_NONE, 0, tx); 1452168404Spjd ASSERT(error == 0); 1453168404Spjd 1454168404Spjd /* 1455168404Spjd * Set starting attributes. 
1456168404Spjd */ 1457185029Spjd if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1458185029Spjd version = ZPL_VERSION; 1459185029Spjd else 1460185029Spjd version = ZPL_VERSION_FUID - 1; 1461185029Spjd error = zap_update(os, moid, ZPL_VERSION_STR, 1462185029Spjd 8, 1, &version, tx); 1463185029Spjd elem = NULL; 1464185029Spjd while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1465185029Spjd /* For the moment we expect all zpl props to be uint64_ts */ 1466185029Spjd uint64_t val; 1467185029Spjd char *name; 1468168404Spjd 1469185029Spjd ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1470185029Spjd VERIFY(nvpair_value_uint64(elem, &val) == 0); 1471185029Spjd name = nvpair_name(elem); 1472185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1473185029Spjd version = val; 1474185029Spjd error = zap_update(os, moid, ZPL_VERSION_STR, 1475185029Spjd 8, 1, &version, tx); 1476185029Spjd } else { 1477185029Spjd error = zap_update(os, moid, name, 8, 1, &val, tx); 1478185029Spjd } 1479185029Spjd ASSERT(error == 0); 1480185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1481185029Spjd norm = val; 1482185029Spjd else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1483185029Spjd sense = val; 1484185029Spjd } 1485185029Spjd ASSERT(version != 0); 1486168404Spjd 1487168404Spjd /* 1488168404Spjd * Create a delete queue. 1489168404Spjd */ 1490168404Spjd doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1491168404Spjd 1492168404Spjd error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); 1493168404Spjd ASSERT(error == 0); 1494168404Spjd 1495168404Spjd /* 1496168404Spjd * Create root znode. Create minimal znode/vnode/zfsvfs 1497168404Spjd * to allow zfs_mknode to work. 
1498168404Spjd */ 1499185029Spjd VATTR_NULL(&vattr); 1500168404Spjd vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1501168404Spjd vattr.va_type = VDIR; 1502168404Spjd vattr.va_mode = S_IFDIR|0755; 1503185029Spjd vattr.va_uid = crgetuid(cr); 1504185029Spjd vattr.va_gid = crgetgid(cr); 1505168404Spjd 1506168404Spjd rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1507185029Spjd zfs_znode_cache_constructor(rootzp, &zfsvfs, 0); 1508168404Spjd rootzp->z_unlinked = 0; 1509168404Spjd rootzp->z_atime_dirty = 0; 1510168404Spjd 1511185029Spjd vp = ZTOV(rootzp); 1512185029Spjd vp->v_type = VDIR; 1513189696Sjhb VN_LOCK_ASHARE(vp); 1514185029Spjd 1515168404Spjd bzero(&zfsvfs, sizeof (zfsvfs_t)); 1516168404Spjd 1517168404Spjd zfsvfs.z_os = os; 1518168404Spjd zfsvfs.z_assign = TXG_NOWAIT; 1519168404Spjd zfsvfs.z_parent = &zfsvfs; 1520185029Spjd zfsvfs.z_version = version; 1521185029Spjd zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1522185029Spjd zfsvfs.z_norm = norm; 1523185029Spjd /* 1524185029Spjd * Fold case on file systems that are always or sometimes case 1525185029Spjd * insensitive. 
1526185029Spjd */ 1527185029Spjd if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1528185029Spjd zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1529168404Spjd 1530168404Spjd mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1531168404Spjd list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1532168404Spjd offsetof(znode_t, z_link_node)); 1533168404Spjd 1534185029Spjd ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1535185029Spjd rootzp->z_zfsvfs = &zfsvfs; 1536185029Spjd zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); 1537185029Spjd ASSERT3P(zp, ==, rootzp); 1538185029Spjd error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1539168404Spjd ASSERT(error == 0); 1540185029Spjd POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1541168404Spjd 1542185029Spjd VI_LOCK(vp); 1543185029Spjd ZTOV(rootzp)->v_data = NULL; 1544185029Spjd ZTOV(rootzp)->v_count = 0; 1545185029Spjd ZTOV(rootzp)->v_holdcnt = 0; 1546197153Spjd rootzp->z_vnode = NULL; 1547185029Spjd VOP_UNLOCK(vp, 0); 1548185029Spjd vdestroy(vp); 1549185029Spjd dmu_buf_rele(rootzp->z_dbuf, NULL); 1550185029Spjd rootzp->z_dbuf = NULL; 1551169325Spjd mutex_destroy(&zfsvfs.z_znodes_lock); 1552168404Spjd kmem_cache_free(znode_cache, rootzp); 1553168404Spjd} 1554185029Spjd 1555168404Spjd#endif /* _KERNEL */ 1556168404Spjd/* 1557168404Spjd * Given an object number, return its parent object number and whether 1558168404Spjd * or not the object is an extended attribute directory. 
1559168404Spjd */ 1560168404Spjdstatic int 1561168404Spjdzfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1562168404Spjd{ 1563168404Spjd dmu_buf_t *db; 1564168404Spjd dmu_object_info_t doi; 1565168404Spjd znode_phys_t *zp; 1566168404Spjd int error; 1567168404Spjd 1568168404Spjd if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1569168404Spjd return (error); 1570168404Spjd 1571168404Spjd dmu_object_info_from_db(db, &doi); 1572168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 1573168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 1574168404Spjd dmu_buf_rele(db, FTAG); 1575168404Spjd return (EINVAL); 1576168404Spjd } 1577168404Spjd 1578168404Spjd zp = db->db_data; 1579168404Spjd *pobjp = zp->zp_parent; 1580168404Spjd *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1581168404Spjd S_ISDIR(zp->zp_mode); 1582168404Spjd dmu_buf_rele(db, FTAG); 1583168404Spjd 1584168404Spjd return (0); 1585168404Spjd} 1586168404Spjd 1587168404Spjdint 1588168404Spjdzfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1589168404Spjd{ 1590168404Spjd char *path = buf + len - 1; 1591168404Spjd int error; 1592168404Spjd 1593168404Spjd *path = '\0'; 1594168404Spjd 1595168404Spjd for (;;) { 1596168404Spjd uint64_t pobj; 1597168404Spjd char component[MAXNAMELEN + 2]; 1598168404Spjd size_t complen; 1599168404Spjd int is_xattrdir; 1600168404Spjd 1601168404Spjd if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1602168404Spjd &is_xattrdir)) != 0) 1603168404Spjd break; 1604168404Spjd 1605168404Spjd if (pobj == obj) { 1606168404Spjd if (path[0] != '/') 1607168404Spjd *--path = '/'; 1608168404Spjd break; 1609168404Spjd } 1610168404Spjd 1611168404Spjd component[0] = '/'; 1612168404Spjd if (is_xattrdir) { 1613168404Spjd (void) sprintf(component + 1, "<xattrdir>"); 1614168404Spjd } else { 1615185029Spjd error = zap_value_search(osp, pobj, obj, 1616185029Spjd ZFS_DIRENT_OBJ(-1ULL), component + 1); 1617168404Spjd if (error != 0) 1618168404Spjd break; 
1619168404Spjd } 1620168404Spjd 1621168404Spjd complen = strlen(component); 1622168404Spjd path -= complen; 1623168404Spjd ASSERT(path >= buf); 1624168404Spjd bcopy(component, path, complen); 1625168404Spjd obj = pobj; 1626168404Spjd } 1627168404Spjd 1628168404Spjd if (error == 0) 1629168404Spjd (void) memmove(buf, path, buf + len - path); 1630168404Spjd return (error); 1631168404Spjd} 1632