zfs_znode.c revision 207334
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22185029Spjd * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25168404Spjd 26169195Spjd/* Portions Copyright 2007 Jeremy Teo */ 27169195Spjd 28168404Spjd#ifdef _KERNEL 29168404Spjd#include <sys/types.h> 30168404Spjd#include <sys/param.h> 31168404Spjd#include <sys/time.h> 32168404Spjd#include <sys/systm.h> 33168404Spjd#include <sys/sysmacros.h> 34168404Spjd#include <sys/resource.h> 35168404Spjd#include <sys/mntent.h> 36185029Spjd#include <sys/u8_textprep.h> 37185029Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/vfs.h> 39168404Spjd#include <sys/vnode.h> 40168404Spjd#include <sys/file.h> 41168404Spjd#include <sys/kmem.h> 42168404Spjd#include <sys/errno.h> 43168404Spjd#include <sys/unistd.h> 44168404Spjd#include <sys/atomic.h> 45168404Spjd#include <sys/zfs_dir.h> 46168404Spjd#include <sys/zfs_acl.h> 47168404Spjd#include <sys/zfs_ioctl.h> 48168404Spjd#include <sys/zfs_rlock.h> 49185029Spjd#include <sys/zfs_fuid.h> 50168404Spjd#include <sys/fs/zfs.h> 51185029Spjd#include <sys/kidmap.h> 52168404Spjd#endif /* _KERNEL */ 53168404Spjd 54168404Spjd#include <sys/dmu.h> 55168404Spjd#include <sys/refcount.h> 56168404Spjd#include <sys/stat.h> 57168404Spjd#include <sys/zap.h> 58168404Spjd#include <sys/zfs_znode.h> 59168404Spjd#include <sys/refcount.h> 60168404Spjd 61185029Spjd#include "zfs_prop.h" 62185029Spjd 63173268Slulf/* Used by fstat(1). */ 64173268SlulfSYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), 65173268Slulf "sizeof(znode_t)"); 66173268Slulf 67168404Spjd/* 68185029Spjd * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 69185029Spjd * turned on when DEBUG is also defined. 70185029Spjd */ 71185029Spjd#ifdef DEBUG 72185029Spjd#define ZNODE_STATS 73185029Spjd#endif /* DEBUG */ 74185029Spjd 75185029Spjd#ifdef ZNODE_STATS 76185029Spjd#define ZNODE_STAT_ADD(stat) ((stat)++) 77185029Spjd#else 78185029Spjd#define ZNODE_STAT_ADD(stat) /* nothing */ 79185029Spjd#endif /* ZNODE_STATS */ 80185029Spjd 81185029Spjd#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) 82185029Spjd#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) 83185029Spjd 84185029Spjd/* 85168404Spjd * Functions needed for userland (ie: libzpool) are not put under 86168404Spjd * #ifdef_KERNEL; the rest of the functions have dependencies 87168404Spjd * (such as VFS logic) that will not compile easily in userland. 88168404Spjd */ 89168404Spjd#ifdef _KERNEL 90185029Spjdstatic kmem_cache_t *znode_cache = NULL; 91168404Spjd 92168404Spjd/*ARGSUSED*/ 93168404Spjdstatic void 94185029Spjdznode_evict_error(dmu_buf_t *dbuf, void *user_ptr) 95168404Spjd{ 96185029Spjd#if 1 /* XXXPJD: From OpenSolaris. */ 97185029Spjd /* 98185029Spjd * We should never drop all dbuf refs without first clearing 99185029Spjd * the eviction callback. 100185029Spjd */ 101185029Spjd panic("evicting znode %p\n", user_ptr); 102185029Spjd#else /* XXXPJD */ 103168404Spjd znode_t *zp = user_ptr; 104168488Spjd vnode_t *vp; 105168404Spjd 106168404Spjd mutex_enter(&zp->z_lock); 107185029Spjd zp->z_dbuf = NULL; 108168488Spjd vp = ZTOV(zp); 109168404Spjd if (vp == NULL) { 110168404Spjd mutex_exit(&zp->z_lock); 111168404Spjd zfs_znode_free(zp); 112168404Spjd } else if (vp->v_count == 0) { 113197153Spjd zp->z_vnode = NULL; 114168488Spjd vhold(vp); 115168404Spjd mutex_exit(&zp->z_lock); 116185029Spjd vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); 117168404Spjd vrecycle(vp, curthread); 118175294Sattilio VOP_UNLOCK(vp, 0); 119168404Spjd vdrop(vp); 120168404Spjd zfs_znode_free(zp); 121168404Spjd } else { 122168404Spjd mutex_exit(&zp->z_lock); 123168404Spjd } 124185029Spjd#endif 125168404Spjd} 126168404Spjd 127168404Spjdextern struct vop_vector zfs_vnodeops; 128168404Spjdextern struct vop_vector zfs_fifoops; 129168404Spjd 130168404Spjd/* 131168404Spjd * XXX: We cannot use this function as a cache constructor, because 132168404Spjd * there is one global cache for all file systems and we need 133168404Spjd * to pass vfsp here, which is not possible, because argument 134168404Spjd * 'cdrarg' is defined at kmem_cache_create() time. 135168404Spjd */ 136168404Spjdstatic int 137185029Spjdzfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 138168404Spjd{ 139168404Spjd znode_t *zp = buf; 140169196Spjd vnode_t *vp; 141185029Spjd vfs_t *vfsp = arg; 142168404Spjd int error; 143168404Spjd 144185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 145185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 146185029Spjd 147199156Spjd if (vfsp != NULL) { 148199156Spjd error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); 149199156Spjd if (error != 0 && (kmflags & KM_NOSLEEP)) 150199156Spjd return (-1); 151199156Spjd ASSERT(error == 0); 152199156Spjd vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 153199156Spjd zp->z_vnode = vp; 154199156Spjd vp->v_data = (caddr_t)zp; 155199156Spjd VN_LOCK_AREC(vp); 156199156Spjd } else { 157199156Spjd zp->z_vnode = NULL; 158199156Spjd } 159185029Spjd 160185029Spjd list_link_init(&zp->z_link_node); 161185029Spjd 162168404Spjd mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 163168404Spjd rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); 164168404Spjd rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 165168404Spjd rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); 166168404Spjd mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 167168404Spjd 168168404Spjd mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); 169168404Spjd avl_create(&zp->z_range_avl, zfs_range_compare, 170168404Spjd sizeof (rl_t), offsetof(rl_t, r_node)); 171168404Spjd 172185029Spjd zp->z_dbuf = NULL; 173185029Spjd zp->z_dirlocks = NULL; 174168404Spjd return (0); 175168404Spjd} 176168404Spjd 177168404Spjd/*ARGSUSED*/ 178168404Spjdstatic void 179185029Spjdzfs_znode_cache_destructor(void *buf, void *arg) 180168404Spjd{ 181168404Spjd znode_t *zp = buf; 182168404Spjd 183185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 184185029Spjd ASSERT(ZTOV(zp) == NULL); 185185029Spjd vn_free(ZTOV(zp)); 186185029Spjd ASSERT(!list_link_active(&zp->z_link_node)); 187168404Spjd mutex_destroy(&zp->z_lock); 188168404Spjd rw_destroy(&zp->z_map_lock); 189168404Spjd rw_destroy(&zp->z_parent_lock); 190168404Spjd rw_destroy(&zp->z_name_lock); 191168404Spjd mutex_destroy(&zp->z_acl_lock); 192185029Spjd avl_destroy(&zp->z_range_avl); 193168404Spjd mutex_destroy(&zp->z_range_lock); 194168404Spjd 195185029Spjd ASSERT(zp->z_dbuf == NULL); 196185029Spjd ASSERT(zp->z_dirlocks == NULL); 197168404Spjd} 198168404Spjd 199185029Spjd#ifdef ZNODE_STATS 200185029Spjdstatic struct { 201185029Spjd uint64_t zms_zfsvfs_invalid; 202185029Spjd uint64_t zms_zfsvfs_unmounted; 203185029Spjd uint64_t zms_zfsvfs_recheck_invalid; 204185029Spjd uint64_t zms_obj_held; 205185029Spjd uint64_t zms_vnode_locked; 206185029Spjd uint64_t zms_not_only_dnlc; 207185029Spjd} znode_move_stats; 208185029Spjd#endif /* ZNODE_STATS */ 209185029Spjd 210185029Spjd#if defined(sun) 211185029Spjdstatic void 212185029Spjdzfs_znode_move_impl(znode_t *ozp, znode_t *nzp) 213185029Spjd{ 214185029Spjd vnode_t *vp; 215185029Spjd 216185029Spjd /* Copy fields. */ 217185029Spjd nzp->z_zfsvfs = ozp->z_zfsvfs; 218185029Spjd 219185029Spjd /* Swap vnodes. */ 220185029Spjd vp = nzp->z_vnode; 221185029Spjd nzp->z_vnode = ozp->z_vnode; 222185029Spjd ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ 223185029Spjd ZTOV(ozp)->v_data = ozp; 224185029Spjd ZTOV(nzp)->v_data = nzp; 225185029Spjd 226185029Spjd nzp->z_id = ozp->z_id; 227185029Spjd ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ 228185029Spjd ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); 229185029Spjd nzp->z_unlinked = ozp->z_unlinked; 230185029Spjd nzp->z_atime_dirty = ozp->z_atime_dirty; 231185029Spjd nzp->z_zn_prefetch = ozp->z_zn_prefetch; 232185029Spjd nzp->z_blksz = ozp->z_blksz; 233185029Spjd nzp->z_seq = ozp->z_seq; 234185029Spjd nzp->z_mapcnt = ozp->z_mapcnt; 235185029Spjd nzp->z_last_itx = ozp->z_last_itx; 236185029Spjd nzp->z_gen = ozp->z_gen; 237185029Spjd nzp->z_sync_cnt = ozp->z_sync_cnt; 238185029Spjd nzp->z_phys = ozp->z_phys; 239185029Spjd nzp->z_dbuf = ozp->z_dbuf; 240185029Spjd 241185029Spjd /* Update back pointers. */ 242185029Spjd (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, 243185029Spjd znode_evict_error); 244185029Spjd 245185029Spjd /* 246185029Spjd * Invalidate the original znode by clearing fields that provide a 247185029Spjd * pointer back to the znode. Set the low bit of the vfs pointer to 248185029Spjd * ensure that zfs_znode_move() recognizes the znode as invalid in any 249185029Spjd * subsequent callback. 250185029Spjd */ 251185029Spjd ozp->z_dbuf = NULL; 252185029Spjd POINTER_INVALIDATE(&ozp->z_zfsvfs); 253185029Spjd} 254185029Spjd 255185029Spjd/* 256185029Spjd * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise 257185029Spjd * returns a non-zero error code. 258185029Spjd */ 259185029Spjdstatic int 260185029Spjdzfs_enter(zfsvfs_t *zfsvfs) 261185029Spjd{ 262185029Spjd ZFS_ENTER(zfsvfs); 263185029Spjd return (0); 264185029Spjd} 265185029Spjd 266185029Spjd/*ARGSUSED*/ 267185029Spjdstatic kmem_cbrc_t 268185029Spjdzfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) 269185029Spjd{ 270185029Spjd znode_t *ozp = buf, *nzp = newbuf; 271185029Spjd zfsvfs_t *zfsvfs; 272185029Spjd vnode_t *vp; 273185029Spjd 274185029Spjd /* 275185029Spjd * The znode is on the file system's list of known znodes if the vfs 276185029Spjd * pointer is valid. We set the low bit of the vfs pointer when freeing 277185029Spjd * the znode to invalidate it, and the memory patterns written by kmem 278185029Spjd * (baddcafe and deadbeef) set at least one of the two low bits. A newly 279185029Spjd * created znode sets the vfs pointer last of all to indicate that the 280185029Spjd * znode is known and in a valid state to be moved by this function. 281185029Spjd */ 282185029Spjd zfsvfs = ozp->z_zfsvfs; 283185029Spjd if (!POINTER_IS_VALID(zfsvfs)) { 284185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); 285185029Spjd return (KMEM_CBRC_DONT_KNOW); 286185029Spjd } 287185029Spjd 288185029Spjd /* 289185029Spjd * Ensure that the filesystem is not unmounted during the move. 290185029Spjd */ 291185029Spjd if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ 292185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); 293185029Spjd return (KMEM_CBRC_DONT_KNOW); 294185029Spjd } 295185029Spjd 296185029Spjd mutex_enter(&zfsvfs->z_znodes_lock); 297185029Spjd /* 298185029Spjd * Recheck the vfs pointer in case the znode was removed just before 299185029Spjd * acquiring the lock. 300185029Spjd */ 301185029Spjd if (zfsvfs != ozp->z_zfsvfs) { 302185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 303185029Spjd ZFS_EXIT(zfsvfs); 304185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); 305185029Spjd return (KMEM_CBRC_DONT_KNOW); 306185029Spjd } 307185029Spjd 308185029Spjd /* 309185029Spjd * At this point we know that as long as we hold z_znodes_lock, the 310185029Spjd * znode cannot be freed and fields within the znode can be safely 311185029Spjd * accessed. Now, prevent a race with zfs_zget(). 312185029Spjd */ 313185029Spjd if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { 314185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 315185029Spjd ZFS_EXIT(zfsvfs); 316185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); 317185029Spjd return (KMEM_CBRC_LATER); 318185029Spjd } 319185029Spjd 320185029Spjd vp = ZTOV(ozp); 321185029Spjd if (mutex_tryenter(&vp->v_lock) == 0) { 322185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 323185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 324185029Spjd ZFS_EXIT(zfsvfs); 325185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); 326185029Spjd return (KMEM_CBRC_LATER); 327185029Spjd } 328185029Spjd 329185029Spjd /* Only move znodes that are referenced _only_ by the DNLC. */ 330185029Spjd if (vp->v_count != 1 || !vn_in_dnlc(vp)) { 331185029Spjd mutex_exit(&vp->v_lock); 332185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 333185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 334185029Spjd ZFS_EXIT(zfsvfs); 335185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); 336185029Spjd return (KMEM_CBRC_LATER); 337185029Spjd } 338185029Spjd 339185029Spjd /* 340185029Spjd * The znode is known and in a valid state to move. We're holding the 341185029Spjd * locks needed to execute the critical section. 342185029Spjd */ 343185029Spjd zfs_znode_move_impl(ozp, nzp); 344185029Spjd mutex_exit(&vp->v_lock); 345185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 346185029Spjd 347185029Spjd list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 348185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 349185029Spjd ZFS_EXIT(zfsvfs); 350185029Spjd 351185029Spjd return (KMEM_CBRC_YES); 352185029Spjd} 353185029Spjd#endif /* sun */ 354185029Spjd 355168404Spjdvoid 356168404Spjdzfs_znode_init(void) 357168404Spjd{ 358168404Spjd /* 359168404Spjd * Initialize zcache 360168404Spjd */ 361168404Spjd ASSERT(znode_cache == NULL); 362168404Spjd znode_cache = kmem_cache_create("zfs_znode_cache", 363168404Spjd sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, 364168404Spjd zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 365185029Spjd#if defined(sun) 366185029Spjd kmem_cache_set_move(znode_cache, zfs_znode_move); 367185029Spjd#endif 368168404Spjd} 369168404Spjd 370168404Spjdvoid 371168404Spjdzfs_znode_fini(void) 372168404Spjd{ 373168404Spjd /* 374168404Spjd * Cleanup zcache 375168404Spjd */ 376168404Spjd if (znode_cache) 377168404Spjd kmem_cache_destroy(znode_cache); 378168404Spjd znode_cache = NULL; 379168404Spjd} 380168404Spjd 381168404Spjd/* 382168404Spjd * zfs_init_fs - Initialize the zfsvfs struct and the file system 383168404Spjd * incore "master" object. Verify version compatibility. 384168404Spjd */ 385168404Spjdint 386185029Spjdzfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) 387168404Spjd{ 388168404Spjd objset_t *os = zfsvfs->z_os; 389168404Spjd int i, error; 390168404Spjd uint64_t fsid_guid; 391185029Spjd uint64_t zval; 392168404Spjd 393168404Spjd *zpp = NULL; 394168404Spjd 395185029Spjd error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 396168404Spjd if (error) { 397168404Spjd return (error); 398185029Spjd } else if (zfsvfs->z_version > ZPL_VERSION) { 399168404Spjd (void) printf("Mismatched versions: File system " 400185029Spjd "is version %llu on-disk format, which is " 401168404Spjd "incompatible with this software version %lld!", 402185029Spjd (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); 403168404Spjd return (ENOTSUP); 404168404Spjd } 405168404Spjd 406185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 407185029Spjd return (error); 408185029Spjd zfsvfs->z_norm = (int)zval; 409185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 410185029Spjd return (error); 411185029Spjd zfsvfs->z_utf8 = (zval != 0); 412185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 413185029Spjd return (error); 414185029Spjd zfsvfs->z_case = (uint_t)zval; 415168404Spjd /* 416185029Spjd * Fold case on file systems that are always or sometimes case 417185029Spjd * insensitive. 418185029Spjd */ 419185029Spjd if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 420185029Spjd zfsvfs->z_case == ZFS_CASE_MIXED) 421185029Spjd zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 422185029Spjd 423185029Spjd /* 424168404Spjd * The fsid is 64 bits, composed of an 8-bit fs type, which 425168404Spjd * separates our fsid from any other filesystem types, and a 426168404Spjd * 56-bit objset unique ID. The objset unique ID is unique to 427168404Spjd * all objsets open on this system, provided by unique_create(). 428168404Spjd * The 8-bit fs type must be put in the low bits of fsid[1] 429168404Spjd * because that's where other Solaris filesystems put it. 430168404Spjd */ 431168404Spjd fsid_guid = dmu_objset_fsid_guid(os); 432168404Spjd ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 433168404Spjd zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; 434168404Spjd zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 435168404Spjd zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; 436168404Spjd 437168404Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 438168404Spjd &zfsvfs->z_root); 439168404Spjd if (error) 440168404Spjd return (error); 441168404Spjd ASSERT(zfsvfs->z_root != 0); 442168404Spjd 443185029Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 444185029Spjd &zfsvfs->z_unlinkedobj); 445185029Spjd if (error) 446185029Spjd return (error); 447168404Spjd 448168404Spjd /* 449168404Spjd * Initialize zget mutex's 450168404Spjd */ 451168404Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 452168404Spjd mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 453168404Spjd 454168404Spjd error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); 455185029Spjd if (error) { 456185029Spjd /* 457185029Spjd * On error, we destroy the mutexes here since it's not 458185029Spjd * possible for the caller to determine if the mutexes were 459185029Spjd * initialized properly. 460185029Spjd */ 461185029Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 462185029Spjd mutex_destroy(&zfsvfs->z_hold_mtx[i]); 463168404Spjd return (error); 464185029Spjd } 465168404Spjd ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); 466185029Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 467185029Spjd &zfsvfs->z_fuid_obj); 468185029Spjd if (error == ENOENT) 469185029Spjd error = 0; 470168404Spjd 471168404Spjd return (0); 472168404Spjd} 473168404Spjd 474168404Spjd/* 475168404Spjd * define a couple of values we need available 476168404Spjd * for both 64 and 32 bit environments. 477168404Spjd */ 478168404Spjd#ifndef NBITSMINOR64 479168404Spjd#define NBITSMINOR64 32 480168404Spjd#endif 481168404Spjd#ifndef MAXMAJ64 482168404Spjd#define MAXMAJ64 0xffffffffUL 483168404Spjd#endif 484168404Spjd#ifndef MAXMIN64 485168404Spjd#define MAXMIN64 0xffffffffUL 486168404Spjd#endif 487168404Spjd 488168404Spjd/* 489168404Spjd * Create special expldev for ZFS private use. 490168404Spjd * Can't use standard expldev since it doesn't do 491168404Spjd * what we want. The standard expldev() takes a 492168404Spjd * dev32_t in LP64 and expands it to a long dev_t. 493168404Spjd * We need an interface that takes a dev32_t in ILP32 494168404Spjd * and expands it to a long dev_t. 495168404Spjd */ 496168404Spjdstatic uint64_t 497168404Spjdzfs_expldev(dev_t dev) 498168404Spjd{ 499187830Sed return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); 500168404Spjd} 501168404Spjd/* 502168404Spjd * Special cmpldev for ZFS private use. 503168404Spjd * Can't use standard cmpldev since it takes 504168404Spjd * a long dev_t and compresses it to dev32_t in 505168404Spjd * LP64. We need to do a compaction of a long dev_t 506168404Spjd * to a dev32_t in ILP32. 507168404Spjd */ 508168404Spjddev_t 509168404Spjdzfs_cmpldev(uint64_t dev) 510168404Spjd{ 511168958Spjd return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); 512168404Spjd} 513168404Spjd 514185029Spjdstatic void 515185029Spjdzfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) 516185029Spjd{ 517185029Spjd znode_t *nzp; 518185029Spjd 519185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 520185029Spjd ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 521185029Spjd 522185029Spjd mutex_enter(&zp->z_lock); 523185029Spjd 524185029Spjd ASSERT(zp->z_dbuf == NULL); 525185029Spjd zp->z_dbuf = db; 526185029Spjd nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); 527185029Spjd 528185029Spjd /* 529185029Spjd * there should be no 530185029Spjd * concurrent zgets on this object. 531185029Spjd */ 532185029Spjd if (nzp != NULL) 533185029Spjd panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); 534185029Spjd 535185029Spjd /* 536185029Spjd * Slap on VROOT if we are the root znode 537185029Spjd */ 538185029Spjd if (zp->z_id == zfsvfs->z_root) 539185029Spjd ZTOV(zp)->v_flag |= VROOT; 540185029Spjd 541185029Spjd mutex_exit(&zp->z_lock); 542185029Spjd vn_exists(ZTOV(zp)); 543185029Spjd} 544185029Spjd 545185029Spjdvoid 546185029Spjdzfs_znode_dmu_fini(znode_t *zp) 547185029Spjd{ 548185029Spjd dmu_buf_t *db = zp->z_dbuf; 549185029Spjd ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 550185029Spjd zp->z_unlinked || 551185029Spjd RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 552185029Spjd ASSERT(zp->z_dbuf != NULL); 553185029Spjd zp->z_dbuf = NULL; 554185029Spjd VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); 555185029Spjd dmu_buf_rele(db, NULL); 556185029Spjd} 557185029Spjd 558168404Spjd/* 559168404Spjd * Construct a new znode/vnode and intialize. 560168404Spjd * 561168404Spjd * This does not do a call to dmu_set_user() that is 562168404Spjd * up to the caller to do, in case you don't want to 563168404Spjd * return the znode 564168404Spjd */ 565168404Spjdstatic znode_t * 566185029Spjdzfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) 567168404Spjd{ 568168404Spjd znode_t *zp; 569168404Spjd vnode_t *vp; 570168404Spjd 571168404Spjd zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 572185029Spjd zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0); 573168404Spjd 574168404Spjd ASSERT(zp->z_dirlocks == NULL); 575185029Spjd ASSERT(zp->z_dbuf == NULL); 576185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 577168404Spjd 578185029Spjd /* 579185029Spjd * Defer setting z_zfsvfs until the znode is ready to be a candidate for 580185029Spjd * the zfs_znode_move() callback. 581185029Spjd */ 582185029Spjd zp->z_phys = NULL; 583168404Spjd zp->z_unlinked = 0; 584168404Spjd zp->z_atime_dirty = 0; 585168404Spjd zp->z_mapcnt = 0; 586168404Spjd zp->z_last_itx = 0; 587185029Spjd zp->z_id = db->db_object; 588168404Spjd zp->z_blksz = blksz; 589168404Spjd zp->z_seq = 0x7A4653; 590168404Spjd zp->z_sync_cnt = 0; 591168404Spjd 592185029Spjd vp = ZTOV(zp); 593185029Spjd#ifdef TODO 594185029Spjd vn_reinit(vp); 595185029Spjd#endif 596168404Spjd 597185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 598185029Spjd 599185029Spjd zp->z_gen = zp->z_phys->zp_gen; 600185029Spjd 601185029Spjd#if 0 602168404Spjd if (vp == NULL) 603168404Spjd return (zp); 604185029Spjd#endif 605168404Spjd 606168404Spjd vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); 607168404Spjd switch (vp->v_type) { 608168404Spjd case VDIR: 609168404Spjd zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 610168404Spjd break; 611168404Spjd case VFIFO: 612168404Spjd vp->v_op = &zfs_fifoops; 613168404Spjd break; 614168404Spjd } 615189696Sjhb if (vp->v_type != VFIFO) 616189696Sjhb VN_LOCK_ASHARE(vp); 617168404Spjd 618185029Spjd mutex_enter(&zfsvfs->z_znodes_lock); 619185029Spjd list_insert_tail(&zfsvfs->z_all_znodes, zp); 620185029Spjd membar_producer(); 621168404Spjd /* 622185029Spjd * Everything else must be valid before assigning z_zfsvfs makes the 623185029Spjd * znode eligible for zfs_znode_move(). 624168404Spjd */ 625185029Spjd zp->z_zfsvfs = zfsvfs; 626185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 627168404Spjd 628168404Spjd VFS_HOLD(zfsvfs->z_vfs); 629185029Spjd return (zp); 630168404Spjd} 631168404Spjd 632168404Spjd/* 633168404Spjd * Create a new DMU object to hold a zfs znode. 634168404Spjd * 635168404Spjd * IN: dzp - parent directory for new znode 636168404Spjd * vap - file attributes for new znode 637168404Spjd * tx - dmu transaction id for zap operations 638168404Spjd * cr - credentials of caller 639168404Spjd * flag - flags: 640168404Spjd * IS_ROOT_NODE - new object will be root 641168404Spjd * IS_XATTR - new object is an attribute 642168404Spjd * IS_REPLAY - intent log replay 643185029Spjd * bonuslen - length of bonus buffer 644185029Spjd * setaclp - File/Dir initial ACL 645185029Spjd * fuidp - Tracks fuid allocation. 646168404Spjd * 647185029Spjd * OUT: zpp - allocated znode 648168404Spjd * 649168404Spjd */ 650168404Spjdvoid 651185029Spjdzfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 652185029Spjd uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, 653185029Spjd zfs_fuid_info_t **fuidp) 654168404Spjd{ 655185029Spjd dmu_buf_t *db; 656168404Spjd znode_phys_t *pzp; 657168404Spjd zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 658168404Spjd timestruc_t now; 659185029Spjd uint64_t gen, obj; 660168404Spjd int err; 661168404Spjd 662168404Spjd ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 663168404Spjd 664168404Spjd if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ 665185029Spjd obj = vap->va_nodeid; 666168404Spjd flag |= IS_REPLAY; 667168404Spjd now = vap->va_ctime; /* see zfs_replay_create() */ 668168404Spjd gen = vap->va_nblocks; /* ditto */ 669168404Spjd } else { 670185029Spjd obj = 0; 671168404Spjd gethrestime(&now); 672168404Spjd gen = dmu_tx_get_txg(tx); 673168404Spjd } 674168404Spjd 675168404Spjd /* 676168404Spjd * Create a new DMU object. 677168404Spjd */ 678168404Spjd /* 679168404Spjd * There's currently no mechanism for pre-reading the blocks that will 680168404Spjd * be to needed allocate a new object, so we accept the small chance 681168404Spjd * that there will be an i/o error and we will fail one of the 682168404Spjd * assertions below. 683168404Spjd */ 684168404Spjd if (vap->va_type == VDIR) { 685168404Spjd if (flag & IS_REPLAY) { 686185029Spjd err = zap_create_claim_norm(zfsvfs->z_os, obj, 687185029Spjd zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 688168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 689168404Spjd ASSERT3U(err, ==, 0); 690168404Spjd } else { 691185029Spjd obj = zap_create_norm(zfsvfs->z_os, 692185029Spjd zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 693168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 694168404Spjd } 695168404Spjd } else { 696168404Spjd if (flag & IS_REPLAY) { 697185029Spjd err = dmu_object_claim(zfsvfs->z_os, obj, 698168404Spjd DMU_OT_PLAIN_FILE_CONTENTS, 0, 699168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 700168404Spjd ASSERT3U(err, ==, 0); 701168404Spjd } else { 702185029Spjd obj = dmu_object_alloc(zfsvfs->z_os, 703168404Spjd DMU_OT_PLAIN_FILE_CONTENTS, 0, 704168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 705168404Spjd } 706168404Spjd } 707207334Spjd 708207334Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 709185029Spjd VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 710185029Spjd dmu_buf_will_dirty(db, tx); 711168404Spjd 712168404Spjd /* 713168404Spjd * Initialize the znode physical data to zero. 714168404Spjd */ 715185029Spjd ASSERT(db->db_size >= sizeof (znode_phys_t)); 716185029Spjd bzero(db->db_data, db->db_size); 717185029Spjd pzp = db->db_data; 718168404Spjd 719168404Spjd /* 720168404Spjd * If this is the root, fix up the half-initialized parent pointer 721168404Spjd * to reference the just-allocated physical data area. 722168404Spjd */ 723168404Spjd if (flag & IS_ROOT_NODE) { 724185029Spjd dzp->z_dbuf = db; 725168404Spjd dzp->z_phys = pzp; 726185029Spjd dzp->z_id = obj; 727168404Spjd } 728168404Spjd 729168404Spjd /* 730168404Spjd * If parent is an xattr, so am I. 731168404Spjd */ 732168404Spjd if (dzp->z_phys->zp_flags & ZFS_XATTR) 733168404Spjd flag |= IS_XATTR; 734168404Spjd 735168404Spjd if (vap->va_type == VBLK || vap->va_type == VCHR) { 736168404Spjd pzp->zp_rdev = zfs_expldev(vap->va_rdev); 737168404Spjd } 738168404Spjd 739185029Spjd if (zfsvfs->z_use_fuids) 740185029Spjd pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 741185029Spjd 742168404Spjd if (vap->va_type == VDIR) { 743168404Spjd pzp->zp_size = 2; /* contents ("." and "..") */ 744168404Spjd pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 745168404Spjd } 746168404Spjd 747168404Spjd pzp->zp_parent = dzp->z_id; 748168404Spjd if (flag & IS_XATTR) 749168404Spjd pzp->zp_flags |= ZFS_XATTR; 750168404Spjd 751168404Spjd pzp->zp_gen = gen; 752168404Spjd 753168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 754168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 755168404Spjd 756168404Spjd if (vap->va_mask & AT_ATIME) { 757168404Spjd ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 758168404Spjd } else { 759168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_atime); 760168404Spjd } 761168404Spjd 762168404Spjd if (vap->va_mask & AT_MTIME) { 763168404Spjd ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 764168404Spjd } else { 765168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 766168404Spjd } 767168404Spjd 768168404Spjd pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); 769185029Spjd if (!(flag & IS_ROOT_NODE)) { 770185029Spjd *zpp = zfs_znode_alloc(zfsvfs, db, 0); 771185029Spjd } else { 772185029Spjd /* 773185029Spjd * If we are creating the root node, the "parent" we 774185029Spjd * passed in is the znode for the root. 775185029Spjd */ 776185029Spjd *zpp = dzp; 777185029Spjd } 778185029Spjd zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); 779207334Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 780185029Spjd if (!(flag & IS_ROOT_NODE)) { 781185029Spjd vnode_t *vp; 782168404Spjd 783185029Spjd vp = ZTOV(*zpp); 784185029Spjd vp->v_vflag |= VV_FORCEINSMQ; 785185029Spjd err = insmntque(vp, zfsvfs->z_vfs); 786185029Spjd vp->v_vflag &= ~VV_FORCEINSMQ; 787185029Spjd KASSERT(err == 0, ("insmntque() failed: error %d", err)); 788185029Spjd } 789185029Spjd} 790168404Spjd 791185029Spjdvoid 792185029Spjdzfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 793185029Spjd{ 794185029Spjd xoptattr_t *xoap; 795168404Spjd 796185029Spjd xoap = xva_getxoptattr(xvap); 797185029Spjd ASSERT(xoap); 798168404Spjd 799185029Spjd if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 800185029Spjd ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 801185029Spjd XVA_SET_RTN(xvap, XAT_CREATETIME); 802168404Spjd } 803185029Spjd if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 804185029Spjd ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 805185029Spjd XVA_SET_RTN(xvap, XAT_READONLY); 806185029Spjd } 807185029Spjd if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 808185029Spjd ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 809185029Spjd XVA_SET_RTN(xvap, XAT_HIDDEN); 810185029Spjd } 811185029Spjd if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 812185029Spjd ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 813185029Spjd XVA_SET_RTN(xvap, XAT_SYSTEM); 814185029Spjd } 815185029Spjd if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 816185029Spjd ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 817185029Spjd XVA_SET_RTN(xvap, XAT_ARCHIVE); 818185029Spjd } 819185029Spjd if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 820185029Spjd ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 821185029Spjd XVA_SET_RTN(xvap, XAT_IMMUTABLE); 822185029Spjd } 823185029Spjd if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 824185029Spjd ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 825185029Spjd XVA_SET_RTN(xvap, XAT_NOUNLINK); 826185029Spjd } 827185029Spjd if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 828185029Spjd ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 829185029Spjd XVA_SET_RTN(xvap, XAT_APPENDONLY); 830185029Spjd } 831185029Spjd if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 832185029Spjd ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 833185029Spjd XVA_SET_RTN(xvap, XAT_NODUMP); 834185029Spjd } 835185029Spjd if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 836185029Spjd ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); 837185029Spjd XVA_SET_RTN(xvap, XAT_OPAQUE); 838185029Spjd } 839185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 840185029Spjd ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 841185029Spjd xoap->xoa_av_quarantined); 842185029Spjd XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 843185029Spjd } 844185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 845185029Spjd ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); 846185029Spjd XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 847185029Spjd } 848185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 849185029Spjd (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, 850185029Spjd sizeof (xoap->xoa_av_scanstamp)); 851185029Spjd zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; 852185029Spjd XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 853185029Spjd } 854168404Spjd} 855168404Spjd 856168404Spjdint 857168404Spjdzfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 858168404Spjd{ 859168404Spjd dmu_object_info_t doi; 860168404Spjd dmu_buf_t *db; 861168404Spjd znode_t *zp; 862168404Spjd vnode_t *vp; 863185029Spjd int err, first = 1; 864168404Spjd 865168404Spjd *zpp = NULL; 866185029Spjdagain: 867168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 868168404Spjd 869168404Spjd err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 870168404Spjd if (err) { 871168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 872168404Spjd return (err); 873168404Spjd } 874168404Spjd 875168404Spjd dmu_object_info_from_db(db, &doi); 876168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 877168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 878168404Spjd dmu_buf_rele(db, NULL); 879168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 880168404Spjd return (EINVAL); 881168404Spjd } 882168404Spjd 883168404Spjd zp = dmu_buf_get_user(db); 884168404Spjd if (zp != NULL) { 885168404Spjd mutex_enter(&zp->z_lock); 886168404Spjd 887185029Spjd /* 888185029Spjd * Since we do immediate eviction of the z_dbuf, we 889185029Spjd * should never find a dbuf with a znode that doesn't 890185029Spjd * know about the dbuf. 891185029Spjd */ 892185029Spjd ASSERT3P(zp->z_dbuf, ==, db); 893168404Spjd ASSERT3U(zp->z_id, ==, obj_num); 894168404Spjd if (zp->z_unlinked) { 895185029Spjd err = ENOENT; 896168404Spjd } else { 897197458Spjd int dying = 0; 898197458Spjd 899197458Spjd vp = ZTOV(zp); 900197458Spjd if (vp == NULL) 901197458Spjd dying = 1; 902197458Spjd else { 903197458Spjd VN_HOLD(vp); 904197131Spjd if ((vp->v_iflag & VI_DOOMED) != 0) { 905197458Spjd dying = 1; 906197458Spjd /* 907197458Spjd * Don't VN_RELE() vnode here, because 908197458Spjd * it can call vn_lock() which creates 909197458Spjd * LOR between vnode lock and znode 910197458Spjd * lock. We will VN_RELE() the vnode 911197458Spjd * after droping znode lock. 912197458Spjd */ 913197458Spjd } 914197131Spjd } 915197458Spjd if (dying) { 916185029Spjd if (first) { 917185029Spjd ZFS_LOG(1, "dying znode detected (zp=%p)", zp); 918185029Spjd first = 0; 919185029Spjd } 920185029Spjd /* 921185029Spjd * znode is dying so we can't reuse it, we must 922185029Spjd * wait until destruction is completed. 923185029Spjd */ 924185029Spjd dmu_buf_rele(db, NULL); 925185029Spjd mutex_exit(&zp->z_lock); 926185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 927197458Spjd if (vp != NULL) 928197458Spjd VN_RELE(vp); 929185029Spjd tsleep(zp, 0, "zcollide", 1); 930185029Spjd goto again; 931185029Spjd } 932185029Spjd *zpp = zp; 933185029Spjd err = 0; 934168404Spjd } 935185029Spjd dmu_buf_rele(db, NULL); 936168404Spjd mutex_exit(&zp->z_lock); 937168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 938185029Spjd return (err); 939168404Spjd } 940168404Spjd 941168404Spjd /* 942168404Spjd * Not found create new znode/vnode 943207334Spjd * but only if file exists. 944207334Spjd * 945207334Spjd * There is a small window where zfs_vget() could 946207334Spjd * find this object while a file create is still in 947207334Spjd * progress. Since a gen number can never be zero 948207334Spjd * we will check that to determine if its an allocated 949207334Spjd * file. 950168404Spjd */ 951185029Spjd 952207334Spjd if (((znode_phys_t *)db->db_data)->zp_gen != 0) { 953207334Spjd zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); 954207334Spjd *zpp = zp; 955207334Spjd vp = ZTOV(zp); 956207334Spjd vp->v_vflag |= VV_FORCEINSMQ; 957207334Spjd err = insmntque(vp, zfsvfs->z_vfs); 958207334Spjd vp->v_vflag &= ~VV_FORCEINSMQ; 959207334Spjd KASSERT(err == 0, ("insmntque() failed: error %d", err)); 960207334Spjd VOP_UNLOCK(vp, 0); 961207334Spjd err = 0; 962207334Spjd } else { 963207334Spjd dmu_buf_rele(db, NULL); 964207334Spjd err = ENOENT; 965207334Spjd } 966168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 967207334Spjd return (err); 968168404Spjd} 969168404Spjd 970185029Spjdint 971185029Spjdzfs_rezget(znode_t *zp) 972185029Spjd{ 973185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 974185029Spjd dmu_object_info_t doi; 975185029Spjd dmu_buf_t *db; 976185029Spjd uint64_t obj_num = zp->z_id; 977185029Spjd int err; 978185029Spjd 979185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 980185029Spjd 981185029Spjd err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 982185029Spjd if (err) { 983185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 984185029Spjd return (err); 985185029Spjd } 986185029Spjd 987185029Spjd dmu_object_info_from_db(db, &doi); 988185029Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 989185029Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 990185029Spjd dmu_buf_rele(db, NULL); 991185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 992185029Spjd return (EINVAL); 993185029Spjd } 994185029Spjd 995185029Spjd if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 996185029Spjd dmu_buf_rele(db, NULL); 997185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 998185029Spjd return (EIO); 999185029Spjd } 1000185029Spjd 1001185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 1002185029Spjd zp->z_unlinked = (zp->z_phys->zp_links == 0); 1003185029Spjd zp->z_blksz = doi.doi_data_block_size; 1004185029Spjd 1005185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1006185029Spjd 1007185029Spjd return (0); 1008185029Spjd} 1009185029Spjd 1010168404Spjdvoid 1011168404Spjdzfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 1012168404Spjd{ 1013168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1014185029Spjd objset_t *os = zfsvfs->z_os; 1015185029Spjd uint64_t obj = zp->z_id; 1016185029Spjd uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 1017168404Spjd 1018185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 1019185029Spjd if (acl_obj) 1020185029Spjd VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1021185029Spjd VERIFY(0 == dmu_object_free(os, obj, tx)); 1022185029Spjd zfs_znode_dmu_fini(zp); 1023185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 1024185029Spjd zfs_znode_free(zp); 1025168404Spjd} 1026168404Spjd 1027168404Spjdvoid 1028168404Spjdzfs_zinactive(znode_t *zp) 1029168404Spjd{ 1030168404Spjd vnode_t *vp = ZTOV(zp); 1031168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1032168404Spjd uint64_t z_id = zp->z_id; 1033201406Sdelphij int vfslocked; 1034168404Spjd 1035185029Spjd ASSERT(zp->z_dbuf && zp->z_phys); 1036168404Spjd 1037168404Spjd /* 1038168404Spjd * Don't allow a zfs_zget() while were trying to release this znode 1039168404Spjd */ 1040168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1041168404Spjd 1042168404Spjd mutex_enter(&zp->z_lock); 1043168404Spjd VI_LOCK(vp); 1044168404Spjd if (vp->v_count > 0) { 1045168404Spjd /* 1046168404Spjd * If the hold count is greater than zero, somebody has 1047168404Spjd * obtained a new reference on this znode while we were 1048168404Spjd * processing it here, so we are done. 1049168404Spjd */ 1050168404Spjd VI_UNLOCK(vp); 1051168404Spjd mutex_exit(&zp->z_lock); 1052168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1053168404Spjd return; 1054168404Spjd } 1055168404Spjd VI_UNLOCK(vp); 1056168404Spjd 1057168404Spjd /* 1058168404Spjd * If this was the last reference to a file with no links, 1059168404Spjd * remove the file from the file system. 1060168404Spjd */ 1061168404Spjd if (zp->z_unlinked) { 1062168404Spjd mutex_exit(&zp->z_lock); 1063168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1064168404Spjd ASSERT(vp->v_count == 0); 1065168404Spjd vrecycle(vp, curthread); 1066201406Sdelphij vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs); 1067168404Spjd zfs_rmnode(zp); 1068201406Sdelphij VFS_UNLOCK_GIANT(vfslocked); 1069168404Spjd return; 1070168404Spjd } 1071168404Spjd mutex_exit(&zp->z_lock); 1072168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1073168404Spjd} 1074168404Spjd 1075168404Spjdvoid 1076168404Spjdzfs_znode_free(znode_t *zp) 1077168404Spjd{ 1078168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1079168404Spjd 1080185029Spjd ASSERT(ZTOV(zp) == NULL); 1081168404Spjd mutex_enter(&zfsvfs->z_znodes_lock); 1082185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 1083168404Spjd list_remove(&zfsvfs->z_all_znodes, zp); 1084168404Spjd mutex_exit(&zfsvfs->z_znodes_lock); 1085168404Spjd 1086168404Spjd kmem_cache_free(znode_cache, zp); 1087185029Spjd 1088185029Spjd VFS_RELE(zfsvfs->z_vfs); 1089168404Spjd} 1090168404Spjd 1091168404Spjdvoid 1092168404Spjdzfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1093168404Spjd{ 1094168404Spjd timestruc_t now; 1095168404Spjd 1096168404Spjd ASSERT(MUTEX_HELD(&zp->z_lock)); 1097168404Spjd 1098168404Spjd gethrestime(&now); 1099168404Spjd 1100168404Spjd if (tx) { 1101168404Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1102168404Spjd zp->z_atime_dirty = 0; 1103168404Spjd zp->z_seq++; 1104168404Spjd } else { 1105168404Spjd zp->z_atime_dirty = 1; 1106168404Spjd } 1107168404Spjd 1108168404Spjd if (flag & AT_ATIME) 1109168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1110168404Spjd 1111185029Spjd if (flag & AT_MTIME) { 1112168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1113185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1114185029Spjd zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1115185029Spjd } 1116168404Spjd 1117185029Spjd if (flag & AT_CTIME) { 1118168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1119185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1120185029Spjd zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1121185029Spjd } 1122168404Spjd} 1123168404Spjd 1124168404Spjd/* 1125168404Spjd * Update the requested znode timestamps with the current time. 1126168404Spjd * If we are in a transaction, then go ahead and mark the znode 1127168404Spjd * dirty in the transaction so the timestamps will go to disk. 1128168404Spjd * Otherwise, we will get pushed next time the znode is updated 1129168404Spjd * in a transaction, or when this znode eventually goes inactive. 1130168404Spjd * 1131168404Spjd * Why is this OK? 1132168404Spjd * 1 - Only the ACCESS time is ever updated outside of a transaction. 1133168404Spjd * 2 - Multiple consecutive updates will be collapsed into a single 1134168404Spjd * znode update by the transaction grouping semantics of the DMU. 1135168404Spjd */ 1136168404Spjdvoid 1137168404Spjdzfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1138168404Spjd{ 1139168404Spjd mutex_enter(&zp->z_lock); 1140168404Spjd zfs_time_stamper_locked(zp, flag, tx); 1141168404Spjd mutex_exit(&zp->z_lock); 1142168404Spjd} 1143168404Spjd 1144168404Spjd/* 1145168404Spjd * Grow the block size for a file. 1146168404Spjd * 1147168404Spjd * IN: zp - znode of file to free data in. 1148168404Spjd * size - requested block size 1149168404Spjd * tx - open transaction. 1150168404Spjd * 1151168404Spjd * NOTE: this function assumes that the znode is write locked. 1152168404Spjd */ 1153168404Spjdvoid 1154168404Spjdzfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1155168404Spjd{ 1156168404Spjd int error; 1157168404Spjd u_longlong_t dummy; 1158168404Spjd 1159168404Spjd if (size <= zp->z_blksz) 1160168404Spjd return; 1161168404Spjd /* 1162168404Spjd * If the file size is already greater than the current blocksize, 1163168404Spjd * we will not grow. If there is more than one block in a file, 1164168404Spjd * the blocksize cannot change. 1165168404Spjd */ 1166168404Spjd if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1167168404Spjd return; 1168168404Spjd 1169168404Spjd error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1170168404Spjd size, 0, tx); 1171168404Spjd if (error == ENOTSUP) 1172168404Spjd return; 1173168404Spjd ASSERT3U(error, ==, 0); 1174168404Spjd 1175168404Spjd /* What blocksize did we actually get? */ 1176168404Spjd dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1177168404Spjd} 1178168404Spjd 1179168404Spjd/* 1180185029Spjd * Increase the file length 1181168404Spjd * 1182168404Spjd * IN: zp - znode of file to free data in. 1183185029Spjd * end - new end-of-file 1184168404Spjd * 1185168404Spjd * RETURN: 0 if success 1186168404Spjd * error code if failure 1187168404Spjd */ 1188185029Spjdstatic int 1189185029Spjdzfs_extend(znode_t *zp, uint64_t end) 1190168404Spjd{ 1191185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1192168404Spjd dmu_tx_t *tx; 1193168404Spjd rl_t *rl; 1194185029Spjd uint64_t newblksz; 1195168404Spjd int error; 1196168404Spjd 1197168404Spjd /* 1198185029Spjd * We will change zp_size, lock the whole file. 1199168404Spjd */ 1200185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1201168404Spjd 1202168404Spjd /* 1203168404Spjd * Nothing to do if file already at desired length. 1204168404Spjd */ 1205185029Spjd if (end <= zp->z_phys->zp_size) { 1206168404Spjd zfs_range_unlock(rl); 1207168404Spjd return (0); 1208168404Spjd } 1209185029Spjdtop: 1210168404Spjd tx = dmu_tx_create(zfsvfs->z_os); 1211168404Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1212185029Spjd if (end > zp->z_blksz && 1213168404Spjd (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1214168404Spjd /* 1215168404Spjd * We are growing the file past the current block size. 1216168404Spjd */ 1217168404Spjd if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1218168404Spjd ASSERT(!ISP2(zp->z_blksz)); 1219185029Spjd newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1220168404Spjd } else { 1221185029Spjd newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1222168404Spjd } 1223185029Spjd dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1224185029Spjd } else { 1225185029Spjd newblksz = 0; 1226168404Spjd } 1227168404Spjd 1228168404Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1229168404Spjd if (error) { 1230185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1231168404Spjd dmu_tx_wait(tx); 1232185029Spjd dmu_tx_abort(tx); 1233185029Spjd goto top; 1234185029Spjd } 1235168404Spjd dmu_tx_abort(tx); 1236168404Spjd zfs_range_unlock(rl); 1237168404Spjd return (error); 1238168404Spjd } 1239185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1240168404Spjd 1241185029Spjd if (newblksz) 1242185029Spjd zfs_grow_blocksize(zp, newblksz, tx); 1243168404Spjd 1244185029Spjd zp->z_phys->zp_size = end; 1245168404Spjd 1246185029Spjd zfs_range_unlock(rl); 1247168404Spjd 1248185029Spjd dmu_tx_commit(tx); 1249185029Spjd 1250185029Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1251185029Spjd error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1252185029Spjd ASSERT(error == 0); 1253185029Spjd vnode_pager_setsize(ZTOV(zp), end); 1254185029Spjd rw_exit(&zp->z_map_lock); 1255185029Spjd 1256185029Spjd return (0); 1257185029Spjd} 1258185029Spjd 1259185029Spjd/* 1260185029Spjd * Free space in a file. 1261185029Spjd * 1262185029Spjd * IN: zp - znode of file to free data in. 1263185029Spjd * off - start of section to free. 1264185029Spjd * len - length of section to free. 1265185029Spjd * 1266185029Spjd * RETURN: 0 if success 1267185029Spjd * error code if failure 1268185029Spjd */ 1269185029Spjdstatic int 1270185029Spjdzfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1271185029Spjd{ 1272185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1273185029Spjd rl_t *rl; 1274185029Spjd int error; 1275185029Spjd 1276185029Spjd /* 1277185029Spjd * Lock the range being freed. 1278185029Spjd */ 1279185029Spjd rl = zfs_range_lock(zp, off, len, RL_WRITER); 1280185029Spjd 1281185029Spjd /* 1282185029Spjd * Nothing to do if file already at desired length. 1283185029Spjd */ 1284185029Spjd if (off >= zp->z_phys->zp_size) { 1285185029Spjd zfs_range_unlock(rl); 1286185029Spjd return (0); 1287168404Spjd } 1288168404Spjd 1289185029Spjd if (off + len > zp->z_phys->zp_size) 1290185029Spjd len = zp->z_phys->zp_size - off; 1291185029Spjd 1292185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1293185029Spjd 1294185029Spjd if (error == 0) { 1295185029Spjd /* 1296185029Spjd * In FreeBSD we cannot free block in the middle of a file, 1297185029Spjd * but only at the end of a file. 1298185029Spjd */ 1299185029Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1300185029Spjd error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1301185029Spjd ASSERT(error == 0); 1302185029Spjd vnode_pager_setsize(ZTOV(zp), off); 1303185029Spjd rw_exit(&zp->z_map_lock); 1304168404Spjd } 1305168404Spjd 1306168404Spjd zfs_range_unlock(rl); 1307168404Spjd 1308185029Spjd return (error); 1309185029Spjd} 1310185029Spjd 1311185029Spjd/* 1312185029Spjd * Truncate a file 1313185029Spjd * 1314185029Spjd * IN: zp - znode of file to free data in. 1315185029Spjd * end - new end-of-file. 1316185029Spjd * 1317185029Spjd * RETURN: 0 if success 1318185029Spjd * error code if failure 1319185029Spjd */ 1320185029Spjdstatic int 1321185029Spjdzfs_trunc(znode_t *zp, uint64_t end) 1322185029Spjd{ 1323185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1324185029Spjd vnode_t *vp = ZTOV(zp); 1325185029Spjd dmu_tx_t *tx; 1326185029Spjd rl_t *rl; 1327185029Spjd int error; 1328185029Spjd 1329185029Spjd /* 1330185029Spjd * We will change zp_size, lock the whole file. 1331185029Spjd */ 1332185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1333185029Spjd 1334185029Spjd /* 1335185029Spjd * Nothing to do if file already at desired length. 1336185029Spjd */ 1337185029Spjd if (end >= zp->z_phys->zp_size) { 1338185029Spjd zfs_range_unlock(rl); 1339185029Spjd return (0); 1340185029Spjd } 1341185029Spjd 1342185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1343185029Spjd if (error) { 1344185029Spjd zfs_range_unlock(rl); 1345185029Spjd return (error); 1346185029Spjd } 1347185029Spjdtop: 1348185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1349185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1350185029Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1351185029Spjd if (error) { 1352185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1353185029Spjd dmu_tx_wait(tx); 1354185029Spjd dmu_tx_abort(tx); 1355185029Spjd goto top; 1356185029Spjd } 1357185029Spjd dmu_tx_abort(tx); 1358185029Spjd zfs_range_unlock(rl); 1359185029Spjd return (error); 1360185029Spjd } 1361185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1362185029Spjd 1363185029Spjd zp->z_phys->zp_size = end; 1364185029Spjd 1365168404Spjd dmu_tx_commit(tx); 1366168404Spjd 1367185029Spjd zfs_range_unlock(rl); 1368185029Spjd 1369168404Spjd /* 1370168404Spjd * Clear any mapped pages in the truncated region. This has to 1371168404Spjd * happen outside of the transaction to avoid the possibility of 1372168404Spjd * a deadlock with someone trying to push a page that we are 1373168404Spjd * about to invalidate. 1374168404Spjd */ 1375168404Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1376168404Spjd#if 0 1377185029Spjd error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); 1378168404Spjd#else 1379185029Spjd error = vinvalbuf(vp, V_SAVE, 0, 0); 1380185029Spjd ASSERT(error == 0); 1381185029Spjd vnode_pager_setsize(vp, end); 1382168404Spjd#endif 1383168404Spjd rw_exit(&zp->z_map_lock); 1384168404Spjd 1385168404Spjd return (0); 1386168404Spjd} 1387168404Spjd 1388185029Spjd/* 1389185029Spjd * Free space in a file 1390185029Spjd * 1391185029Spjd * IN: zp - znode of file to free data in. 1392185029Spjd * off - start of range 1393185029Spjd * len - end of range (0 => EOF) 1394185029Spjd * flag - current file open mode flags. 1395185029Spjd * log - TRUE if this action should be logged 1396185029Spjd * 1397185029Spjd * RETURN: 0 if success 1398185029Spjd * error code if failure 1399185029Spjd */ 1400185029Spjdint 1401185029Spjdzfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1402185029Spjd{ 1403185029Spjd vnode_t *vp = ZTOV(zp); 1404185029Spjd dmu_tx_t *tx; 1405185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1406185029Spjd zilog_t *zilog = zfsvfs->z_log; 1407185029Spjd int error; 1408185029Spjd 1409185029Spjd if (off > zp->z_phys->zp_size) { 1410185029Spjd error = zfs_extend(zp, off+len); 1411185029Spjd if (error == 0 && log) 1412185029Spjd goto log; 1413185029Spjd else 1414185029Spjd return (error); 1415185029Spjd } 1416185029Spjd 1417185029Spjd if (len == 0) { 1418185029Spjd error = zfs_trunc(zp, off); 1419185029Spjd } else { 1420185029Spjd if ((error = zfs_free_range(zp, off, len)) == 0 && 1421185029Spjd off + len > zp->z_phys->zp_size) 1422185029Spjd error = zfs_extend(zp, off+len); 1423185029Spjd } 1424185029Spjd if (error || !log) 1425185029Spjd return (error); 1426185029Spjdlog: 1427185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1428185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1429185029Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1430185029Spjd if (error) { 1431185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1432185029Spjd dmu_tx_wait(tx); 1433185029Spjd dmu_tx_abort(tx); 1434185029Spjd goto log; 1435185029Spjd } 1436185029Spjd dmu_tx_abort(tx); 1437185029Spjd return (error); 1438185029Spjd } 1439185029Spjd 1440185029Spjd zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1441185029Spjd zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1442185029Spjd 1443185029Spjd dmu_tx_commit(tx); 1444185029Spjd return (0); 1445185029Spjd} 1446185029Spjd 1447168404Spjdvoid 1448185029Spjdzfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1449168404Spjd{ 1450168404Spjd zfsvfs_t zfsvfs; 1451185029Spjd uint64_t moid, doid, version; 1452185029Spjd uint64_t sense = ZFS_CASE_SENSITIVE; 1453185029Spjd uint64_t norm = 0; 1454185029Spjd nvpair_t *elem; 1455168404Spjd int error; 1456207334Spjd int i; 1457168404Spjd znode_t *rootzp = NULL; 1458199156Spjd vnode_t vnode; 1459168404Spjd vattr_t vattr; 1460185029Spjd znode_t *zp; 1461168404Spjd 1462168404Spjd /* 1463168404Spjd * First attempt to create master node. 1464168404Spjd */ 1465168404Spjd /* 1466168404Spjd * In an empty objset, there are no blocks to read and thus 1467168404Spjd * there can be no i/o errors (which we assert below). 1468168404Spjd */ 1469168404Spjd moid = MASTER_NODE_OBJ; 1470168404Spjd error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1471168404Spjd DMU_OT_NONE, 0, tx); 1472168404Spjd ASSERT(error == 0); 1473168404Spjd 1474168404Spjd /* 1475168404Spjd * Set starting attributes. 1476168404Spjd */ 1477185029Spjd if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1478185029Spjd version = ZPL_VERSION; 1479185029Spjd else 1480185029Spjd version = ZPL_VERSION_FUID - 1; 1481185029Spjd error = zap_update(os, moid, ZPL_VERSION_STR, 1482185029Spjd 8, 1, &version, tx); 1483185029Spjd elem = NULL; 1484185029Spjd while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1485185029Spjd /* For the moment we expect all zpl props to be uint64_ts */ 1486185029Spjd uint64_t val; 1487185029Spjd char *name; 1488168404Spjd 1489185029Spjd ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1490185029Spjd VERIFY(nvpair_value_uint64(elem, &val) == 0); 1491185029Spjd name = nvpair_name(elem); 1492185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1493185029Spjd version = val; 1494185029Spjd error = zap_update(os, moid, ZPL_VERSION_STR, 1495185029Spjd 8, 1, &version, tx); 1496185029Spjd } else { 1497185029Spjd error = zap_update(os, moid, name, 8, 1, &val, tx); 1498185029Spjd } 1499185029Spjd ASSERT(error == 0); 1500185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1501185029Spjd norm = val; 1502185029Spjd else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1503185029Spjd sense = val; 1504185029Spjd } 1505185029Spjd ASSERT(version != 0); 1506168404Spjd 1507168404Spjd /* 1508168404Spjd * Create a delete queue. 1509168404Spjd */ 1510168404Spjd doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1511168404Spjd 1512168404Spjd error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); 1513168404Spjd ASSERT(error == 0); 1514168404Spjd 1515168404Spjd /* 1516168404Spjd * Create root znode. Create minimal znode/vnode/zfsvfs 1517168404Spjd * to allow zfs_mknode to work. 1518168404Spjd */ 1519185029Spjd VATTR_NULL(&vattr); 1520168404Spjd vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1521168404Spjd vattr.va_type = VDIR; 1522168404Spjd vattr.va_mode = S_IFDIR|0755; 1523185029Spjd vattr.va_uid = crgetuid(cr); 1524185029Spjd vattr.va_gid = crgetgid(cr); 1525168404Spjd 1526168404Spjd rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1527199156Spjd zfs_znode_cache_constructor(rootzp, NULL, 0); 1528168404Spjd rootzp->z_unlinked = 0; 1529168404Spjd rootzp->z_atime_dirty = 0; 1530168404Spjd 1531199156Spjd vnode.v_type = VDIR; 1532199156Spjd vnode.v_data = rootzp; 1533199156Spjd rootzp->z_vnode = &vnode; 1534185029Spjd 1535168404Spjd bzero(&zfsvfs, sizeof (zfsvfs_t)); 1536168404Spjd 1537168404Spjd zfsvfs.z_os = os; 1538168404Spjd zfsvfs.z_assign = TXG_NOWAIT; 1539168404Spjd zfsvfs.z_parent = &zfsvfs; 1540185029Spjd zfsvfs.z_version = version; 1541185029Spjd zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1542185029Spjd zfsvfs.z_norm = norm; 1543185029Spjd /* 1544185029Spjd * Fold case on file systems that are always or sometimes case 1545185029Spjd * insensitive. 1546185029Spjd */ 1547185029Spjd if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1548185029Spjd zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1549168404Spjd 1550168404Spjd mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1551168404Spjd list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1552168404Spjd offsetof(znode_t, z_link_node)); 1553168404Spjd 1554207334Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1555207334Spjd mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1556207334Spjd 1557185029Spjd ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1558185029Spjd rootzp->z_zfsvfs = &zfsvfs; 1559185029Spjd zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); 1560185029Spjd ASSERT3P(zp, ==, rootzp); 1561185029Spjd error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1562168404Spjd ASSERT(error == 0); 1563185029Spjd POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1564168404Spjd 1565185029Spjd dmu_buf_rele(rootzp->z_dbuf, NULL); 1566185029Spjd rootzp->z_dbuf = NULL; 1567207334Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1568207334Spjd mutex_destroy(&zfsvfs.z_hold_mtx[i]); 1569169325Spjd mutex_destroy(&zfsvfs.z_znodes_lock); 1570199156Spjd rootzp->z_vnode = NULL; 1571168404Spjd kmem_cache_free(znode_cache, rootzp); 1572168404Spjd} 1573185029Spjd 1574168404Spjd#endif /* _KERNEL */ 1575168404Spjd/* 1576168404Spjd * Given an object number, return its parent object number and whether 1577168404Spjd * or not the object is an extended attribute directory. 1578168404Spjd */ 1579168404Spjdstatic int 1580168404Spjdzfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1581168404Spjd{ 1582168404Spjd dmu_buf_t *db; 1583168404Spjd dmu_object_info_t doi; 1584168404Spjd znode_phys_t *zp; 1585168404Spjd int error; 1586168404Spjd 1587168404Spjd if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1588168404Spjd return (error); 1589168404Spjd 1590168404Spjd dmu_object_info_from_db(db, &doi); 1591168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 1592168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 1593168404Spjd dmu_buf_rele(db, FTAG); 1594168404Spjd return (EINVAL); 1595168404Spjd } 1596168404Spjd 1597168404Spjd zp = db->db_data; 1598168404Spjd *pobjp = zp->zp_parent; 1599168404Spjd *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1600168404Spjd S_ISDIR(zp->zp_mode); 1601168404Spjd dmu_buf_rele(db, FTAG); 1602168404Spjd 1603168404Spjd return (0); 1604168404Spjd} 1605168404Spjd 1606168404Spjdint 1607168404Spjdzfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1608168404Spjd{ 1609168404Spjd char *path = buf + len - 1; 1610168404Spjd int error; 1611168404Spjd 1612168404Spjd *path = '\0'; 1613168404Spjd 1614168404Spjd for (;;) { 1615168404Spjd uint64_t pobj; 1616168404Spjd char component[MAXNAMELEN + 2]; 1617168404Spjd size_t complen; 1618168404Spjd int is_xattrdir; 1619168404Spjd 1620168404Spjd if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1621168404Spjd &is_xattrdir)) != 0) 1622168404Spjd break; 1623168404Spjd 1624168404Spjd if (pobj == obj) { 1625168404Spjd if (path[0] != '/') 1626168404Spjd *--path = '/'; 1627168404Spjd break; 1628168404Spjd } 1629168404Spjd 1630168404Spjd component[0] = '/'; 1631168404Spjd if (is_xattrdir) { 1632168404Spjd (void) sprintf(component + 1, "<xattrdir>"); 1633168404Spjd } else { 1634185029Spjd error = zap_value_search(osp, pobj, obj, 1635185029Spjd ZFS_DIRENT_OBJ(-1ULL), component + 1); 1636168404Spjd if (error != 0) 1637168404Spjd break; 1638168404Spjd } 1639168404Spjd 1640168404Spjd complen = strlen(component); 1641168404Spjd path -= complen; 1642168404Spjd ASSERT(path >= buf); 1643168404Spjd bcopy(component, path, complen); 1644168404Spjd obj = pobj; 1645168404Spjd } 1646168404Spjd 1647168404Spjd if (error == 0) 1648168404Spjd (void) memmove(buf, path, buf + len - path); 1649168404Spjd return (error); 1650168404Spjd} 1651