/* zfs_znode.c — FreeBSD ZFS znode management, revision 209962 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#ifdef _KERNEL 29#include <sys/types.h> 30#include <sys/param.h> 31#include <sys/time.h> 32#include <sys/systm.h> 33#include <sys/sysmacros.h> 34#include <sys/resource.h> 35#include <sys/mntent.h> 36#include <sys/u8_textprep.h> 37#include <sys/dsl_dataset.h> 38#include <sys/vfs.h> 39#include <sys/vnode.h> 40#include <sys/file.h> 41#include <sys/kmem.h> 42#include <sys/errno.h> 43#include <sys/unistd.h> 44#include <sys/atomic.h> 45#include <sys/zfs_dir.h> 46#include <sys/zfs_acl.h> 47#include <sys/zfs_ioctl.h> 48#include <sys/zfs_rlock.h> 49#include <sys/zfs_fuid.h> 50#include <sys/fs/zfs.h> 51#include <sys/kidmap.h> 52#endif /* _KERNEL */ 53 54#include <sys/dmu.h> 55#include <sys/refcount.h> 56#include <sys/stat.h> 57#include <sys/zap.h> 58#include <sys/zfs_znode.h> 59#include <sys/refcount.h> 60 61#include "zfs_prop.h" 62 63/* Used by fstat(1). 
 */
SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
    "sizeof(znode_t)");

/*
 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef	DEBUG
#define	ZNODE_STATS
#endif	/* DEBUG */

#ifdef	ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif	/* ZNODE_STATS */

/*
 * A znode's z_zfsvfs pointer doubles as a validity flag: setting the low
 * bit marks the znode as invalid/not-in-use (see zfs_znode_move()).
 */
#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))

/*
 * Functions needed for userland (ie: libzpool) are not put under
 * #ifdef_KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL
static kmem_cache_t *znode_cache = NULL;

/*
 * Eviction callback installed on the znode's bonus dbuf.  On this platform
 * it must never fire: all dbuf references are dropped only after the
 * callback has been cleared, so reaching this function is a fatal bug.
 */
/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
#if 1 /* XXXPJD: From OpenSolaris. */
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
#else /* XXXPJD */
	/* Dead alternative: tear down the vnode/znode pair on eviction. */
	znode_t *zp = user_ptr;
	vnode_t *vp;

	mutex_enter(&zp->z_lock);
	zp->z_dbuf = NULL;
	vp = ZTOV(zp);
	if (vp == NULL) {
		mutex_exit(&zp->z_lock);
		zfs_znode_free(zp);
	} else if (vp->v_count == 0) {
		zp->z_vnode = NULL;
		vhold(vp);
		mutex_exit(&zp->z_lock);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
		vrecycle(vp, curthread);
		VOP_UNLOCK(vp, 0);
		vdrop(vp);
		zfs_znode_free(zp);
	} else {
		mutex_exit(&zp->z_lock);
	}
#endif
}

extern struct vop_vector zfs_vnodeops;
extern struct vop_vector zfs_fifoops;
extern struct vop_vector zfs_shareops;

/*
 * XXX: We cannot use this function as a cache constructor, because
 * there is one global cache for all file systems and we need
 * to pass vfsp here, which is not possible, because argument
 * 'cdrarg' is defined at kmem_cache_create() time.
 *
 * Called manually after kmem_cache_alloc() instead; vfsp == NULL skips
 * vnode allocation (used for the pseudo "share" znode).  Returns -1 only
 * for a failed non-sleeping vnode allocation, 0 otherwise.
 */
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;
	vnode_t *vp;
	vfs_t *vfsp = arg;
	int error;

	POINTER_INVALIDATE(&zp->z_zfsvfs);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	if (vfsp != NULL) {
		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
		if (error != 0 && (kmflags & KM_NOSLEEP))
			return (-1);
		ASSERT(error == 0);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		zp->z_vnode = vp;
		vp->v_data = (caddr_t)zp;
		VN_LOCK_AREC(vp);
	} else {
		zp->z_vnode = NULL;
	}

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	zp->z_dbuf = NULL;
	zp->z_dirlocks = NULL;
	return (0);
}

/*
 * Cache destructor: tear down the locks/AVL tree set up by the
 * constructor.  The znode must already be fully detached (no vnode,
 * no dbuf, not on the per-fs znode list).
 */
/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp) == NULL);
	/*
	 * NOTE(review): ZTOV(zp) was just asserted NULL, so this vn_free()
	 * call appears to be a no-op at best — confirm it is intentional.
	 */
	vn_free(ZTOV(zp));
	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	avl_destroy(&zp->z_range_avl);
	mutex_destroy(&zp->z_range_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_dirlocks == NULL);
}

#ifdef ZNODE_STATS
/* Counters for the reasons zfs_znode_move() declines or defers a move. */
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck_invalid;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */

#if defined(sun)
/*
 * Copy the contents of the old znode into the kmem-relocated new buffer
 * and repoint the vnode/dbuf back-pointers at it.  Caller holds all the
 * locks taken by zfs_znode_move().
 */
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL);	/* znode not in use */
	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_last_itx = ozp->z_last_itx;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_phys = ozp->z_phys;
	nzp->z_dbuf = ozp->z_dbuf;

	/* Update back pointers. */
	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
	    znode_evict_error);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_dbuf = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);
}
/*
 * kmem move callback: relocate a znode to a new cache buffer if it is
 * safe to do so.  Returns KMEM_CBRC_YES on success, _LATER when the znode
 * is busy, and _DONT_KNOW when the buffer cannot be identified as a live
 * znode.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the filesystem is not unmounted during the move.
	 * This is the equivalent to ZFS_ENTER().
	 */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
#endif	/* sun */

/*
 * Create the global znode kmem cache.  The cache constructor is NULL
 * because zfs_znode_cache_constructor() needs a per-filesystem vfsp and
 * is therefore invoked manually after each allocation (see comment there).
 */
void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
#if defined(sun)
	kmem_cache_set_move(znode_cache, zfs_znode_move);
#endif
}

/* Destroy the global znode kmem cache. */
void
zfs_znode_fini(void)
{
	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
}

/*
 * Create the hidden ".zfs/shares" directory object and record its object
 * number in the master node and in zfsvfs->z_shares_dir.  Uses a stack
 * znode/vnode pair that is dismantled again before return; returns the
 * zap_add() error code.
 */
int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	vnode_t *vp, vnode;
	znode_t *zp;
	int error;

	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;

	/* Temporary on-stack vnode; never inserted into the mount queue. */
	sharezp->z_vnode = &vnode;
	vnode.v_data = sharezp;

	vp = ZTOV(sharezp);
	vp->v_type = VDIR;

	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
	    &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	zfs_acl_ids_free(&acl_ids);
	/* Detach the stack vnode and release the dbuf before freeing. */
	ZTOV(sharezp)->v_data = NULL;
	ZTOV(sharezp)->v_count = 0;
	ZTOV(sharezp)->v_holdcnt = 0;
	zp->z_vnode = NULL;
	sharezp->z_vnode = NULL;
	dmu_buf_rele(sharezp->z_dbuf, NULL);
	sharezp->z_dbuf = NULL;
	kmem_cache_free(znode_cache, sharezp);

	return (error);
}
/*
 * define a couple of values we need available
 * for both 64 and 32 bit environments.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef	MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif

/*
 * Create special expldev for ZFS private use.
 * Can't use standard expldev since it doesn't do
 * what we want.  The standard expldev() takes a
 * dev32_t in LP64 and expands it to a long dev_t.
 * We need an interface that takes a dev32_t in ILP32
 * and expands it to a long dev_t.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
}
/*
 * Special cmpldev for ZFS private use.
 * Can't use standard cmpldev since it takes
 * a long dev_t and compresses it to dev32_t in
 * LP64.  We need to do a compaction of a long dev_t
 * to a dev32_t in ILP32.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
}

/*
 * Attach a znode to its bonus dbuf: install zp as the dbuf user with
 * znode_evict_error as the eviction callback, and mark the vnode VROOT
 * if this is the filesystem root.  Caller must hold the per-object mutex.
 */
static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t *nzp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	zp->z_dbuf = db;
	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);

	/*
	 * there should be no
	 * concurrent zgets on this object.
	 */
	if (nzp != NULL)
		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}

/*
 * Detach a znode from its bonus dbuf: clear the dbuf user and release
 * the bonus hold.  Safe when the object mutex is held, the znode is
 * unlinked, or a filesystem teardown is in progress.
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	dmu_buf_t *db = zp->z_dbuf;
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
	ASSERT(zp->z_dbuf != NULL);
	zp->z_dbuf = NULL;
	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
	dmu_buf_rele(db, NULL);
}

/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
{
	znode_t *zp;
	vnode_t *vp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_dbuf == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
	 * the zfs_znode_move() callback.
	 */
	zp->z_phys = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;	/* arbitrary nonzero starting sequence */
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);
#ifdef TODO
	vn_reinit(vp);
#endif

	zfs_znode_dmu_init(zfsvfs, zp, db);

	zp->z_gen = zp->z_phys->zp_gen;

#if 0
	if (vp == NULL)
		return (zp);
#endif

	/* Derive the vnode type (and type-specific vops) from the mode. */
	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VFIFO:
		vp->v_op = &zfs_fifoops;
		break;
	case VREG:
		if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
			vp->v_op = &zfs_shareops;
		}
		break;
	}
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);
	return (zp);
}
599 * 600 * OUT: zpp - allocated znode 601 * 602 */ 603void 604zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 605 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) 606{ 607 dmu_buf_t *db; 608 znode_phys_t *pzp; 609 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 610 timestruc_t now; 611 uint64_t gen, obj; 612 int err; 613 614 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 615 616 if (zfsvfs->z_replay) { 617 obj = vap->va_nodeid; 618 now = vap->va_ctime; /* see zfs_replay_create() */ 619 gen = vap->va_nblocks; /* ditto */ 620 } else { 621 obj = 0; 622 gethrestime(&now); 623 gen = dmu_tx_get_txg(tx); 624 } 625 626 /* 627 * Create a new DMU object. 628 */ 629 /* 630 * There's currently no mechanism for pre-reading the blocks that will 631 * be to needed allocate a new object, so we accept the small chance 632 * that there will be an i/o error and we will fail one of the 633 * assertions below. 634 */ 635 if (vap->va_type == VDIR) { 636 if (zfsvfs->z_replay) { 637 err = zap_create_claim_norm(zfsvfs->z_os, obj, 638 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 639 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 640 ASSERT3U(err, ==, 0); 641 } else { 642 obj = zap_create_norm(zfsvfs->z_os, 643 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 644 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 645 } 646 } else { 647 if (zfsvfs->z_replay) { 648 err = dmu_object_claim(zfsvfs->z_os, obj, 649 DMU_OT_PLAIN_FILE_CONTENTS, 0, 650 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 651 ASSERT3U(err, ==, 0); 652 } else { 653 obj = dmu_object_alloc(zfsvfs->z_os, 654 DMU_OT_PLAIN_FILE_CONTENTS, 0, 655 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 656 } 657 } 658 659 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 660 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 661 dmu_buf_will_dirty(db, tx); 662 663 /* 664 * Initialize the znode physical data to zero. 
665 */ 666 ASSERT(db->db_size >= sizeof (znode_phys_t)); 667 bzero(db->db_data, db->db_size); 668 pzp = db->db_data; 669 670 /* 671 * If this is the root, fix up the half-initialized parent pointer 672 * to reference the just-allocated physical data area. 673 */ 674 if (flag & IS_ROOT_NODE) { 675 dzp->z_dbuf = db; 676 dzp->z_phys = pzp; 677 dzp->z_id = obj; 678 } 679 680 /* 681 * If parent is an xattr, so am I. 682 */ 683 if (dzp->z_phys->zp_flags & ZFS_XATTR) 684 flag |= IS_XATTR; 685 686 if (vap->va_type == VBLK || vap->va_type == VCHR) { 687 pzp->zp_rdev = zfs_expldev(vap->va_rdev); 688 } 689 690 if (zfsvfs->z_use_fuids) 691 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 692 693 if (vap->va_type == VDIR) { 694 pzp->zp_size = 2; /* contents ("." and "..") */ 695 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 696 } 697 698 pzp->zp_parent = dzp->z_id; 699 if (flag & IS_XATTR) 700 pzp->zp_flags |= ZFS_XATTR; 701 702 pzp->zp_gen = gen; 703 704 ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 705 ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 706 707 if (vap->va_mask & AT_ATIME) { 708 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 709 } else { 710 ZFS_TIME_ENCODE(&now, pzp->zp_atime); 711 } 712 713 if (vap->va_mask & AT_MTIME) { 714 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 715 } else { 716 ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 717 } 718 719 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); 720 if (!(flag & IS_ROOT_NODE)) { 721 *zpp = zfs_znode_alloc(zfsvfs, db, 0); 722 } else { 723 /* 724 * If we are creating the root node, the "parent" we 725 * passed in is the znode for the root. 
726 */ 727 *zpp = dzp; 728 } 729 pzp->zp_uid = acl_ids->z_fuid; 730 pzp->zp_gid = acl_ids->z_fgid; 731 pzp->zp_mode = acl_ids->z_mode; 732 VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); 733 if (vap->va_mask & AT_XVATTR) 734 zfs_xvattr_set(*zpp, (xvattr_t *)vap); 735 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 736 if (!(flag & IS_ROOT_NODE)) { 737 vnode_t *vp; 738 739 vp = ZTOV(*zpp); 740 vp->v_vflag |= VV_FORCEINSMQ; 741 err = insmntque(vp, zfsvfs->z_vfs); 742 vp->v_vflag &= ~VV_FORCEINSMQ; 743 KASSERT(err == 0, ("insmntque() failed: error %d", err)); 744 } 745} 746 747void 748zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 749{ 750 xoptattr_t *xoap; 751 752 xoap = xva_getxoptattr(xvap); 753 ASSERT(xoap); 754 755 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 756 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 757 XVA_SET_RTN(xvap, XAT_CREATETIME); 758 } 759 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 760 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 761 XVA_SET_RTN(xvap, XAT_READONLY); 762 } 763 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 764 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 765 XVA_SET_RTN(xvap, XAT_HIDDEN); 766 } 767 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 768 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 769 XVA_SET_RTN(xvap, XAT_SYSTEM); 770 } 771 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 772 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 773 XVA_SET_RTN(xvap, XAT_ARCHIVE); 774 } 775 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 776 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 777 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 778 } 779 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 780 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 781 XVA_SET_RTN(xvap, XAT_NOUNLINK); 782 } 783 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 784 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 785 XVA_SET_RTN(xvap, XAT_APPENDONLY); 786 } 787 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 788 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 789 XVA_SET_RTN(xvap, 
/*
 * Look up (or construct) the znode for object obj_num and return it in
 * *zpp.  Returns 0 on success, ENOENT for unlinked/nonexistent objects,
 * EINVAL for objects without a valid znode bonus buffer, or another DMU
 * error.  A znode whose vnode is being destroyed is waited on and retried.
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;
	vnode_t		*vp;
	int err, first = 1;

	*zpp = NULL;
again:
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	zp = dmu_buf_get_user(db);
	if (zp != NULL) {
		mutex_enter(&zp->z_lock);

		/*
		 * Since we do immediate eviction of the z_dbuf, we
		 * should never find a dbuf with a znode that doesn't
		 * know about the dbuf.
		 */
		ASSERT3P(zp->z_dbuf, ==, db);
		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_unlinked) {
			err = ENOENT;
		} else {
			int dying = 0;

			vp = ZTOV(zp);
			if (vp == NULL)
				dying = 1;
			else {
				VN_HOLD(vp);
				if ((vp->v_iflag & VI_DOOMED) != 0) {
					dying = 1;
					/*
					 * Don't VN_RELE() vnode here, because
					 * it can call vn_lock() which creates
					 * LOR between vnode lock and znode
					 * lock. We will VN_RELE() the vnode
					 * after dropping znode lock.
					 */
				}
			}
			if (dying) {
				if (first) {
					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
					first = 0;
				}
				/*
				 * znode is dying so we can't reuse it, we must
				 * wait until destruction is completed.
				 */
				dmu_buf_rele(db, NULL);
				mutex_exit(&zp->z_lock);
				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
				if (vp != NULL)
					VN_RELE(vp);
				tsleep(zp, 0, "zcollide", 1);
				goto again;
			}
			*zpp = zp;
			err = 0;
		}
		dmu_buf_rele(db, NULL);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Not found create new znode/vnode
	 * but only if file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  Since a gen number can never be zero
	 * we will check that to determine if its an allocated
	 * file.
	 */

	if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
		zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
		*zpp = zp;
		vp = ZTOV(zp);
		vp->v_vflag |= VV_FORCEINSMQ;
		err = insmntque(vp, zfsvfs->z_vfs);
		vp->v_vflag &= ~VV_FORCEINSMQ;
		KASSERT(err == 0, ("insmntque() failed: error %d", err));
		VOP_UNLOCK(vp, 0);
		err = 0;
	} else {
		dmu_buf_rele(db, NULL);
		err = ENOENT;
	}
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
	return (err);
}

/*
 * Re-attach an existing in-core znode to its on-disk object after a
 * filesystem teardown/rollback.  Returns EIO if the object's generation
 * no longer matches (the file was replaced), EINVAL for a bad bonus
 * buffer, or a DMU error.
 */
int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	int err;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EIO);
	}

	zfs_znode_dmu_init(zfsvfs, zp, db);
	zp->z_unlinked = (zp->z_phys->zp_links == 0);
	zp->z_blksz = doi.doi_data_block_size;

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return (0);
}

/*
 * Free the DMU object backing a znode (and its external ACL object, if
 * any), then detach and free the in-core znode.
 */
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	if (acl_obj)
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	zfs_znode_free(zp);
}

/*
 * Called on last vnode inactivation: if the file has been unlinked and
 * holds no remaining references, recycle the vnode and remove the file
 * from the filesystem; otherwise leave the znode cached.
 */
void
zfs_zinactive(znode_t *zp)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;
	int vfslocked;

	ASSERT(zp->z_dbuf && zp->z_phys);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);
	VI_LOCK(vp);
	if (vp->v_count > 0) {
		/*
		 * If the hold count is greater than zero, somebody has
		 * obtained a new reference on this znode while we were
		 * processing it here, so we are done.
		 */
		VI_UNLOCK(vp);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		return;
	}
	VI_UNLOCK(vp);

	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_unlinked) {
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		ASSERT(vp->v_count == 0);
		vrecycle(vp, curthread);
		vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
		zfs_rmnode(zp);
		VFS_UNLOCK_GIANT(vfslocked);
		return;
	}
	mutex_exit(&zp->z_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
}
/*
 * Remove a detached znode (no vnode, no dbuf) from the filesystem's
 * znode list, return it to the kmem cache, and drop the VFS hold taken
 * in zfs_znode_alloc().
 */
void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(ZTOV(zp) == NULL);
	mutex_enter(&zfsvfs->z_znodes_lock);
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	kmem_cache_free(znode_cache, zp);

	VFS_RELE(zfsvfs->z_vfs);
}

/*
 * Encode the current time into the timestamps selected by flag
 * (AT_ATIME/AT_MTIME/AT_CTIME).  With a transaction, the dbuf is dirtied
 * immediately; without one, only z_atime_dirty is set so the update is
 * flushed later.  Caller must hold zp->z_lock.
 */
void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	timestruc_t	now;

	ASSERT(MUTEX_HELD(&zp->z_lock));

	gethrestime(&now);

	if (tx) {
		dmu_buf_will_dirty(zp->z_dbuf, tx);
		zp->z_atime_dirty = 0;
		zp->z_seq++;
	} else {
		zp->z_atime_dirty = 1;
	}

	if (flag & AT_ATIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

	if (flag & AT_MTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
	}

	if (flag & AT_CTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
	}
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *	znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	mutex_enter(&zp->z_lock);
	zfs_time_stamper_locked(zp, flag, tx);
	mutex_exit(&zp->z_lock);
}

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow.  If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
	    size, 0, tx);
	if (error == ENOTSUP)
		return;
	ASSERT3U(error, ==, 0);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
}
/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	rl_t *rl;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
		} else {
			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/* On ERESTART, wait for the txg and retry the assignment. */
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_phys->zp_size = end;

	zfs_range_unlock(rl);

	dmu_tx_commit(tx);

	/* Tell the VM system about the new file size. */
	vnode_pager_setsize(ZTOV(zp), end);

	return (0);
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	rl_t *rl;
	int error;

	/*
	 * Lock the range being freed.
	 */
	rl = zfs_range_lock(zp, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	if (off + len > zp->z_phys->zp_size)
		len = zp->z_phys->zp_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	if (error == 0) {
		/*
		 * In FreeBSD we cannot free block in the middle of a file,
		 * but only at the end of a file.
		 */
		vnode_pager_setsize(ZTOV(zp), off);
	}

	zfs_range_unlock(rl);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	rl_t *rl;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	/* Free everything past the new EOF before updating zp_size. */
	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
	if (error) {
		zfs_range_unlock(rl);
		return (error);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	zp->z_phys->zp_size = end;

	dmu_tx_commit(tx);

	/*
	 * Clear any mapped pages in the truncated region.  This has to
	 * happen outside of the transaction to avoid the possibility of
	 * a deadlock with someone trying to push a page that we are
	 * about to invalidate.
	 */
	vnode_pager_setsize(vp, end);

	zfs_range_unlock(rl);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
1335 * log - TRUE if this action should be logged 1336 * 1337 * RETURN: 0 if success 1338 * error code if failure 1339 */ 1340int 1341zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1342{ 1343 vnode_t *vp = ZTOV(zp); 1344 dmu_tx_t *tx; 1345 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1346 zilog_t *zilog = zfsvfs->z_log; 1347 int error; 1348 1349 if (off > zp->z_phys->zp_size) { 1350 error = zfs_extend(zp, off+len); 1351 if (error == 0 && log) 1352 goto log; 1353 else 1354 return (error); 1355 } 1356 1357 if (len == 0) { 1358 error = zfs_trunc(zp, off); 1359 } else { 1360 if ((error = zfs_free_range(zp, off, len)) == 0 && 1361 off + len > zp->z_phys->zp_size) 1362 error = zfs_extend(zp, off+len); 1363 } 1364 if (error || !log) 1365 return (error); 1366log: 1367 tx = dmu_tx_create(zfsvfs->z_os); 1368 dmu_tx_hold_bonus(tx, zp->z_id); 1369 error = dmu_tx_assign(tx, TXG_NOWAIT); 1370 if (error) { 1371 if (error == ERESTART) { 1372 dmu_tx_wait(tx); 1373 dmu_tx_abort(tx); 1374 goto log; 1375 } 1376 dmu_tx_abort(tx); 1377 return (error); 1378 } 1379 1380 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1381 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1382 1383 dmu_tx_commit(tx); 1384 return (0); 1385} 1386 1387void 1388zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1389{ 1390 zfsvfs_t zfsvfs; 1391 uint64_t moid, obj, version; 1392 uint64_t sense = ZFS_CASE_SENSITIVE; 1393 uint64_t norm = 0; 1394 nvpair_t *elem; 1395 int error; 1396 int i; 1397 znode_t *rootzp = NULL; 1398 vnode_t vnode; 1399 vattr_t vattr; 1400 znode_t *zp; 1401 zfs_acl_ids_t acl_ids; 1402 1403 /* 1404 * First attempt to create master node. 1405 */ 1406 /* 1407 * In an empty objset, there are no blocks to read and thus 1408 * there can be no i/o errors (which we assert below). 
1409 */ 1410 moid = MASTER_NODE_OBJ; 1411 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1412 DMU_OT_NONE, 0, tx); 1413 ASSERT(error == 0); 1414 1415 /* 1416 * Set starting attributes. 1417 */ 1418 if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) 1419 version = ZPL_VERSION; 1420 else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1421 version = ZPL_VERSION_USERSPACE - 1; 1422 else 1423 version = ZPL_VERSION_FUID - 1; 1424 elem = NULL; 1425 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1426 /* For the moment we expect all zpl props to be uint64_ts */ 1427 uint64_t val; 1428 char *name; 1429 1430 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1431 VERIFY(nvpair_value_uint64(elem, &val) == 0); 1432 name = nvpair_name(elem); 1433 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1434 if (val < version) 1435 version = val; 1436 } else { 1437 error = zap_update(os, moid, name, 8, 1, &val, tx); 1438 } 1439 ASSERT(error == 0); 1440 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1441 norm = val; 1442 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1443 sense = val; 1444 } 1445 ASSERT(version != 0); 1446 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); 1447 1448 /* 1449 * Create a delete queue. 1450 */ 1451 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1452 1453 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); 1454 ASSERT(error == 0); 1455 1456 /* 1457 * Create root znode. Create minimal znode/vnode/zfsvfs 1458 * to allow zfs_mknode to work. 
1459 */ 1460 VATTR_NULL(&vattr); 1461 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1462 vattr.va_type = VDIR; 1463 vattr.va_mode = S_IFDIR|0755; 1464 vattr.va_uid = crgetuid(cr); 1465 vattr.va_gid = crgetgid(cr); 1466 1467 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1468 zfs_znode_cache_constructor(rootzp, NULL, 0); 1469 rootzp->z_unlinked = 0; 1470 rootzp->z_atime_dirty = 0; 1471 1472 vnode.v_type = VDIR; 1473 vnode.v_data = rootzp; 1474 rootzp->z_vnode = &vnode; 1475 1476 bzero(&zfsvfs, sizeof (zfsvfs_t)); 1477 1478 zfsvfs.z_os = os; 1479 zfsvfs.z_parent = &zfsvfs; 1480 zfsvfs.z_version = version; 1481 zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1482 zfsvfs.z_norm = norm; 1483 /* 1484 * Fold case on file systems that are always or sometimes case 1485 * insensitive. 1486 */ 1487 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1488 zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1489 1490 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1491 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1492 offsetof(znode_t, z_link_node)); 1493 1494 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1495 mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1496 1497 ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1498 rootzp->z_zfsvfs = &zfsvfs; 1499 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1500 cr, NULL, &acl_ids)); 1501 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); 1502 ASSERT3P(zp, ==, rootzp); 1503 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1504 ASSERT(error == 0); 1505 zfs_acl_ids_free(&acl_ids); 1506 POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1507 1508 dmu_buf_rele(rootzp->z_dbuf, NULL); 1509 rootzp->z_dbuf = NULL; 1510 rootzp->z_vnode = NULL; 1511 kmem_cache_free(znode_cache, rootzp); 1512 1513 /* 1514 * Create shares directory 1515 */ 1516 1517 error = zfs_create_share_dir(&zfsvfs, tx); 1518 1519 ASSERT(error == 0); 1520 1521 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1522 
mutex_destroy(&zfsvfs.z_hold_mtx[i]); 1523} 1524 1525#endif /* _KERNEL */ 1526/* 1527 * Given an object number, return its parent object number and whether 1528 * or not the object is an extended attribute directory. 1529 */ 1530static int 1531zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1532{ 1533 dmu_buf_t *db; 1534 dmu_object_info_t doi; 1535 znode_phys_t *zp; 1536 int error; 1537 1538 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1539 return (error); 1540 1541 dmu_object_info_from_db(db, &doi); 1542 if (doi.doi_bonus_type != DMU_OT_ZNODE || 1543 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1544 dmu_buf_rele(db, FTAG); 1545 return (EINVAL); 1546 } 1547 1548 zp = db->db_data; 1549 *pobjp = zp->zp_parent; 1550 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1551 S_ISDIR(zp->zp_mode); 1552 dmu_buf_rele(db, FTAG); 1553 1554 return (0); 1555} 1556 1557int 1558zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1559{ 1560 char *path = buf + len - 1; 1561 int error; 1562 1563 *path = '\0'; 1564 1565 for (;;) { 1566 uint64_t pobj; 1567 char component[MAXNAMELEN + 2]; 1568 size_t complen; 1569 int is_xattrdir; 1570 1571 if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1572 &is_xattrdir)) != 0) 1573 break; 1574 1575 if (pobj == obj) { 1576 if (path[0] != '/') 1577 *--path = '/'; 1578 break; 1579 } 1580 1581 component[0] = '/'; 1582 if (is_xattrdir) { 1583 (void) sprintf(component + 1, "<xattrdir>"); 1584 } else { 1585 error = zap_value_search(osp, pobj, obj, 1586 ZFS_DIRENT_OBJ(-1ULL), component + 1); 1587 if (error != 0) 1588 break; 1589 } 1590 1591 complen = strlen(component); 1592 path -= complen; 1593 ASSERT(path >= buf); 1594 bcopy(component, path, complen); 1595 obj = pobj; 1596 } 1597 1598 if (error == 0) 1599 (void) memmove(buf, path, buf + len - path); 1600 return (error); 1601} 1602