zfs_znode.c revision 197458
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#ifdef _KERNEL 29#include <sys/types.h> 30#include <sys/param.h> 31#include <sys/time.h> 32#include <sys/systm.h> 33#include <sys/sysmacros.h> 34#include <sys/resource.h> 35#include <sys/mntent.h> 36#include <sys/u8_textprep.h> 37#include <sys/dsl_dataset.h> 38#include <sys/vfs.h> 39#include <sys/vnode.h> 40#include <sys/file.h> 41#include <sys/kmem.h> 42#include <sys/errno.h> 43#include <sys/unistd.h> 44#include <sys/atomic.h> 45#include <sys/zfs_dir.h> 46#include <sys/zfs_acl.h> 47#include <sys/zfs_ioctl.h> 48#include <sys/zfs_rlock.h> 49#include <sys/zfs_fuid.h> 50#include <sys/fs/zfs.h> 51#include <sys/kidmap.h> 52#endif /* _KERNEL */ 53 54#include <sys/dmu.h> 55#include <sys/refcount.h> 56#include <sys/stat.h> 57#include <sys/zap.h> 58#include <sys/zfs_znode.h> 59#include <sys/refcount.h> 60 61#include "zfs_prop.h" 62 63/* Used by fstat(1). 
*/ 64SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), 65 "sizeof(znode_t)"); 66 67/* 68 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 69 * turned on when DEBUG is also defined. 70 */ 71#ifdef DEBUG 72#define ZNODE_STATS 73#endif /* DEBUG */ 74 75#ifdef ZNODE_STATS 76#define ZNODE_STAT_ADD(stat) ((stat)++) 77#else 78#define ZNODE_STAT_ADD(stat) /* nothing */ 79#endif /* ZNODE_STATS */ 80 81#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) 82#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) 83 84/* 85 * Functions needed for userland (ie: libzpool) are not put under 86 * #ifdef_KERNEL; the rest of the functions have dependencies 87 * (such as VFS logic) that will not compile easily in userland. 88 */ 89#ifdef _KERNEL 90static kmem_cache_t *znode_cache = NULL; 91 92/*ARGSUSED*/ 93static void 94znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) 95{ 96#if 1 /* XXXPJD: From OpenSolaris. */ 97 /* 98 * We should never drop all dbuf refs without first clearing 99 * the eviction callback. 
100 */ 101 panic("evicting znode %p\n", user_ptr); 102#else /* XXXPJD */ 103 znode_t *zp = user_ptr; 104 vnode_t *vp; 105 106 mutex_enter(&zp->z_lock); 107 zp->z_dbuf = NULL; 108 vp = ZTOV(zp); 109 if (vp == NULL) { 110 mutex_exit(&zp->z_lock); 111 zfs_znode_free(zp); 112 } else if (vp->v_count == 0) { 113 zp->z_vnode = NULL; 114 vhold(vp); 115 mutex_exit(&zp->z_lock); 116 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); 117 vrecycle(vp, curthread); 118 VOP_UNLOCK(vp, 0); 119 vdrop(vp); 120 zfs_znode_free(zp); 121 } else { 122 mutex_exit(&zp->z_lock); 123 } 124#endif 125} 126 127extern struct vop_vector zfs_vnodeops; 128extern struct vop_vector zfs_fifoops; 129 130/* 131 * XXX: We cannot use this function as a cache constructor, because 132 * there is one global cache for all file systems and we need 133 * to pass vfsp here, which is not possible, because argument 134 * 'cdrarg' is defined at kmem_cache_create() time. 135 */ 136static int 137zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 138{ 139 znode_t *zp = buf; 140 vnode_t *vp; 141 vfs_t *vfsp = arg; 142 int error; 143 144 POINTER_INVALIDATE(&zp->z_zfsvfs); 145 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 146 ASSERT(vfsp != NULL); 147 148 error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); 149 if (error != 0 && (kmflags & KM_NOSLEEP)) 150 return (-1); 151 ASSERT(error == 0); 152 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 153 zp->z_vnode = vp; 154 vp->v_data = (caddr_t)zp; 155 VN_LOCK_AREC(vp); 156 157 list_link_init(&zp->z_link_node); 158 159 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 160 rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); 161 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 162 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); 163 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 164 165 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); 166 avl_create(&zp->z_range_avl, zfs_range_compare, 167 sizeof (rl_t), offsetof(rl_t, r_node)); 168 169 zp->z_dbuf = NULL; 
170 zp->z_dirlocks = NULL; 171 return (0); 172} 173 174/*ARGSUSED*/ 175static void 176zfs_znode_cache_destructor(void *buf, void *arg) 177{ 178 znode_t *zp = buf; 179 180 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 181 ASSERT(ZTOV(zp) == NULL); 182 vn_free(ZTOV(zp)); 183 ASSERT(!list_link_active(&zp->z_link_node)); 184 mutex_destroy(&zp->z_lock); 185 rw_destroy(&zp->z_map_lock); 186 rw_destroy(&zp->z_parent_lock); 187 rw_destroy(&zp->z_name_lock); 188 mutex_destroy(&zp->z_acl_lock); 189 avl_destroy(&zp->z_range_avl); 190 mutex_destroy(&zp->z_range_lock); 191 192 ASSERT(zp->z_dbuf == NULL); 193 ASSERT(zp->z_dirlocks == NULL); 194} 195 196#ifdef ZNODE_STATS 197static struct { 198 uint64_t zms_zfsvfs_invalid; 199 uint64_t zms_zfsvfs_unmounted; 200 uint64_t zms_zfsvfs_recheck_invalid; 201 uint64_t zms_obj_held; 202 uint64_t zms_vnode_locked; 203 uint64_t zms_not_only_dnlc; 204} znode_move_stats; 205#endif /* ZNODE_STATS */ 206 207#if defined(sun) 208static void 209zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) 210{ 211 vnode_t *vp; 212 213 /* Copy fields. */ 214 nzp->z_zfsvfs = ozp->z_zfsvfs; 215 216 /* Swap vnodes. */ 217 vp = nzp->z_vnode; 218 nzp->z_vnode = ozp->z_vnode; 219 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ 220 ZTOV(ozp)->v_data = ozp; 221 ZTOV(nzp)->v_data = nzp; 222 223 nzp->z_id = ozp->z_id; 224 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ 225 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); 226 nzp->z_unlinked = ozp->z_unlinked; 227 nzp->z_atime_dirty = ozp->z_atime_dirty; 228 nzp->z_zn_prefetch = ozp->z_zn_prefetch; 229 nzp->z_blksz = ozp->z_blksz; 230 nzp->z_seq = ozp->z_seq; 231 nzp->z_mapcnt = ozp->z_mapcnt; 232 nzp->z_last_itx = ozp->z_last_itx; 233 nzp->z_gen = ozp->z_gen; 234 nzp->z_sync_cnt = ozp->z_sync_cnt; 235 nzp->z_phys = ozp->z_phys; 236 nzp->z_dbuf = ozp->z_dbuf; 237 238 /* Update back pointers. 
*/ 239 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, 240 znode_evict_error); 241 242 /* 243 * Invalidate the original znode by clearing fields that provide a 244 * pointer back to the znode. Set the low bit of the vfs pointer to 245 * ensure that zfs_znode_move() recognizes the znode as invalid in any 246 * subsequent callback. 247 */ 248 ozp->z_dbuf = NULL; 249 POINTER_INVALIDATE(&ozp->z_zfsvfs); 250} 251 252/* 253 * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise 254 * returns a non-zero error code. 255 */ 256static int 257zfs_enter(zfsvfs_t *zfsvfs) 258{ 259 ZFS_ENTER(zfsvfs); 260 return (0); 261} 262 263/*ARGSUSED*/ 264static kmem_cbrc_t 265zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) 266{ 267 znode_t *ozp = buf, *nzp = newbuf; 268 zfsvfs_t *zfsvfs; 269 vnode_t *vp; 270 271 /* 272 * The znode is on the file system's list of known znodes if the vfs 273 * pointer is valid. We set the low bit of the vfs pointer when freeing 274 * the znode to invalidate it, and the memory patterns written by kmem 275 * (baddcafe and deadbeef) set at least one of the two low bits. A newly 276 * created znode sets the vfs pointer last of all to indicate that the 277 * znode is known and in a valid state to be moved by this function. 278 */ 279 zfsvfs = ozp->z_zfsvfs; 280 if (!POINTER_IS_VALID(zfsvfs)) { 281 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); 282 return (KMEM_CBRC_DONT_KNOW); 283 } 284 285 /* 286 * Ensure that the filesystem is not unmounted during the move. 287 */ 288 if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ 289 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); 290 return (KMEM_CBRC_DONT_KNOW); 291 } 292 293 mutex_enter(&zfsvfs->z_znodes_lock); 294 /* 295 * Recheck the vfs pointer in case the znode was removed just before 296 * acquiring the lock. 
297 */ 298 if (zfsvfs != ozp->z_zfsvfs) { 299 mutex_exit(&zfsvfs->z_znodes_lock); 300 ZFS_EXIT(zfsvfs); 301 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); 302 return (KMEM_CBRC_DONT_KNOW); 303 } 304 305 /* 306 * At this point we know that as long as we hold z_znodes_lock, the 307 * znode cannot be freed and fields within the znode can be safely 308 * accessed. Now, prevent a race with zfs_zget(). 309 */ 310 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { 311 mutex_exit(&zfsvfs->z_znodes_lock); 312 ZFS_EXIT(zfsvfs); 313 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); 314 return (KMEM_CBRC_LATER); 315 } 316 317 vp = ZTOV(ozp); 318 if (mutex_tryenter(&vp->v_lock) == 0) { 319 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 320 mutex_exit(&zfsvfs->z_znodes_lock); 321 ZFS_EXIT(zfsvfs); 322 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); 323 return (KMEM_CBRC_LATER); 324 } 325 326 /* Only move znodes that are referenced _only_ by the DNLC. */ 327 if (vp->v_count != 1 || !vn_in_dnlc(vp)) { 328 mutex_exit(&vp->v_lock); 329 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 330 mutex_exit(&zfsvfs->z_znodes_lock); 331 ZFS_EXIT(zfsvfs); 332 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); 333 return (KMEM_CBRC_LATER); 334 } 335 336 /* 337 * The znode is known and in a valid state to move. We're holding the 338 * locks needed to execute the critical section. 
339 */ 340 zfs_znode_move_impl(ozp, nzp); 341 mutex_exit(&vp->v_lock); 342 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 343 344 list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 345 mutex_exit(&zfsvfs->z_znodes_lock); 346 ZFS_EXIT(zfsvfs); 347 348 return (KMEM_CBRC_YES); 349} 350#endif /* sun */ 351 352void 353zfs_znode_init(void) 354{ 355 /* 356 * Initialize zcache 357 */ 358 ASSERT(znode_cache == NULL); 359 znode_cache = kmem_cache_create("zfs_znode_cache", 360 sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, 361 zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 362#if defined(sun) 363 kmem_cache_set_move(znode_cache, zfs_znode_move); 364#endif 365} 366 367void 368zfs_znode_fini(void) 369{ 370 /* 371 * Cleanup zcache 372 */ 373 if (znode_cache) 374 kmem_cache_destroy(znode_cache); 375 znode_cache = NULL; 376} 377 378/* 379 * zfs_init_fs - Initialize the zfsvfs struct and the file system 380 * incore "master" object. Verify version compatibility. 381 */ 382int 383zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) 384{ 385 objset_t *os = zfsvfs->z_os; 386 int i, error; 387 uint64_t fsid_guid; 388 uint64_t zval; 389 390 *zpp = NULL; 391 392 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 393 if (error) { 394 return (error); 395 } else if (zfsvfs->z_version > ZPL_VERSION) { 396 (void) printf("Mismatched versions: File system " 397 "is version %llu on-disk format, which is " 398 "incompatible with this software version %lld!", 399 (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); 400 return (ENOTSUP); 401 } 402 403 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 404 return (error); 405 zfsvfs->z_norm = (int)zval; 406 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 407 return (error); 408 zfsvfs->z_utf8 = (zval != 0); 409 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 410 return (error); 411 zfsvfs->z_case = (uint_t)zval; 412 /* 413 * Fold case on file systems that are always or sometimes case 
414 * insensitive. 415 */ 416 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 417 zfsvfs->z_case == ZFS_CASE_MIXED) 418 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 419 420 /* 421 * The fsid is 64 bits, composed of an 8-bit fs type, which 422 * separates our fsid from any other filesystem types, and a 423 * 56-bit objset unique ID. The objset unique ID is unique to 424 * all objsets open on this system, provided by unique_create(). 425 * The 8-bit fs type must be put in the low bits of fsid[1] 426 * because that's where other Solaris filesystems put it. 427 */ 428 fsid_guid = dmu_objset_fsid_guid(os); 429 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 430 zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; 431 zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 432 zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; 433 434 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 435 &zfsvfs->z_root); 436 if (error) 437 return (error); 438 ASSERT(zfsvfs->z_root != 0); 439 440 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 441 &zfsvfs->z_unlinkedobj); 442 if (error) 443 return (error); 444 445 /* 446 * Initialize zget mutex's 447 */ 448 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 449 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 450 451 error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); 452 if (error) { 453 /* 454 * On error, we destroy the mutexes here since it's not 455 * possible for the caller to determine if the mutexes were 456 * initialized properly. 457 */ 458 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 459 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 460 return (error); 461 } 462 ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); 463 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 464 &zfsvfs->z_fuid_obj); 465 if (error == ENOENT) 466 error = 0; 467 468 return (0); 469} 470 471/* 472 * define a couple of values we need available 473 * for both 64 and 32 bit environments. 
474 */ 475#ifndef NBITSMINOR64 476#define NBITSMINOR64 32 477#endif 478#ifndef MAXMAJ64 479#define MAXMAJ64 0xffffffffUL 480#endif 481#ifndef MAXMIN64 482#define MAXMIN64 0xffffffffUL 483#endif 484 485/* 486 * Create special expldev for ZFS private use. 487 * Can't use standard expldev since it doesn't do 488 * what we want. The standard expldev() takes a 489 * dev32_t in LP64 and expands it to a long dev_t. 490 * We need an interface that takes a dev32_t in ILP32 491 * and expands it to a long dev_t. 492 */ 493static uint64_t 494zfs_expldev(dev_t dev) 495{ 496 return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); 497} 498/* 499 * Special cmpldev for ZFS private use. 500 * Can't use standard cmpldev since it takes 501 * a long dev_t and compresses it to dev32_t in 502 * LP64. We need to do a compaction of a long dev_t 503 * to a dev32_t in ILP32. 504 */ 505dev_t 506zfs_cmpldev(uint64_t dev) 507{ 508 return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); 509} 510 511static void 512zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) 513{ 514 znode_t *nzp; 515 516 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 517 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 518 519 mutex_enter(&zp->z_lock); 520 521 ASSERT(zp->z_dbuf == NULL); 522 zp->z_dbuf = db; 523 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); 524 525 /* 526 * there should be no 527 * concurrent zgets on this object. 
528 */ 529 if (nzp != NULL) 530 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); 531 532 /* 533 * Slap on VROOT if we are the root znode 534 */ 535 if (zp->z_id == zfsvfs->z_root) 536 ZTOV(zp)->v_flag |= VROOT; 537 538 mutex_exit(&zp->z_lock); 539 vn_exists(ZTOV(zp)); 540} 541 542void 543zfs_znode_dmu_fini(znode_t *zp) 544{ 545 dmu_buf_t *db = zp->z_dbuf; 546 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 547 zp->z_unlinked || 548 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 549 ASSERT(zp->z_dbuf != NULL); 550 zp->z_dbuf = NULL; 551 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); 552 dmu_buf_rele(db, NULL); 553} 554 555/* 556 * Construct a new znode/vnode and intialize. 557 * 558 * This does not do a call to dmu_set_user() that is 559 * up to the caller to do, in case you don't want to 560 * return the znode 561 */ 562static znode_t * 563zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) 564{ 565 znode_t *zp; 566 vnode_t *vp; 567 568 zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 569 zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0); 570 571 ASSERT(zp->z_dirlocks == NULL); 572 ASSERT(zp->z_dbuf == NULL); 573 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 574 575 /* 576 * Defer setting z_zfsvfs until the znode is ready to be a candidate for 577 * the zfs_znode_move() callback. 
578 */ 579 zp->z_phys = NULL; 580 zp->z_unlinked = 0; 581 zp->z_atime_dirty = 0; 582 zp->z_mapcnt = 0; 583 zp->z_last_itx = 0; 584 zp->z_id = db->db_object; 585 zp->z_blksz = blksz; 586 zp->z_seq = 0x7A4653; 587 zp->z_sync_cnt = 0; 588 589 vp = ZTOV(zp); 590#ifdef TODO 591 vn_reinit(vp); 592#endif 593 594 zfs_znode_dmu_init(zfsvfs, zp, db); 595 596 zp->z_gen = zp->z_phys->zp_gen; 597 598#if 0 599 if (vp == NULL) 600 return (zp); 601#endif 602 603 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); 604 switch (vp->v_type) { 605 case VDIR: 606 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 607 break; 608 case VFIFO: 609 vp->v_op = &zfs_fifoops; 610 break; 611 } 612 if (vp->v_type != VFIFO) 613 VN_LOCK_ASHARE(vp); 614 615 mutex_enter(&zfsvfs->z_znodes_lock); 616 list_insert_tail(&zfsvfs->z_all_znodes, zp); 617 membar_producer(); 618 /* 619 * Everything else must be valid before assigning z_zfsvfs makes the 620 * znode eligible for zfs_znode_move(). 621 */ 622 zp->z_zfsvfs = zfsvfs; 623 mutex_exit(&zfsvfs->z_znodes_lock); 624 625 VFS_HOLD(zfsvfs->z_vfs); 626 return (zp); 627} 628 629/* 630 * Create a new DMU object to hold a zfs znode. 631 * 632 * IN: dzp - parent directory for new znode 633 * vap - file attributes for new znode 634 * tx - dmu transaction id for zap operations 635 * cr - credentials of caller 636 * flag - flags: 637 * IS_ROOT_NODE - new object will be root 638 * IS_XATTR - new object is an attribute 639 * IS_REPLAY - intent log replay 640 * bonuslen - length of bonus buffer 641 * setaclp - File/Dir initial ACL 642 * fuidp - Tracks fuid allocation. 
643 * 644 * OUT: zpp - allocated znode 645 * 646 */ 647void 648zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 649 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, 650 zfs_fuid_info_t **fuidp) 651{ 652 dmu_buf_t *db; 653 znode_phys_t *pzp; 654 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 655 timestruc_t now; 656 uint64_t gen, obj; 657 int err; 658 659 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 660 661 if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ 662 obj = vap->va_nodeid; 663 flag |= IS_REPLAY; 664 now = vap->va_ctime; /* see zfs_replay_create() */ 665 gen = vap->va_nblocks; /* ditto */ 666 } else { 667 obj = 0; 668 gethrestime(&now); 669 gen = dmu_tx_get_txg(tx); 670 } 671 672 /* 673 * Create a new DMU object. 674 */ 675 /* 676 * There's currently no mechanism for pre-reading the blocks that will 677 * be to needed allocate a new object, so we accept the small chance 678 * that there will be an i/o error and we will fail one of the 679 * assertions below. 680 */ 681 if (vap->va_type == VDIR) { 682 if (flag & IS_REPLAY) { 683 err = zap_create_claim_norm(zfsvfs->z_os, obj, 684 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 685 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 686 ASSERT3U(err, ==, 0); 687 } else { 688 obj = zap_create_norm(zfsvfs->z_os, 689 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 690 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 691 } 692 } else { 693 if (flag & IS_REPLAY) { 694 err = dmu_object_claim(zfsvfs->z_os, obj, 695 DMU_OT_PLAIN_FILE_CONTENTS, 0, 696 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 697 ASSERT3U(err, ==, 0); 698 } else { 699 obj = dmu_object_alloc(zfsvfs->z_os, 700 DMU_OT_PLAIN_FILE_CONTENTS, 0, 701 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 702 } 703 } 704 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 705 dmu_buf_will_dirty(db, tx); 706 707 /* 708 * Initialize the znode physical data to zero. 
709 */ 710 ASSERT(db->db_size >= sizeof (znode_phys_t)); 711 bzero(db->db_data, db->db_size); 712 pzp = db->db_data; 713 714 /* 715 * If this is the root, fix up the half-initialized parent pointer 716 * to reference the just-allocated physical data area. 717 */ 718 if (flag & IS_ROOT_NODE) { 719 dzp->z_dbuf = db; 720 dzp->z_phys = pzp; 721 dzp->z_id = obj; 722 } 723 724 /* 725 * If parent is an xattr, so am I. 726 */ 727 if (dzp->z_phys->zp_flags & ZFS_XATTR) 728 flag |= IS_XATTR; 729 730 if (vap->va_type == VBLK || vap->va_type == VCHR) { 731 pzp->zp_rdev = zfs_expldev(vap->va_rdev); 732 } 733 734 if (zfsvfs->z_use_fuids) 735 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 736 737 if (vap->va_type == VDIR) { 738 pzp->zp_size = 2; /* contents ("." and "..") */ 739 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 740 } 741 742 pzp->zp_parent = dzp->z_id; 743 if (flag & IS_XATTR) 744 pzp->zp_flags |= ZFS_XATTR; 745 746 pzp->zp_gen = gen; 747 748 ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 749 ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 750 751 if (vap->va_mask & AT_ATIME) { 752 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 753 } else { 754 ZFS_TIME_ENCODE(&now, pzp->zp_atime); 755 } 756 757 if (vap->va_mask & AT_MTIME) { 758 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 759 } else { 760 ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 761 } 762 763 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); 764 if (!(flag & IS_ROOT_NODE)) { 765 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 766 *zpp = zfs_znode_alloc(zfsvfs, db, 0); 767 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 768 } else { 769 /* 770 * If we are creating the root node, the "parent" we 771 * passed in is the znode for the root. 
772 */ 773 *zpp = dzp; 774 } 775 zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); 776 if (!(flag & IS_ROOT_NODE)) { 777 vnode_t *vp; 778 779 vp = ZTOV(*zpp); 780 vp->v_vflag |= VV_FORCEINSMQ; 781 err = insmntque(vp, zfsvfs->z_vfs); 782 vp->v_vflag &= ~VV_FORCEINSMQ; 783 KASSERT(err == 0, ("insmntque() failed: error %d", err)); 784 } 785} 786 787void 788zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 789{ 790 xoptattr_t *xoap; 791 792 xoap = xva_getxoptattr(xvap); 793 ASSERT(xoap); 794 795 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 796 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 797 XVA_SET_RTN(xvap, XAT_CREATETIME); 798 } 799 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 800 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 801 XVA_SET_RTN(xvap, XAT_READONLY); 802 } 803 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 804 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 805 XVA_SET_RTN(xvap, XAT_HIDDEN); 806 } 807 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 808 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 809 XVA_SET_RTN(xvap, XAT_SYSTEM); 810 } 811 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 812 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 813 XVA_SET_RTN(xvap, XAT_ARCHIVE); 814 } 815 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 816 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 817 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 818 } 819 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 820 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 821 XVA_SET_RTN(xvap, XAT_NOUNLINK); 822 } 823 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 824 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 825 XVA_SET_RTN(xvap, XAT_APPENDONLY); 826 } 827 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 828 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 829 XVA_SET_RTN(xvap, XAT_NODUMP); 830 } 831 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 832 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); 833 XVA_SET_RTN(xvap, XAT_OPAQUE); 834 } 835 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 836 
ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 837 xoap->xoa_av_quarantined); 838 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 839 } 840 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 841 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); 842 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 843 } 844 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 845 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, 846 sizeof (xoap->xoa_av_scanstamp)); 847 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; 848 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 849 } 850} 851 852int 853zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 854{ 855 dmu_object_info_t doi; 856 dmu_buf_t *db; 857 znode_t *zp; 858 vnode_t *vp; 859 int err, first = 1; 860 861 *zpp = NULL; 862again: 863 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 864 865 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 866 if (err) { 867 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 868 return (err); 869 } 870 871 dmu_object_info_from_db(db, &doi); 872 if (doi.doi_bonus_type != DMU_OT_ZNODE || 873 doi.doi_bonus_size < sizeof (znode_phys_t)) { 874 dmu_buf_rele(db, NULL); 875 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 876 return (EINVAL); 877 } 878 879 zp = dmu_buf_get_user(db); 880 if (zp != NULL) { 881 mutex_enter(&zp->z_lock); 882 883 /* 884 * Since we do immediate eviction of the z_dbuf, we 885 * should never find a dbuf with a znode that doesn't 886 * know about the dbuf. 887 */ 888 ASSERT3P(zp->z_dbuf, ==, db); 889 ASSERT3U(zp->z_id, ==, obj_num); 890 if (zp->z_unlinked) { 891 err = ENOENT; 892 } else { 893 int dying = 0; 894 895 vp = ZTOV(zp); 896 if (vp == NULL) 897 dying = 1; 898 else { 899 VN_HOLD(vp); 900 if ((vp->v_iflag & VI_DOOMED) != 0) { 901 dying = 1; 902 /* 903 * Don't VN_RELE() vnode here, because 904 * it can call vn_lock() which creates 905 * LOR between vnode lock and znode 906 * lock. We will VN_RELE() the vnode 907 * after droping znode lock. 
908 */ 909 } 910 } 911 if (dying) { 912 if (first) { 913 ZFS_LOG(1, "dying znode detected (zp=%p)", zp); 914 first = 0; 915 } 916 /* 917 * znode is dying so we can't reuse it, we must 918 * wait until destruction is completed. 919 */ 920 dmu_buf_rele(db, NULL); 921 mutex_exit(&zp->z_lock); 922 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 923 if (vp != NULL) 924 VN_RELE(vp); 925 tsleep(zp, 0, "zcollide", 1); 926 goto again; 927 } 928 *zpp = zp; 929 err = 0; 930 } 931 dmu_buf_rele(db, NULL); 932 mutex_exit(&zp->z_lock); 933 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 934 return (err); 935 } 936 937 /* 938 * Not found create new znode/vnode 939 */ 940 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); 941 942 vp = ZTOV(zp); 943 vp->v_vflag |= VV_FORCEINSMQ; 944 err = insmntque(vp, zfsvfs->z_vfs); 945 vp->v_vflag &= ~VV_FORCEINSMQ; 946 KASSERT(err == 0, ("insmntque() failed: error %d", err)); 947 VOP_UNLOCK(vp, 0); 948 949 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 950 *zpp = zp; 951 return (0); 952} 953 954int 955zfs_rezget(znode_t *zp) 956{ 957 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 958 dmu_object_info_t doi; 959 dmu_buf_t *db; 960 uint64_t obj_num = zp->z_id; 961 int err; 962 963 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 964 965 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 966 if (err) { 967 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 968 return (err); 969 } 970 971 dmu_object_info_from_db(db, &doi); 972 if (doi.doi_bonus_type != DMU_OT_ZNODE || 973 doi.doi_bonus_size < sizeof (znode_phys_t)) { 974 dmu_buf_rele(db, NULL); 975 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 976 return (EINVAL); 977 } 978 979 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 980 dmu_buf_rele(db, NULL); 981 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 982 return (EIO); 983 } 984 985 zfs_znode_dmu_init(zfsvfs, zp, db); 986 zp->z_unlinked = (zp->z_phys->zp_links == 0); 987 zp->z_blksz = doi.doi_data_block_size; 988 989 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 990 991 return (0); 992} 993 994void 995zfs_znode_delete(znode_t *zp, 
dmu_tx_t *tx) 996{ 997 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 998 objset_t *os = zfsvfs->z_os; 999 uint64_t obj = zp->z_id; 1000 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 1001 1002 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 1003 if (acl_obj) 1004 VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1005 VERIFY(0 == dmu_object_free(os, obj, tx)); 1006 zfs_znode_dmu_fini(zp); 1007 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 1008 zfs_znode_free(zp); 1009} 1010 1011void 1012zfs_zinactive(znode_t *zp) 1013{ 1014 vnode_t *vp = ZTOV(zp); 1015 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1016 uint64_t z_id = zp->z_id; 1017 1018 ASSERT(zp->z_dbuf && zp->z_phys); 1019 1020 /* 1021 * Don't allow a zfs_zget() while were trying to release this znode 1022 */ 1023 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1024 1025 mutex_enter(&zp->z_lock); 1026 VI_LOCK(vp); 1027 if (vp->v_count > 0) { 1028 /* 1029 * If the hold count is greater than zero, somebody has 1030 * obtained a new reference on this znode while we were 1031 * processing it here, so we are done. 1032 */ 1033 VI_UNLOCK(vp); 1034 mutex_exit(&zp->z_lock); 1035 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1036 return; 1037 } 1038 VI_UNLOCK(vp); 1039 1040 /* 1041 * If this was the last reference to a file with no links, 1042 * remove the file from the file system. 
1043 */ 1044 if (zp->z_unlinked) { 1045 mutex_exit(&zp->z_lock); 1046 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1047 ASSERT(vp->v_count == 0); 1048 vrecycle(vp, curthread); 1049 zfs_rmnode(zp); 1050 return; 1051 } 1052 mutex_exit(&zp->z_lock); 1053 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1054} 1055 1056void 1057zfs_znode_free(znode_t *zp) 1058{ 1059 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1060 1061 ASSERT(ZTOV(zp) == NULL); 1062 mutex_enter(&zfsvfs->z_znodes_lock); 1063 POINTER_INVALIDATE(&zp->z_zfsvfs); 1064 list_remove(&zfsvfs->z_all_znodes, zp); 1065 mutex_exit(&zfsvfs->z_znodes_lock); 1066 1067 kmem_cache_free(znode_cache, zp); 1068 1069 VFS_RELE(zfsvfs->z_vfs); 1070} 1071 1072void 1073zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1074{ 1075 timestruc_t now; 1076 1077 ASSERT(MUTEX_HELD(&zp->z_lock)); 1078 1079 gethrestime(&now); 1080 1081 if (tx) { 1082 dmu_buf_will_dirty(zp->z_dbuf, tx); 1083 zp->z_atime_dirty = 0; 1084 zp->z_seq++; 1085 } else { 1086 zp->z_atime_dirty = 1; 1087 } 1088 1089 if (flag & AT_ATIME) 1090 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1091 1092 if (flag & AT_MTIME) { 1093 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1094 if (zp->z_zfsvfs->z_use_fuids) 1095 zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1096 } 1097 1098 if (flag & AT_CTIME) { 1099 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1100 if (zp->z_zfsvfs->z_use_fuids) 1101 zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1102 } 1103} 1104 1105/* 1106 * Update the requested znode timestamps with the current time. 1107 * If we are in a transaction, then go ahead and mark the znode 1108 * dirty in the transaction so the timestamps will go to disk. 1109 * Otherwise, we will get pushed next time the znode is updated 1110 * in a transaction, or when this znode eventually goes inactive. 1111 * 1112 * Why is this OK? 1113 * 1 - Only the ACCESS time is ever updated outside of a transaction. 
1114 * 2 - Multiple consecutive updates will be collapsed into a single 1115 * znode update by the transaction grouping semantics of the DMU. 1116 */ 1117void 1118zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1119{ 1120 mutex_enter(&zp->z_lock); 1121 zfs_time_stamper_locked(zp, flag, tx); 1122 mutex_exit(&zp->z_lock); 1123} 1124 1125/* 1126 * Grow the block size for a file. 1127 * 1128 * IN: zp - znode of file to free data in. 1129 * size - requested block size 1130 * tx - open transaction. 1131 * 1132 * NOTE: this function assumes that the znode is write locked. 1133 */ 1134void 1135zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1136{ 1137 int error; 1138 u_longlong_t dummy; 1139 1140 if (size <= zp->z_blksz) 1141 return; 1142 /* 1143 * If the file size is already greater than the current blocksize, 1144 * we will not grow. If there is more than one block in a file, 1145 * the blocksize cannot change. 1146 */ 1147 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1148 return; 1149 1150 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1151 size, 0, tx); 1152 if (error == ENOTSUP) 1153 return; 1154 ASSERT3U(error, ==, 0); 1155 1156 /* What blocksize did we actually get? */ 1157 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1158} 1159 1160/* 1161 * Increase the file length 1162 * 1163 * IN: zp - znode of file to free data in. 1164 * end - new end-of-file 1165 * 1166 * RETURN: 0 if success 1167 * error code if failure 1168 */ 1169static int 1170zfs_extend(znode_t *zp, uint64_t end) 1171{ 1172 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1173 dmu_tx_t *tx; 1174 rl_t *rl; 1175 uint64_t newblksz; 1176 int error; 1177 1178 /* 1179 * We will change zp_size, lock the whole file. 1180 */ 1181 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1182 1183 /* 1184 * Nothing to do if file already at desired length. 
1185 */ 1186 if (end <= zp->z_phys->zp_size) { 1187 zfs_range_unlock(rl); 1188 return (0); 1189 } 1190top: 1191 tx = dmu_tx_create(zfsvfs->z_os); 1192 dmu_tx_hold_bonus(tx, zp->z_id); 1193 if (end > zp->z_blksz && 1194 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1195 /* 1196 * We are growing the file past the current block size. 1197 */ 1198 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1199 ASSERT(!ISP2(zp->z_blksz)); 1200 newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1201 } else { 1202 newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1203 } 1204 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1205 } else { 1206 newblksz = 0; 1207 } 1208 1209 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1210 if (error) { 1211 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1212 dmu_tx_wait(tx); 1213 dmu_tx_abort(tx); 1214 goto top; 1215 } 1216 dmu_tx_abort(tx); 1217 zfs_range_unlock(rl); 1218 return (error); 1219 } 1220 dmu_buf_will_dirty(zp->z_dbuf, tx); 1221 1222 if (newblksz) 1223 zfs_grow_blocksize(zp, newblksz, tx); 1224 1225 zp->z_phys->zp_size = end; 1226 1227 zfs_range_unlock(rl); 1228 1229 dmu_tx_commit(tx); 1230 1231 rw_enter(&zp->z_map_lock, RW_WRITER); 1232 error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1233 ASSERT(error == 0); 1234 vnode_pager_setsize(ZTOV(zp), end); 1235 rw_exit(&zp->z_map_lock); 1236 1237 return (0); 1238} 1239 1240/* 1241 * Free space in a file. 1242 * 1243 * IN: zp - znode of file to free data in. 1244 * off - start of section to free. 1245 * len - length of section to free. 1246 * 1247 * RETURN: 0 if success 1248 * error code if failure 1249 */ 1250static int 1251zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1252{ 1253 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1254 rl_t *rl; 1255 int error; 1256 1257 /* 1258 * Lock the range being freed. 1259 */ 1260 rl = zfs_range_lock(zp, off, len, RL_WRITER); 1261 1262 /* 1263 * Nothing to do if file already at desired length. 
1264 */ 1265 if (off >= zp->z_phys->zp_size) { 1266 zfs_range_unlock(rl); 1267 return (0); 1268 } 1269 1270 if (off + len > zp->z_phys->zp_size) 1271 len = zp->z_phys->zp_size - off; 1272 1273 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1274 1275 if (error == 0) { 1276 /* 1277 * In FreeBSD we cannot free block in the middle of a file, 1278 * but only at the end of a file. 1279 */ 1280 rw_enter(&zp->z_map_lock, RW_WRITER); 1281 error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1282 ASSERT(error == 0); 1283 vnode_pager_setsize(ZTOV(zp), off); 1284 rw_exit(&zp->z_map_lock); 1285 } 1286 1287 zfs_range_unlock(rl); 1288 1289 return (error); 1290} 1291 1292/* 1293 * Truncate a file 1294 * 1295 * IN: zp - znode of file to free data in. 1296 * end - new end-of-file. 1297 * 1298 * RETURN: 0 if success 1299 * error code if failure 1300 */ 1301static int 1302zfs_trunc(znode_t *zp, uint64_t end) 1303{ 1304 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1305 vnode_t *vp = ZTOV(zp); 1306 dmu_tx_t *tx; 1307 rl_t *rl; 1308 int error; 1309 1310 /* 1311 * We will change zp_size, lock the whole file. 1312 */ 1313 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1314 1315 /* 1316 * Nothing to do if file already at desired length. 
1317 */ 1318 if (end >= zp->z_phys->zp_size) { 1319 zfs_range_unlock(rl); 1320 return (0); 1321 } 1322 1323 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1324 if (error) { 1325 zfs_range_unlock(rl); 1326 return (error); 1327 } 1328top: 1329 tx = dmu_tx_create(zfsvfs->z_os); 1330 dmu_tx_hold_bonus(tx, zp->z_id); 1331 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1332 if (error) { 1333 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1334 dmu_tx_wait(tx); 1335 dmu_tx_abort(tx); 1336 goto top; 1337 } 1338 dmu_tx_abort(tx); 1339 zfs_range_unlock(rl); 1340 return (error); 1341 } 1342 dmu_buf_will_dirty(zp->z_dbuf, tx); 1343 1344 zp->z_phys->zp_size = end; 1345 1346 dmu_tx_commit(tx); 1347 1348 zfs_range_unlock(rl); 1349 1350 /* 1351 * Clear any mapped pages in the truncated region. This has to 1352 * happen outside of the transaction to avoid the possibility of 1353 * a deadlock with someone trying to push a page that we are 1354 * about to invalidate. 1355 */ 1356 rw_enter(&zp->z_map_lock, RW_WRITER); 1357#if 0 1358 error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); 1359#else 1360 error = vinvalbuf(vp, V_SAVE, 0, 0); 1361 ASSERT(error == 0); 1362 vnode_pager_setsize(vp, end); 1363#endif 1364 rw_exit(&zp->z_map_lock); 1365 1366 return (0); 1367} 1368 1369/* 1370 * Free space in a file 1371 * 1372 * IN: zp - znode of file to free data in. 1373 * off - start of range 1374 * len - end of range (0 => EOF) 1375 * flag - current file open mode flags. 
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * When off lies beyond the current file size the file is extended to
 * off + len.  Otherwise a zero len truncates the file at off, while a
 * non-zero len frees len bytes starting at off and then extends the
 * file if off + len reaches past the current size.  If that succeeded
 * and log is set, a new transaction updates the timestamps and records
 * a TX_TRUNCATE entry in the intent log.
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	/* NOTE(review): vp and the flag argument are not used below. */
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog = zfsvfs->z_log;
	int error;

	/* Range starts past end-of-file: this is purely an extension. */
	if (off > zp->z_phys->zp_size) {
		error =  zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		else
			return (error);
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		/*
		 * Free the range; if it ran past the current size, grow
		 * the file so that it ends at off + len.
		 */
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_phys->zp_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		return (error);
log:
	/* Also the retry point when the assignment returns ERESTART. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto log;
		}
		dmu_tx_abort(tx);
		return (error);
	}

	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);
	return (0);
}

/*
 * Populate an objset with the initial ZPL objects.
 *
 *	IN:	os	 - objset to initialize.
 *		cr	 - credentials; their uid and gid become the owner
 *			   of the root directory.
 *		zplprops - nvlist of uint64 properties to store in the
 *			   master node.
 *		tx	 - transaction used for every update made here.
 *
 * Creates the master node ZAP at MASTER_NODE_OBJ, stores the ZPL version
 * and each supplied property in it, creates the unlinked set (delete
 * queue) and finally the root directory znode.  A zfsvfs_t on the stack
 * and a znode from znode_cache exist only so that zfs_mknode() can be
 * called; both are dismantled again before returning.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, doid, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	znode_t		*rootzp = NULL;
	vnode_t		*vp;
	vattr_t		vattr;
	znode_t		*zp;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
		version = ZPL_VERSION;
	else
		version = ZPL_VERSION_FUID - 1;
	/*
	 * NOTE(review): unlike the other updates in this function, the
	 * result of this zap_update() is not checked by an ASSERT.
	 */
	error = zap_update(os, moid, ZPL_VERSION_STR,
	    8, 1, &version, tx);
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			/* An explicit version replaces the default above. */
			version = val;
			error = zap_update(os, moid, ZPL_VERSION_STR,
			    8, 1, &version, tx);
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		/* Remember the values needed for the temporary zfsvfs. */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);

	/*
	 * Create a delete queue.
	 */
	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	VATTR_NULL(&vattr);
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	/*
	 * NOTE(review): the constructor is handed &zfsvfs before the
	 * structure is zeroed by the bzero() further down -- confirm that
	 * it does not depend on the structure's contents.
	 */
	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	zfs_znode_cache_constructor(rootzp, &zfsvfs, 0);
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;

	vp = ZTOV(rootzp);
	vp->v_type = VDIR;
	VN_LOCK_ASHARE(vp);

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_assign = TXG_NOWAIT;
	zfsvfs.z_parent = &zfsvfs;
	zfsvfs.z_version = version;
	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
	zfsvfs.z_norm = norm;
	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	/*
	 * Point the znode at the temporary zfsvfs only for the duration
	 * of zfs_mknode(); the pointer is invalidated again right after.
	 */
	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_zfsvfs = &zfsvfs;
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
	ASSERT3P(zp, ==, rootzp);
	/* Record the root directory's object number in the master node. */
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	/*
	 * Dismantle the temporary vnode and znode: detach them from each
	 * other, destroy the vnode, release the dbuf and return the znode
	 * to the cache.
	 */
	VI_LOCK(vp);
	ZTOV(rootzp)->v_data = NULL;
	ZTOV(rootzp)->v_count = 0;
	ZTOV(rootzp)->v_holdcnt = 0;
	rootzp->z_vnode = NULL;
	VOP_UNLOCK(vp, 0);
	vdestroy(vp);
	dmu_buf_rele(rootzp->z_dbuf, NULL);
	rootzp->z_dbuf = NULL;
	mutex_destroy(&zfsvfs.z_znodes_lock);
	kmem_cache_free(znode_cache, rootzp);
}

#endif /* _KERNEL */
/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
1559 */ 1560static int 1561zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1562{ 1563 dmu_buf_t *db; 1564 dmu_object_info_t doi; 1565 znode_phys_t *zp; 1566 int error; 1567 1568 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1569 return (error); 1570 1571 dmu_object_info_from_db(db, &doi); 1572 if (doi.doi_bonus_type != DMU_OT_ZNODE || 1573 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1574 dmu_buf_rele(db, FTAG); 1575 return (EINVAL); 1576 } 1577 1578 zp = db->db_data; 1579 *pobjp = zp->zp_parent; 1580 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1581 S_ISDIR(zp->zp_mode); 1582 dmu_buf_rele(db, FTAG); 1583 1584 return (0); 1585} 1586 1587int 1588zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1589{ 1590 char *path = buf + len - 1; 1591 int error; 1592 1593 *path = '\0'; 1594 1595 for (;;) { 1596 uint64_t pobj; 1597 char component[MAXNAMELEN + 2]; 1598 size_t complen; 1599 int is_xattrdir; 1600 1601 if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1602 &is_xattrdir)) != 0) 1603 break; 1604 1605 if (pobj == obj) { 1606 if (path[0] != '/') 1607 *--path = '/'; 1608 break; 1609 } 1610 1611 component[0] = '/'; 1612 if (is_xattrdir) { 1613 (void) sprintf(component + 1, "<xattrdir>"); 1614 } else { 1615 error = zap_value_search(osp, pobj, obj, 1616 ZFS_DIRENT_OBJ(-1ULL), component + 1); 1617 if (error != 0) 1618 break; 1619 } 1620 1621 complen = strlen(component); 1622 path -= complen; 1623 ASSERT(path >= buf); 1624 bcopy(component, path, complen); 1625 obj = pobj; 1626 } 1627 1628 if (error == 0) 1629 (void) memmove(buf, path, buf + len - path); 1630 return (error); 1631} 1632