zfs_znode.c revision 187830
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#ifdef _KERNEL 29#include <sys/types.h> 30#include <sys/param.h> 31#include <sys/time.h> 32#include <sys/systm.h> 33#include <sys/sysmacros.h> 34#include <sys/resource.h> 35#include <sys/mntent.h> 36#include <sys/u8_textprep.h> 37#include <sys/dsl_dataset.h> 38#include <sys/vfs.h> 39#include <sys/vnode.h> 40#include <sys/file.h> 41#include <sys/kmem.h> 42#include <sys/errno.h> 43#include <sys/unistd.h> 44#include <sys/atomic.h> 45#include <sys/zfs_dir.h> 46#include <sys/zfs_acl.h> 47#include <sys/zfs_ioctl.h> 48#include <sys/zfs_rlock.h> 49#include <sys/zfs_fuid.h> 50#include <sys/fs/zfs.h> 51#include <sys/kidmap.h> 52#endif /* _KERNEL */ 53 54#include <sys/dmu.h> 55#include <sys/refcount.h> 56#include <sys/stat.h> 57#include <sys/zap.h> 58#include <sys/zfs_znode.h> 59#include <sys/refcount.h> 60 61#include "zfs_prop.h" 62 63/* Used by fstat(1). */ 64SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), 65 "sizeof(znode_t)"); 66 67/* 68 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 69 * turned on when DEBUG is also defined. 70 */ 71#ifdef DEBUG 72#define ZNODE_STATS 73#endif /* DEBUG */ 74 75#ifdef ZNODE_STATS 76#define ZNODE_STAT_ADD(stat) ((stat)++) 77#else 78#define ZNODE_STAT_ADD(stat) /* nothing */ 79#endif /* ZNODE_STATS */ 80 81#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) 82#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) 83 84/* 85 * Functions needed for userland (ie: libzpool) are not put under 86 * #ifdef_KERNEL; the rest of the functions have dependencies 87 * (such as VFS logic) that will not compile easily in userland. 88 */ 89#ifdef _KERNEL 90static kmem_cache_t *znode_cache = NULL; 91 92/*ARGSUSED*/ 93static void 94znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) 95{ 96#if 1 /* XXXPJD: From OpenSolaris. */ 97 /* 98 * We should never drop all dbuf refs without first clearing 99 * the eviction callback. 100 */ 101 panic("evicting znode %p\n", user_ptr); 102#else /* XXXPJD */ 103 znode_t *zp = user_ptr; 104 vnode_t *vp; 105 106 mutex_enter(&zp->z_lock); 107 zp->z_dbuf = NULL; 108 vp = ZTOV(zp); 109 if (vp == NULL) { 110 mutex_exit(&zp->z_lock); 111 zfs_znode_free(zp); 112 } else if (vp->v_count == 0) { 113 ZTOV(zp) = NULL; 114 vhold(vp); 115 mutex_exit(&zp->z_lock); 116 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); 117 vrecycle(vp, curthread); 118 VOP_UNLOCK(vp, 0); 119 vdrop(vp); 120 zfs_znode_free(zp); 121 } else { 122 mutex_exit(&zp->z_lock); 123 } 124#endif 125} 126 127extern struct vop_vector zfs_vnodeops; 128extern struct vop_vector zfs_fifoops; 129 130/* 131 * XXX: We cannot use this function as a cache constructor, because 132 * there is one global cache for all file systems and we need 133 * to pass vfsp here, which is not possible, because argument 134 * 'cdrarg' is defined at kmem_cache_create() time. 135 */ 136static int 137zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 138{ 139 znode_t *zp = buf; 140 vnode_t *vp; 141 vfs_t *vfsp = arg; 142 int error; 143 144 POINTER_INVALIDATE(&zp->z_zfsvfs); 145 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 146 ASSERT(vfsp != NULL); 147 148 error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); 149 if (error != 0 && (kmflags & KM_NOSLEEP)) 150 return (-1); 151 ASSERT(error == 0); 152 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 153 zp->z_vnode = vp; 154 vp->v_data = (caddr_t)zp; 155 VN_LOCK_AREC(vp); 156 VN_LOCK_ASHARE(vp); 157 158 list_link_init(&zp->z_link_node); 159 160 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 161 rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); 162 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 163 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); 164 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 165 166 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); 167 avl_create(&zp->z_range_avl, zfs_range_compare, 168 sizeof (rl_t), offsetof(rl_t, r_node)); 169 170 zp->z_dbuf = NULL; 171 zp->z_dirlocks = NULL; 172 return (0); 173} 174 175/*ARGSUSED*/ 176static void 177zfs_znode_cache_destructor(void *buf, void *arg) 178{ 179 znode_t *zp = buf; 180 181 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 182 ASSERT(ZTOV(zp) == NULL); 183 vn_free(ZTOV(zp)); 184 ASSERT(!list_link_active(&zp->z_link_node)); 185 mutex_destroy(&zp->z_lock); 186 rw_destroy(&zp->z_map_lock); 187 rw_destroy(&zp->z_parent_lock); 188 rw_destroy(&zp->z_name_lock); 189 mutex_destroy(&zp->z_acl_lock); 190 avl_destroy(&zp->z_range_avl); 191 mutex_destroy(&zp->z_range_lock); 192 193 ASSERT(zp->z_dbuf == NULL); 194 ASSERT(zp->z_dirlocks == NULL); 195} 196 197#ifdef ZNODE_STATS 198static struct { 199 uint64_t zms_zfsvfs_invalid; 200 uint64_t zms_zfsvfs_unmounted; 201 uint64_t zms_zfsvfs_recheck_invalid; 202 uint64_t zms_obj_held; 203 uint64_t zms_vnode_locked; 204 uint64_t zms_not_only_dnlc; 205} znode_move_stats; 206#endif /* ZNODE_STATS */ 207 208#if defined(sun) 209static void 210zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) 211{ 212 vnode_t *vp; 213 214 /* Copy fields. */ 215 nzp->z_zfsvfs = ozp->z_zfsvfs; 216 217 /* Swap vnodes. */ 218 vp = nzp->z_vnode; 219 nzp->z_vnode = ozp->z_vnode; 220 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ 221 ZTOV(ozp)->v_data = ozp; 222 ZTOV(nzp)->v_data = nzp; 223 224 nzp->z_id = ozp->z_id; 225 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ 226 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); 227 nzp->z_unlinked = ozp->z_unlinked; 228 nzp->z_atime_dirty = ozp->z_atime_dirty; 229 nzp->z_zn_prefetch = ozp->z_zn_prefetch; 230 nzp->z_blksz = ozp->z_blksz; 231 nzp->z_seq = ozp->z_seq; 232 nzp->z_mapcnt = ozp->z_mapcnt; 233 nzp->z_last_itx = ozp->z_last_itx; 234 nzp->z_gen = ozp->z_gen; 235 nzp->z_sync_cnt = ozp->z_sync_cnt; 236 nzp->z_phys = ozp->z_phys; 237 nzp->z_dbuf = ozp->z_dbuf; 238 239 /* Update back pointers. */ 240 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, 241 znode_evict_error); 242 243 /* 244 * Invalidate the original znode by clearing fields that provide a 245 * pointer back to the znode. Set the low bit of the vfs pointer to 246 * ensure that zfs_znode_move() recognizes the znode as invalid in any 247 * subsequent callback. 248 */ 249 ozp->z_dbuf = NULL; 250 POINTER_INVALIDATE(&ozp->z_zfsvfs); 251} 252 253/* 254 * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise 255 * returns a non-zero error code. 256 */ 257static int 258zfs_enter(zfsvfs_t *zfsvfs) 259{ 260 ZFS_ENTER(zfsvfs); 261 return (0); 262} 263 264/*ARGSUSED*/ 265static kmem_cbrc_t 266zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) 267{ 268 znode_t *ozp = buf, *nzp = newbuf; 269 zfsvfs_t *zfsvfs; 270 vnode_t *vp; 271 272 /* 273 * The znode is on the file system's list of known znodes if the vfs 274 * pointer is valid. We set the low bit of the vfs pointer when freeing 275 * the znode to invalidate it, and the memory patterns written by kmem 276 * (baddcafe and deadbeef) set at least one of the two low bits. A newly 277 * created znode sets the vfs pointer last of all to indicate that the 278 * znode is known and in a valid state to be moved by this function. 279 */ 280 zfsvfs = ozp->z_zfsvfs; 281 if (!POINTER_IS_VALID(zfsvfs)) { 282 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); 283 return (KMEM_CBRC_DONT_KNOW); 284 } 285 286 /* 287 * Ensure that the filesystem is not unmounted during the move. 288 */ 289 if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ 290 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); 291 return (KMEM_CBRC_DONT_KNOW); 292 } 293 294 mutex_enter(&zfsvfs->z_znodes_lock); 295 /* 296 * Recheck the vfs pointer in case the znode was removed just before 297 * acquiring the lock. 298 */ 299 if (zfsvfs != ozp->z_zfsvfs) { 300 mutex_exit(&zfsvfs->z_znodes_lock); 301 ZFS_EXIT(zfsvfs); 302 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); 303 return (KMEM_CBRC_DONT_KNOW); 304 } 305 306 /* 307 * At this point we know that as long as we hold z_znodes_lock, the 308 * znode cannot be freed and fields within the znode can be safely 309 * accessed. Now, prevent a race with zfs_zget(). 310 */ 311 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { 312 mutex_exit(&zfsvfs->z_znodes_lock); 313 ZFS_EXIT(zfsvfs); 314 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); 315 return (KMEM_CBRC_LATER); 316 } 317 318 vp = ZTOV(ozp); 319 if (mutex_tryenter(&vp->v_lock) == 0) { 320 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 321 mutex_exit(&zfsvfs->z_znodes_lock); 322 ZFS_EXIT(zfsvfs); 323 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); 324 return (KMEM_CBRC_LATER); 325 } 326 327 /* Only move znodes that are referenced _only_ by the DNLC. */ 328 if (vp->v_count != 1 || !vn_in_dnlc(vp)) { 329 mutex_exit(&vp->v_lock); 330 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 331 mutex_exit(&zfsvfs->z_znodes_lock); 332 ZFS_EXIT(zfsvfs); 333 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); 334 return (KMEM_CBRC_LATER); 335 } 336 337 /* 338 * The znode is known and in a valid state to move. We're holding the 339 * locks needed to execute the critical section. 340 */ 341 zfs_znode_move_impl(ozp, nzp); 342 mutex_exit(&vp->v_lock); 343 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 344 345 list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 346 mutex_exit(&zfsvfs->z_znodes_lock); 347 ZFS_EXIT(zfsvfs); 348 349 return (KMEM_CBRC_YES); 350} 351#endif /* sun */ 352 353void 354zfs_znode_init(void) 355{ 356 /* 357 * Initialize zcache 358 */ 359 ASSERT(znode_cache == NULL); 360 znode_cache = kmem_cache_create("zfs_znode_cache", 361 sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, 362 zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 363#if defined(sun) 364 kmem_cache_set_move(znode_cache, zfs_znode_move); 365#endif 366} 367 368void 369zfs_znode_fini(void) 370{ 371 /* 372 * Cleanup zcache 373 */ 374 if (znode_cache) 375 kmem_cache_destroy(znode_cache); 376 znode_cache = NULL; 377} 378 379/* 380 * zfs_init_fs - Initialize the zfsvfs struct and the file system 381 * incore "master" object. Verify version compatibility. 382 */ 383int 384zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) 385{ 386 objset_t *os = zfsvfs->z_os; 387 int i, error; 388 uint64_t fsid_guid; 389 uint64_t zval; 390 391 *zpp = NULL; 392 393 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 394 if (error) { 395 return (error); 396 } else if (zfsvfs->z_version > ZPL_VERSION) { 397 (void) printf("Mismatched versions: File system " 398 "is version %llu on-disk format, which is " 399 "incompatible with this software version %lld!", 400 (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); 401 return (ENOTSUP); 402 } 403 404 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 405 return (error); 406 zfsvfs->z_norm = (int)zval; 407 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 408 return (error); 409 zfsvfs->z_utf8 = (zval != 0); 410 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 411 return (error); 412 zfsvfs->z_case = (uint_t)zval; 413 /* 414 * Fold case on file systems that are always or sometimes case 415 * insensitive. 416 */ 417 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 418 zfsvfs->z_case == ZFS_CASE_MIXED) 419 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 420 421 /* 422 * The fsid is 64 bits, composed of an 8-bit fs type, which 423 * separates our fsid from any other filesystem types, and a 424 * 56-bit objset unique ID. The objset unique ID is unique to 425 * all objsets open on this system, provided by unique_create(). 426 * The 8-bit fs type must be put in the low bits of fsid[1] 427 * because that's where other Solaris filesystems put it. 428 */ 429 fsid_guid = dmu_objset_fsid_guid(os); 430 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 431 zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; 432 zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 433 zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; 434 435 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 436 &zfsvfs->z_root); 437 if (error) 438 return (error); 439 ASSERT(zfsvfs->z_root != 0); 440 441 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 442 &zfsvfs->z_unlinkedobj); 443 if (error) 444 return (error); 445 446 /* 447 * Initialize zget mutex's 448 */ 449 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 450 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 451 452 error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); 453 if (error) { 454 /* 455 * On error, we destroy the mutexes here since it's not 456 * possible for the caller to determine if the mutexes were 457 * initialized properly. 458 */ 459 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 460 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 461 return (error); 462 } 463 ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); 464 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 465 &zfsvfs->z_fuid_obj); 466 if (error == ENOENT) 467 error = 0; 468 469 return (0); 470} 471 472/* 473 * define a couple of values we need available 474 * for both 64 and 32 bit environments. 475 */ 476#ifndef NBITSMINOR64 477#define NBITSMINOR64 32 478#endif 479#ifndef MAXMAJ64 480#define MAXMAJ64 0xffffffffUL 481#endif 482#ifndef MAXMIN64 483#define MAXMIN64 0xffffffffUL 484#endif 485 486/* 487 * Create special expldev for ZFS private use. 488 * Can't use standard expldev since it doesn't do 489 * what we want. The standard expldev() takes a 490 * dev32_t in LP64 and expands it to a long dev_t. 491 * We need an interface that takes a dev32_t in ILP32 492 * and expands it to a long dev_t. 493 */ 494static uint64_t 495zfs_expldev(dev_t dev) 496{ 497 return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); 498} 499/* 500 * Special cmpldev for ZFS private use. 501 * Can't use standard cmpldev since it takes 502 * a long dev_t and compresses it to dev32_t in 503 * LP64. We need to do a compaction of a long dev_t 504 * to a dev32_t in ILP32. 505 */ 506dev_t 507zfs_cmpldev(uint64_t dev) 508{ 509 return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); 510} 511 512static void 513zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) 514{ 515 znode_t *nzp; 516 517 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 518 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 519 520 mutex_enter(&zp->z_lock); 521 522 ASSERT(zp->z_dbuf == NULL); 523 zp->z_dbuf = db; 524 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); 525 526 /* 527 * there should be no 528 * concurrent zgets on this object. 529 */ 530 if (nzp != NULL) 531 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); 532 533 /* 534 * Slap on VROOT if we are the root znode 535 */ 536 if (zp->z_id == zfsvfs->z_root) 537 ZTOV(zp)->v_flag |= VROOT; 538 539 mutex_exit(&zp->z_lock); 540 vn_exists(ZTOV(zp)); 541} 542 543void 544zfs_znode_dmu_fini(znode_t *zp) 545{ 546 dmu_buf_t *db = zp->z_dbuf; 547 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 548 zp->z_unlinked || 549 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 550 ASSERT(zp->z_dbuf != NULL); 551 zp->z_dbuf = NULL; 552 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); 553 dmu_buf_rele(db, NULL); 554} 555 556/* 557 * Construct a new znode/vnode and intialize. 558 * 559 * This does not do a call to dmu_set_user() that is 560 * up to the caller to do, in case you don't want to 561 * return the znode 562 */ 563static znode_t * 564zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) 565{ 566 znode_t *zp; 567 vnode_t *vp; 568 569 zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 570 zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0); 571 572 ASSERT(zp->z_dirlocks == NULL); 573 ASSERT(zp->z_dbuf == NULL); 574 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 575 576 /* 577 * Defer setting z_zfsvfs until the znode is ready to be a candidate for 578 * the zfs_znode_move() callback. 579 */ 580 zp->z_phys = NULL; 581 zp->z_unlinked = 0; 582 zp->z_atime_dirty = 0; 583 zp->z_mapcnt = 0; 584 zp->z_last_itx = 0; 585 zp->z_id = db->db_object; 586 zp->z_blksz = blksz; 587 zp->z_seq = 0x7A4653; 588 zp->z_sync_cnt = 0; 589 590 vp = ZTOV(zp); 591#ifdef TODO 592 vn_reinit(vp); 593#endif 594 595 zfs_znode_dmu_init(zfsvfs, zp, db); 596 597 zp->z_gen = zp->z_phys->zp_gen; 598 599#if 0 600 if (vp == NULL) 601 return (zp); 602#endif 603 604 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); 605 switch (vp->v_type) { 606 case VDIR: 607 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 608 break; 609 case VFIFO: 610 vp->v_op = &zfs_fifoops; 611 break; 612 } 613 614 mutex_enter(&zfsvfs->z_znodes_lock); 615 list_insert_tail(&zfsvfs->z_all_znodes, zp); 616 membar_producer(); 617 /* 618 * Everything else must be valid before assigning z_zfsvfs makes the 619 * znode eligible for zfs_znode_move(). 620 */ 621 zp->z_zfsvfs = zfsvfs; 622 mutex_exit(&zfsvfs->z_znodes_lock); 623 624 VFS_HOLD(zfsvfs->z_vfs); 625 return (zp); 626} 627 628/* 629 * Create a new DMU object to hold a zfs znode. 630 * 631 * IN: dzp - parent directory for new znode 632 * vap - file attributes for new znode 633 * tx - dmu transaction id for zap operations 634 * cr - credentials of caller 635 * flag - flags: 636 * IS_ROOT_NODE - new object will be root 637 * IS_XATTR - new object is an attribute 638 * IS_REPLAY - intent log replay 639 * bonuslen - length of bonus buffer 640 * setaclp - File/Dir initial ACL 641 * fuidp - Tracks fuid allocation. 642 * 643 * OUT: zpp - allocated znode 644 * 645 */ 646void 647zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 648 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, 649 zfs_fuid_info_t **fuidp) 650{ 651 dmu_buf_t *db; 652 znode_phys_t *pzp; 653 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 654 timestruc_t now; 655 uint64_t gen, obj; 656 int err; 657 658 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 659 660 if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ 661 obj = vap->va_nodeid; 662 flag |= IS_REPLAY; 663 now = vap->va_ctime; /* see zfs_replay_create() */ 664 gen = vap->va_nblocks; /* ditto */ 665 } else { 666 obj = 0; 667 gethrestime(&now); 668 gen = dmu_tx_get_txg(tx); 669 } 670 671 /* 672 * Create a new DMU object. 673 */ 674 /* 675 * There's currently no mechanism for pre-reading the blocks that will 676 * be to needed allocate a new object, so we accept the small chance 677 * that there will be an i/o error and we will fail one of the 678 * assertions below. 679 */ 680 if (vap->va_type == VDIR) { 681 if (flag & IS_REPLAY) { 682 err = zap_create_claim_norm(zfsvfs->z_os, obj, 683 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 684 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 685 ASSERT3U(err, ==, 0); 686 } else { 687 obj = zap_create_norm(zfsvfs->z_os, 688 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 689 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 690 } 691 } else { 692 if (flag & IS_REPLAY) { 693 err = dmu_object_claim(zfsvfs->z_os, obj, 694 DMU_OT_PLAIN_FILE_CONTENTS, 0, 695 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 696 ASSERT3U(err, ==, 0); 697 } else { 698 obj = dmu_object_alloc(zfsvfs->z_os, 699 DMU_OT_PLAIN_FILE_CONTENTS, 0, 700 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 701 } 702 } 703 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 704 dmu_buf_will_dirty(db, tx); 705 706 /* 707 * Initialize the znode physical data to zero. 708 */ 709 ASSERT(db->db_size >= sizeof (znode_phys_t)); 710 bzero(db->db_data, db->db_size); 711 pzp = db->db_data; 712 713 /* 714 * If this is the root, fix up the half-initialized parent pointer 715 * to reference the just-allocated physical data area. 716 */ 717 if (flag & IS_ROOT_NODE) { 718 dzp->z_dbuf = db; 719 dzp->z_phys = pzp; 720 dzp->z_id = obj; 721 } 722 723 /* 724 * If parent is an xattr, so am I. 725 */ 726 if (dzp->z_phys->zp_flags & ZFS_XATTR) 727 flag |= IS_XATTR; 728 729 if (vap->va_type == VBLK || vap->va_type == VCHR) { 730 pzp->zp_rdev = zfs_expldev(vap->va_rdev); 731 } 732 733 if (zfsvfs->z_use_fuids) 734 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 735 736 if (vap->va_type == VDIR) { 737 pzp->zp_size = 2; /* contents ("." and "..") */ 738 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 739 } 740 741 pzp->zp_parent = dzp->z_id; 742 if (flag & IS_XATTR) 743 pzp->zp_flags |= ZFS_XATTR; 744 745 pzp->zp_gen = gen; 746 747 ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 748 ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 749 750 if (vap->va_mask & AT_ATIME) { 751 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 752 } else { 753 ZFS_TIME_ENCODE(&now, pzp->zp_atime); 754 } 755 756 if (vap->va_mask & AT_MTIME) { 757 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 758 } else { 759 ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 760 } 761 762 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); 763 if (!(flag & IS_ROOT_NODE)) { 764 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 765 *zpp = zfs_znode_alloc(zfsvfs, db, 0); 766 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 767 } else { 768 /* 769 * If we are creating the root node, the "parent" we 770 * passed in is the znode for the root. 771 */ 772 *zpp = dzp; 773 } 774 zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); 775 if (!(flag & IS_ROOT_NODE)) { 776 vnode_t *vp; 777 778 vp = ZTOV(*zpp); 779 vp->v_vflag |= VV_FORCEINSMQ; 780 err = insmntque(vp, zfsvfs->z_vfs); 781 vp->v_vflag &= ~VV_FORCEINSMQ; 782 KASSERT(err == 0, ("insmntque() failed: error %d", err)); 783 } 784} 785 786void 787zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 788{ 789 xoptattr_t *xoap; 790 791 xoap = xva_getxoptattr(xvap); 792 ASSERT(xoap); 793 794 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 795 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 796 XVA_SET_RTN(xvap, XAT_CREATETIME); 797 } 798 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 799 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 800 XVA_SET_RTN(xvap, XAT_READONLY); 801 } 802 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 803 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 804 XVA_SET_RTN(xvap, XAT_HIDDEN); 805 } 806 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 807 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 808 XVA_SET_RTN(xvap, XAT_SYSTEM); 809 } 810 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 811 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 812 XVA_SET_RTN(xvap, XAT_ARCHIVE); 813 } 814 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 815 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 816 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 817 } 818 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 819 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 820 XVA_SET_RTN(xvap, XAT_NOUNLINK); 821 } 822 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 823 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 824 XVA_SET_RTN(xvap, XAT_APPENDONLY); 825 } 826 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 827 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 828 XVA_SET_RTN(xvap, XAT_NODUMP); 829 } 830 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 831 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); 832 XVA_SET_RTN(xvap, XAT_OPAQUE); 833 } 834 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 835 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 836 xoap->xoa_av_quarantined); 837 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 838 } 839 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 840 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); 841 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 842 } 843 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 844 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, 845 sizeof (xoap->xoa_av_scanstamp)); 846 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; 847 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 848 } 849} 850 851int 852zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 853{ 854 dmu_object_info_t doi; 855 dmu_buf_t *db; 856 znode_t *zp; 857 vnode_t *vp; 858 int err, first = 1; 859 860 *zpp = NULL; 861again: 862 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 863 864 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 865 if (err) { 866 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 867 return (err); 868 } 869 870 dmu_object_info_from_db(db, &doi); 871 if (doi.doi_bonus_type != DMU_OT_ZNODE || 872 doi.doi_bonus_size < sizeof (znode_phys_t)) { 873 dmu_buf_rele(db, NULL); 874 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 875 return (EINVAL); 876 } 877 878 zp = dmu_buf_get_user(db); 879 if (zp != NULL) { 880 mutex_enter(&zp->z_lock); 881 882 /* 883 * Since we do immediate eviction of the z_dbuf, we 884 * should never find a dbuf with a znode that doesn't 885 * know about the dbuf. 886 */ 887 ASSERT3P(zp->z_dbuf, ==, db); 888 ASSERT3U(zp->z_id, ==, obj_num); 889 if (zp->z_unlinked) { 890 err = ENOENT; 891 } else { 892 if (ZTOV(zp) != NULL) 893 VN_HOLD(ZTOV(zp)); 894 else { 895 if (first) { 896 ZFS_LOG(1, "dying znode detected (zp=%p)", zp); 897 first = 0; 898 } 899 /* 900 * znode is dying so we can't reuse it, we must 901 * wait until destruction is completed. 902 */ 903 dmu_buf_rele(db, NULL); 904 mutex_exit(&zp->z_lock); 905 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 906 tsleep(zp, 0, "zcollide", 1); 907 goto again; 908 } 909 *zpp = zp; 910 err = 0; 911 } 912 dmu_buf_rele(db, NULL); 913 mutex_exit(&zp->z_lock); 914 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 915 return (err); 916 } 917 918 /* 919 * Not found create new znode/vnode 920 */ 921 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); 922 923 vp = ZTOV(zp); 924 vp->v_vflag |= VV_FORCEINSMQ; 925 err = insmntque(vp, zfsvfs->z_vfs); 926 vp->v_vflag &= ~VV_FORCEINSMQ; 927 KASSERT(err == 0, ("insmntque() failed: error %d", err)); 928 VOP_UNLOCK(vp, 0); 929 930 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 931 *zpp = zp; 932 return (0); 933} 934 935int 936zfs_rezget(znode_t *zp) 937{ 938 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 939 dmu_object_info_t doi; 940 dmu_buf_t *db; 941 uint64_t obj_num = zp->z_id; 942 int err; 943 944 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 945 946 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 947 if (err) { 948 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 949 return (err); 950 } 951 952 dmu_object_info_from_db(db, &doi); 953 if (doi.doi_bonus_type != DMU_OT_ZNODE || 954 doi.doi_bonus_size < sizeof (znode_phys_t)) { 955 dmu_buf_rele(db, NULL); 956 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 957 return (EINVAL); 958 } 959 960 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 961 dmu_buf_rele(db, NULL); 962 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 963 return (EIO); 964 } 965 966 zfs_znode_dmu_init(zfsvfs, zp, db); 967 zp->z_unlinked = (zp->z_phys->zp_links == 0); 968 zp->z_blksz = doi.doi_data_block_size; 969 970 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 971 972 return (0); 973} 974 975void 976zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 977{ 978 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 979 objset_t *os = zfsvfs->z_os; 980 uint64_t obj = zp->z_id; 981 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 982 983 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 984 if (acl_obj) 985 VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 986 VERIFY(0 == dmu_object_free(os, obj, tx)); 987 zfs_znode_dmu_fini(zp); 988 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 989 zfs_znode_free(zp); 990} 991 992void 993zfs_zinactive(znode_t *zp) 994{ 995 vnode_t *vp = ZTOV(zp); 996 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 997 uint64_t z_id = zp->z_id; 998 999 ASSERT(zp->z_dbuf && zp->z_phys); 1000 1001 /* 1002 * Don't allow a zfs_zget() while were trying to release this znode 1003 */ 1004 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1005 1006 mutex_enter(&zp->z_lock); 1007 VI_LOCK(vp); 1008 if (vp->v_count > 0) { 1009 /* 1010 * If the hold count is greater than zero, somebody has 1011 * obtained a new reference on this znode while we were 1012 * processing it here, so we are done. 1013 */ 1014 VI_UNLOCK(vp); 1015 mutex_exit(&zp->z_lock); 1016 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1017 return; 1018 } 1019 VI_UNLOCK(vp); 1020 1021 /* 1022 * If this was the last reference to a file with no links, 1023 * remove the file from the file system. 1024 */ 1025 if (zp->z_unlinked) { 1026 mutex_exit(&zp->z_lock); 1027 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1028 ASSERT(vp->v_count == 0); 1029 vrecycle(vp, curthread); 1030 zfs_rmnode(zp); 1031 return; 1032 } 1033 mutex_exit(&zp->z_lock); 1034 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1035} 1036 1037void 1038zfs_znode_free(znode_t *zp) 1039{ 1040 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1041 1042 ASSERT(ZTOV(zp) == NULL); 1043 mutex_enter(&zfsvfs->z_znodes_lock); 1044 POINTER_INVALIDATE(&zp->z_zfsvfs); 1045 list_remove(&zfsvfs->z_all_znodes, zp); 1046 mutex_exit(&zfsvfs->z_znodes_lock); 1047 1048 kmem_cache_free(znode_cache, zp); 1049 1050 VFS_RELE(zfsvfs->z_vfs); 1051} 1052 1053void 1054zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1055{ 1056 timestruc_t now; 1057 1058 ASSERT(MUTEX_HELD(&zp->z_lock)); 1059 1060 gethrestime(&now); 1061 1062 if (tx) { 1063 dmu_buf_will_dirty(zp->z_dbuf, tx); 1064 zp->z_atime_dirty = 0; 1065 zp->z_seq++; 1066 } else { 1067 zp->z_atime_dirty = 1; 1068 } 1069 1070 if (flag & AT_ATIME) 1071 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1072 1073 if (flag & AT_MTIME) { 1074 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1075 if (zp->z_zfsvfs->z_use_fuids) 1076 zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1077 } 1078 1079 if (flag & AT_CTIME) { 1080 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1081 if (zp->z_zfsvfs->z_use_fuids) 1082 zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1083 } 1084} 1085 1086/* 1087 * Update the requested znode timestamps with the current time. 1088 * If we are in a transaction, then go ahead and mark the znode 1089 * dirty in the transaction so the timestamps will go to disk. 1090 * Otherwise, we will get pushed next time the znode is updated 1091 * in a transaction, or when this znode eventually goes inactive. 1092 * 1093 * Why is this OK? 1094 * 1 - Only the ACCESS time is ever updated outside of a transaction. 1095 * 2 - Multiple consecutive updates will be collapsed into a single 1096 * znode update by the transaction grouping semantics of the DMU. 1097 */ 1098void 1099zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1100{ 1101 mutex_enter(&zp->z_lock); 1102 zfs_time_stamper_locked(zp, flag, tx); 1103 mutex_exit(&zp->z_lock); 1104} 1105 1106/* 1107 * Grow the block size for a file. 1108 * 1109 * IN: zp - znode of file to free data in. 1110 * size - requested block size 1111 * tx - open transaction. 1112 * 1113 * NOTE: this function assumes that the znode is write locked. 1114 */ 1115void 1116zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1117{ 1118 int error; 1119 u_longlong_t dummy; 1120 1121 if (size <= zp->z_blksz) 1122 return; 1123 /* 1124 * If the file size is already greater than the current blocksize, 1125 * we will not grow. If there is more than one block in a file, 1126 * the blocksize cannot change. 1127 */ 1128 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1129 return; 1130 1131 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1132 size, 0, tx); 1133 if (error == ENOTSUP) 1134 return; 1135 ASSERT3U(error, ==, 0); 1136 1137 /* What blocksize did we actually get? */ 1138 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1139} 1140 1141/* 1142 * Increase the file length 1143 * 1144 * IN: zp - znode of file to free data in. 1145 * end - new end-of-file 1146 * 1147 * RETURN: 0 if success 1148 * error code if failure 1149 */ 1150static int 1151zfs_extend(znode_t *zp, uint64_t end) 1152{ 1153 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1154 dmu_tx_t *tx; 1155 rl_t *rl; 1156 uint64_t newblksz; 1157 int error; 1158 1159 /* 1160 * We will change zp_size, lock the whole file. 1161 */ 1162 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1163 1164 /* 1165 * Nothing to do if file already at desired length. 1166 */ 1167 if (end <= zp->z_phys->zp_size) { 1168 zfs_range_unlock(rl); 1169 return (0); 1170 } 1171top: 1172 tx = dmu_tx_create(zfsvfs->z_os); 1173 dmu_tx_hold_bonus(tx, zp->z_id); 1174 if (end > zp->z_blksz && 1175 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1176 /* 1177 * We are growing the file past the current block size. 1178 */ 1179 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1180 ASSERT(!ISP2(zp->z_blksz)); 1181 newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1182 } else { 1183 newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1184 } 1185 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1186 } else { 1187 newblksz = 0; 1188 } 1189 1190 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1191 if (error) { 1192 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1193 dmu_tx_wait(tx); 1194 dmu_tx_abort(tx); 1195 goto top; 1196 } 1197 dmu_tx_abort(tx); 1198 zfs_range_unlock(rl); 1199 return (error); 1200 } 1201 dmu_buf_will_dirty(zp->z_dbuf, tx); 1202 1203 if (newblksz) 1204 zfs_grow_blocksize(zp, newblksz, tx); 1205 1206 zp->z_phys->zp_size = end; 1207 1208 zfs_range_unlock(rl); 1209 1210 dmu_tx_commit(tx); 1211 1212 rw_enter(&zp->z_map_lock, RW_WRITER); 1213 error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1214 ASSERT(error == 0); 1215 vnode_pager_setsize(ZTOV(zp), end); 1216 rw_exit(&zp->z_map_lock); 1217 1218 return (0); 1219} 1220 1221/* 1222 * Free space in a file. 1223 * 1224 * IN: zp - znode of file to free data in. 1225 * off - start of section to free. 1226 * len - length of section to free. 1227 * 1228 * RETURN: 0 if success 1229 * error code if failure 1230 */ 1231static int 1232zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1233{ 1234 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1235 rl_t *rl; 1236 int error; 1237 1238 /* 1239 * Lock the range being freed. 1240 */ 1241 rl = zfs_range_lock(zp, off, len, RL_WRITER); 1242 1243 /* 1244 * Nothing to do if file already at desired length. 1245 */ 1246 if (off >= zp->z_phys->zp_size) { 1247 zfs_range_unlock(rl); 1248 return (0); 1249 } 1250 1251 if (off + len > zp->z_phys->zp_size) 1252 len = zp->z_phys->zp_size - off; 1253 1254 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1255 1256 if (error == 0) { 1257 /* 1258 * In FreeBSD we cannot free block in the middle of a file, 1259 * but only at the end of a file. 1260 */ 1261 rw_enter(&zp->z_map_lock, RW_WRITER); 1262 error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1263 ASSERT(error == 0); 1264 vnode_pager_setsize(ZTOV(zp), off); 1265 rw_exit(&zp->z_map_lock); 1266 } 1267 1268 zfs_range_unlock(rl); 1269 1270 return (error); 1271} 1272 1273/* 1274 * Truncate a file 1275 * 1276 * IN: zp - znode of file to free data in. 1277 * end - new end-of-file. 1278 * 1279 * RETURN: 0 if success 1280 * error code if failure 1281 */ 1282static int 1283zfs_trunc(znode_t *zp, uint64_t end) 1284{ 1285 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1286 vnode_t *vp = ZTOV(zp); 1287 dmu_tx_t *tx; 1288 rl_t *rl; 1289 int error; 1290 1291 /* 1292 * We will change zp_size, lock the whole file. 1293 */ 1294 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1295 1296 /* 1297 * Nothing to do if file already at desired length. 1298 */ 1299 if (end >= zp->z_phys->zp_size) { 1300 zfs_range_unlock(rl); 1301 return (0); 1302 } 1303 1304 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1305 if (error) { 1306 zfs_range_unlock(rl); 1307 return (error); 1308 } 1309top: 1310 tx = dmu_tx_create(zfsvfs->z_os); 1311 dmu_tx_hold_bonus(tx, zp->z_id); 1312 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1313 if (error) { 1314 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1315 dmu_tx_wait(tx); 1316 dmu_tx_abort(tx); 1317 goto top; 1318 } 1319 dmu_tx_abort(tx); 1320 zfs_range_unlock(rl); 1321 return (error); 1322 } 1323 dmu_buf_will_dirty(zp->z_dbuf, tx); 1324 1325 zp->z_phys->zp_size = end; 1326 1327 dmu_tx_commit(tx); 1328 1329 zfs_range_unlock(rl); 1330 1331 /* 1332 * Clear any mapped pages in the truncated region. This has to 1333 * happen outside of the transaction to avoid the possibility of 1334 * a deadlock with someone trying to push a page that we are 1335 * about to invalidate. 1336 */ 1337 rw_enter(&zp->z_map_lock, RW_WRITER); 1338#if 0 1339 error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); 1340#else 1341 error = vinvalbuf(vp, V_SAVE, 0, 0); 1342 ASSERT(error == 0); 1343 vnode_pager_setsize(vp, end); 1344#endif 1345 rw_exit(&zp->z_map_lock); 1346 1347 return (0); 1348} 1349 1350/* 1351 * Free space in a file 1352 * 1353 * IN: zp - znode of file to free data in. 1354 * off - start of range 1355 * len - end of range (0 => EOF) 1356 * flag - current file open mode flags. 1357 * log - TRUE if this action should be logged 1358 * 1359 * RETURN: 0 if success 1360 * error code if failure 1361 */ 1362int 1363zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1364{ 1365 vnode_t *vp = ZTOV(zp); 1366 dmu_tx_t *tx; 1367 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1368 zilog_t *zilog = zfsvfs->z_log; 1369 int error; 1370 1371 if (off > zp->z_phys->zp_size) { 1372 error = zfs_extend(zp, off+len); 1373 if (error == 0 && log) 1374 goto log; 1375 else 1376 return (error); 1377 } 1378 1379 if (len == 0) { 1380 error = zfs_trunc(zp, off); 1381 } else { 1382 if ((error = zfs_free_range(zp, off, len)) == 0 && 1383 off + len > zp->z_phys->zp_size) 1384 error = zfs_extend(zp, off+len); 1385 } 1386 if (error || !log) 1387 return (error); 1388log: 1389 tx = dmu_tx_create(zfsvfs->z_os); 1390 dmu_tx_hold_bonus(tx, zp->z_id); 1391 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1392 if (error) { 1393 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1394 dmu_tx_wait(tx); 1395 dmu_tx_abort(tx); 1396 goto log; 1397 } 1398 dmu_tx_abort(tx); 1399 return (error); 1400 } 1401 1402 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1403 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1404 1405 dmu_tx_commit(tx); 1406 return (0); 1407} 1408 1409void 1410zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1411{ 1412 zfsvfs_t zfsvfs; 1413 uint64_t moid, doid, version; 1414 uint64_t sense = ZFS_CASE_SENSITIVE; 1415 uint64_t norm = 0; 1416 nvpair_t *elem; 1417 int error; 1418 znode_t *rootzp = NULL; 1419 vnode_t *vp; 1420 vattr_t vattr; 1421 znode_t *zp; 1422 1423 /* 1424 * First attempt to create master node. 1425 */ 1426 /* 1427 * In an empty objset, there are no blocks to read and thus 1428 * there can be no i/o errors (which we assert below). 1429 */ 1430 moid = MASTER_NODE_OBJ; 1431 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1432 DMU_OT_NONE, 0, tx); 1433 ASSERT(error == 0); 1434 1435 /* 1436 * Set starting attributes. 1437 */ 1438 if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1439 version = ZPL_VERSION; 1440 else 1441 version = ZPL_VERSION_FUID - 1; 1442 error = zap_update(os, moid, ZPL_VERSION_STR, 1443 8, 1, &version, tx); 1444 elem = NULL; 1445 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1446 /* For the moment we expect all zpl props to be uint64_ts */ 1447 uint64_t val; 1448 char *name; 1449 1450 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1451 VERIFY(nvpair_value_uint64(elem, &val) == 0); 1452 name = nvpair_name(elem); 1453 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1454 version = val; 1455 error = zap_update(os, moid, ZPL_VERSION_STR, 1456 8, 1, &version, tx); 1457 } else { 1458 error = zap_update(os, moid, name, 8, 1, &val, tx); 1459 } 1460 ASSERT(error == 0); 1461 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1462 norm = val; 1463 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1464 sense = val; 1465 } 1466 ASSERT(version != 0); 1467 1468 /* 1469 * Create a delete queue. 1470 */ 1471 doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1472 1473 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); 1474 ASSERT(error == 0); 1475 1476 /* 1477 * Create root znode. Create minimal znode/vnode/zfsvfs 1478 * to allow zfs_mknode to work. 1479 */ 1480 VATTR_NULL(&vattr); 1481 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1482 vattr.va_type = VDIR; 1483 vattr.va_mode = S_IFDIR|0755; 1484 vattr.va_uid = crgetuid(cr); 1485 vattr.va_gid = crgetgid(cr); 1486 1487 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1488 zfs_znode_cache_constructor(rootzp, &zfsvfs, 0); 1489 rootzp->z_unlinked = 0; 1490 rootzp->z_atime_dirty = 0; 1491 1492 vp = ZTOV(rootzp); 1493 vp->v_type = VDIR; 1494 1495 bzero(&zfsvfs, sizeof (zfsvfs_t)); 1496 1497 zfsvfs.z_os = os; 1498 zfsvfs.z_assign = TXG_NOWAIT; 1499 zfsvfs.z_parent = &zfsvfs; 1500 zfsvfs.z_version = version; 1501 zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1502 zfsvfs.z_norm = norm; 1503 /* 1504 * Fold case on file systems that are always or sometimes case 1505 * insensitive. 1506 */ 1507 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1508 zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1509 1510 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1511 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1512 offsetof(znode_t, z_link_node)); 1513 1514 ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1515 rootzp->z_zfsvfs = &zfsvfs; 1516 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); 1517 ASSERT3P(zp, ==, rootzp); 1518 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1519 ASSERT(error == 0); 1520 POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1521 1522 VI_LOCK(vp); 1523 ZTOV(rootzp)->v_data = NULL; 1524 ZTOV(rootzp)->v_count = 0; 1525 ZTOV(rootzp)->v_holdcnt = 0; 1526 ZTOV(rootzp) = NULL; 1527 VOP_UNLOCK(vp, 0); 1528 vdestroy(vp); 1529 dmu_buf_rele(rootzp->z_dbuf, NULL); 1530 rootzp->z_dbuf = NULL; 1531 mutex_destroy(&zfsvfs.z_znodes_lock); 1532 kmem_cache_free(znode_cache, rootzp); 1533} 1534 1535#endif /* _KERNEL */ 1536/* 1537 * Given an object number, return its parent object number and whether 1538 * or not the object is an extended attribute directory. 1539 */ 1540static int 1541zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1542{ 1543 dmu_buf_t *db; 1544 dmu_object_info_t doi; 1545 znode_phys_t *zp; 1546 int error; 1547 1548 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1549 return (error); 1550 1551 dmu_object_info_from_db(db, &doi); 1552 if (doi.doi_bonus_type != DMU_OT_ZNODE || 1553 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1554 dmu_buf_rele(db, FTAG); 1555 return (EINVAL); 1556 } 1557 1558 zp = db->db_data; 1559 *pobjp = zp->zp_parent; 1560 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1561 S_ISDIR(zp->zp_mode); 1562 dmu_buf_rele(db, FTAG); 1563 1564 return (0); 1565} 1566 1567int 1568zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1569{ 1570 char *path = buf + len - 1; 1571 int error; 1572 1573 *path = '\0'; 1574 1575 for (;;) { 1576 uint64_t pobj; 1577 char component[MAXNAMELEN + 2]; 1578 size_t complen; 1579 int is_xattrdir; 1580 1581 if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1582 &is_xattrdir)) != 0) 1583 break; 1584 1585 if (pobj == obj) { 1586 if (path[0] != '/') 1587 *--path = '/'; 1588 break; 1589 } 1590 1591 component[0] = '/'; 1592 if (is_xattrdir) { 1593 (void) sprintf(component + 1, "<xattrdir>"); 1594 } else { 1595 error = zap_value_search(osp, pobj, obj, 1596 ZFS_DIRENT_OBJ(-1ULL), component + 1); 1597 if (error != 0) 1598 break; 1599 } 1600 1601 complen = strlen(component); 1602 path -= complen; 1603 ASSERT(path >= buf); 1604 bcopy(component, path, complen); 1605 obj = pobj; 1606 } 1607 1608 if (error == 0) 1609 (void) memmove(buf, path, buf + len - path); 1610 return (error); 1611} 1612