zfs_vfsops.c revision 209230
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#include <sys/types.h> 27#include <sys/param.h> 28#include <sys/systm.h> 29#include <sys/kernel.h> 30#include <sys/sysmacros.h> 31#include <sys/kmem.h> 32#include <sys/acl.h> 33#include <sys/vnode.h> 34#include <sys/vfs.h> 35#include <sys/mntent.h> 36#include <sys/mount.h> 37#include <sys/cmn_err.h> 38#include <sys/zfs_znode.h> 39#include <sys/zfs_dir.h> 40#include <sys/zil.h> 41#include <sys/fs/zfs.h> 42#include <sys/dmu.h> 43#include <sys/dsl_prop.h> 44#include <sys/dsl_dataset.h> 45#include <sys/dsl_deleg.h> 46#include <sys/spa.h> 47#include <sys/zap.h> 48#include <sys/varargs.h> 49#include <sys/policy.h> 50#include <sys/atomic.h> 51#include <sys/zfs_ioctl.h> 52#include <sys/zfs_ctldir.h> 53#include <sys/zfs_fuid.h> 54#include <sys/sunddi.h> 55#include <sys/dnlc.h> 56#include <sys/dmu_objset.h> 57#include <sys/spa_boot.h> 58#include <sys/vdev_impl.h> /* VDEV_BOOT_VERSION */ 59 60struct mtx zfs_debug_mtx; 61MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 62 63SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 64 65int zfs_super_owner = 0; 66SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 67 "File system owner can perform privileged operation on his file systems"); 68 69int zfs_debug_level = 0; 70TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); 71SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, 72 "Debug level"); 73 74SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 75static int zfs_version_acl = ZFS_ACL_VERSION; 76SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 77 "ZFS_ACL_VERSION"); 78static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION; 79SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD, 80 &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION"); 81static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION; 82SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD, 83 &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION"); 84static int zfs_version_spa = SPA_VERSION; 85SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 86 "SPA_VERSION"); 87static int zfs_version_vdev_boot = VDEV_BOOT_VERSION; 88SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD, 89 &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION"); 90static int zfs_version_zpl = ZPL_VERSION; 91SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 92 "ZPL_VERSION"); 93 94static int zfs_mount(vfs_t *vfsp); 95static int zfs_umount(vfs_t *vfsp, int fflag); 96static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 97static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 98static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 99static int zfs_sync(vfs_t *vfsp, int waitfor); 100static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 101 struct ucred **credanonp, int *numsecflavors, int **secflavors); 102static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp); 103static void zfs_objset_close(zfsvfs_t *zfsvfs); 104static void zfs_freevfs(vfs_t *vfsp); 105 106static struct vfsops zfs_vfsops = { 107 .vfs_mount = zfs_mount, 108 .vfs_unmount = zfs_umount, 109 .vfs_root = zfs_root, 110 .vfs_statfs = zfs_statfs, 111 .vfs_vget = zfs_vget, 112 .vfs_sync = zfs_sync, 113 .vfs_checkexp = zfs_checkexp, 114 .vfs_fhtovp = zfs_fhtovp, 115}; 116 117VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); 118 119/* 120 * We need to keep a count of active fs's. 121 * This is necessary to prevent our module 122 * from being unloaded after a umount -f 123 */ 124static uint32_t zfs_active_fs_count = 0; 125 126/*ARGSUSED*/ 127static int 128zfs_sync(vfs_t *vfsp, int waitfor) 129{ 130 131 /* 132 * Data integrity is job one. We don't want a compromised kernel 133 * writing to the storage pool, so we never sync during panic. 134 */ 135 if (panicstr) 136 return (0); 137 138 if (vfsp != NULL) { 139 /* 140 * Sync a specific filesystem. 141 */ 142 zfsvfs_t *zfsvfs = vfsp->vfs_data; 143 int error; 144 145 error = vfs_stdsync(vfsp, waitfor); 146 if (error != 0) 147 return (error); 148 149 ZFS_ENTER(zfsvfs); 150 if (zfsvfs->z_log != NULL) 151 zil_commit(zfsvfs->z_log, UINT64_MAX, 0); 152 else 153 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 154 ZFS_EXIT(zfsvfs); 155 } else { 156 /* 157 * Sync all ZFS filesystems. This is what happens when you 158 * run sync(1M). Unlike other filesystems, ZFS honors the 159 * request by waiting for all pools to commit all dirty data. 160 */ 161 spa_sync_allpools(); 162 } 163 164 return (0); 165} 166 167static void 168atime_changed_cb(void *arg, uint64_t newval) 169{ 170 zfsvfs_t *zfsvfs = arg; 171 172 if (newval == TRUE) { 173 zfsvfs->z_atime = TRUE; 174 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; 175 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 176 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 177 } else { 178 zfsvfs->z_atime = FALSE; 179 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; 180 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 181 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 182 } 183} 184 185static void 186xattr_changed_cb(void *arg, uint64_t newval) 187{ 188 zfsvfs_t *zfsvfs = arg; 189 190 if (newval == TRUE) { 191 /* XXX locking on vfs_flag? */ 192#ifdef TODO 193 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 194#endif 195 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 196 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 197 } else { 198 /* XXX locking on vfs_flag? */ 199#ifdef TODO 200 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 201#endif 202 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 203 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 204 } 205} 206 207static void 208blksz_changed_cb(void *arg, uint64_t newval) 209{ 210 zfsvfs_t *zfsvfs = arg; 211 212 if (newval < SPA_MINBLOCKSIZE || 213 newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) 214 newval = SPA_MAXBLOCKSIZE; 215 216 zfsvfs->z_max_blksz = newval; 217 zfsvfs->z_vfs->mnt_stat.f_iosize = newval; 218} 219 220static void 221readonly_changed_cb(void *arg, uint64_t newval) 222{ 223 zfsvfs_t *zfsvfs = arg; 224 225 if (newval) { 226 /* XXX locking on vfs_flag? */ 227 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 228 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 229 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 230 } else { 231 /* XXX locking on vfs_flag? */ 232 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 233 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 234 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 235 } 236} 237 238static void 239setuid_changed_cb(void *arg, uint64_t newval) 240{ 241 zfsvfs_t *zfsvfs = arg; 242 243 if (newval == FALSE) { 244 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 245 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 246 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 247 } else { 248 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 249 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 250 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 251 } 252} 253 254static void 255exec_changed_cb(void *arg, uint64_t newval) 256{ 257 zfsvfs_t *zfsvfs = arg; 258 259 if (newval == FALSE) { 260 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 261 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 262 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 263 } else { 264 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 265 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 266 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 267 } 268} 269 270/* 271 * The nbmand mount option can be changed at mount time. 272 * We can't allow it to be toggled on live file systems or incorrect 273 * behavior may be seen from cifs clients 274 * 275 * This property isn't registered via dsl_prop_register(), but this callback 276 * will be called when a file system is first mounted 277 */ 278static void 279nbmand_changed_cb(void *arg, uint64_t newval) 280{ 281 zfsvfs_t *zfsvfs = arg; 282 if (newval == FALSE) { 283 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 284 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 285 } else { 286 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 287 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 288 } 289} 290 291static void 292snapdir_changed_cb(void *arg, uint64_t newval) 293{ 294 zfsvfs_t *zfsvfs = arg; 295 296 zfsvfs->z_show_ctldir = newval; 297} 298 299static void 300vscan_changed_cb(void *arg, uint64_t newval) 301{ 302 zfsvfs_t *zfsvfs = arg; 303 304 zfsvfs->z_vscan = newval; 305} 306 307static void 308acl_mode_changed_cb(void *arg, uint64_t newval) 309{ 310 zfsvfs_t *zfsvfs = arg; 311 312 zfsvfs->z_acl_mode = newval; 313} 314 315static void 316acl_inherit_changed_cb(void *arg, uint64_t newval) 317{ 318 zfsvfs_t *zfsvfs = arg; 319 320 zfsvfs->z_acl_inherit = newval; 321} 322 323static int 324zfs_register_callbacks(vfs_t *vfsp) 325{ 326 struct dsl_dataset *ds = NULL; 327 objset_t *os = NULL; 328 zfsvfs_t *zfsvfs = NULL; 329 uint64_t nbmand; 330 int readonly, do_readonly = FALSE; 331 int setuid, do_setuid = FALSE; 332 int exec, do_exec = FALSE; 333 int xattr, do_xattr = FALSE; 334 int atime, do_atime = FALSE; 335 int error = 0; 336 337 ASSERT(vfsp); 338 zfsvfs = vfsp->vfs_data; 339 ASSERT(zfsvfs); 340 os = zfsvfs->z_os; 341 342 /* 343 * This function can be called for a snapshot when we update snapshot's 344 * mount point, which isn't really supported. 345 */ 346 if (dmu_objset_is_snapshot(os)) 347 return (EOPNOTSUPP); 348 349 /* 350 * The act of registering our callbacks will destroy any mount 351 * options we may have. In order to enable temporary overrides 352 * of mount options, we stash away the current values and 353 * restore them after we register the callbacks. 354 */ 355 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 356 readonly = B_TRUE; 357 do_readonly = B_TRUE; 358 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 359 readonly = B_FALSE; 360 do_readonly = B_TRUE; 361 } 362 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 363 setuid = B_FALSE; 364 do_setuid = B_TRUE; 365 } else { 366 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 367 setuid = B_FALSE; 368 do_setuid = B_TRUE; 369 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 370 setuid = B_TRUE; 371 do_setuid = B_TRUE; 372 } 373 } 374 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 375 exec = B_FALSE; 376 do_exec = B_TRUE; 377 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 378 exec = B_TRUE; 379 do_exec = B_TRUE; 380 } 381 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 382 xattr = B_FALSE; 383 do_xattr = B_TRUE; 384 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 385 xattr = B_TRUE; 386 do_xattr = B_TRUE; 387 } 388 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 389 atime = B_FALSE; 390 do_atime = B_TRUE; 391 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 392 atime = B_TRUE; 393 do_atime = B_TRUE; 394 } 395 396 /* 397 * nbmand is a special property. It can only be changed at 398 * mount time. 399 * 400 * This is weird, but it is documented to only be changeable 401 * at mount time. 402 */ 403 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 404 nbmand = B_FALSE; 405 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 406 nbmand = B_TRUE; 407 } else { 408 char osname[MAXNAMELEN]; 409 410 dmu_objset_name(os, osname); 411 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 412 NULL)) { 413 return (error); 414 } 415 } 416 417 /* 418 * Register property callbacks. 419 * 420 * It would probably be fine to just check for i/o error from 421 * the first prop_register(), but I guess I like to go 422 * overboard... 423 */ 424 ds = dmu_objset_ds(os); 425 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); 426 error = error ? error : dsl_prop_register(ds, 427 "xattr", xattr_changed_cb, zfsvfs); 428 error = error ? error : dsl_prop_register(ds, 429 "recordsize", blksz_changed_cb, zfsvfs); 430 error = error ? error : dsl_prop_register(ds, 431 "readonly", readonly_changed_cb, zfsvfs); 432 error = error ? error : dsl_prop_register(ds, 433 "setuid", setuid_changed_cb, zfsvfs); 434 error = error ? error : dsl_prop_register(ds, 435 "exec", exec_changed_cb, zfsvfs); 436 error = error ? error : dsl_prop_register(ds, 437 "snapdir", snapdir_changed_cb, zfsvfs); 438 error = error ? error : dsl_prop_register(ds, 439 "aclmode", acl_mode_changed_cb, zfsvfs); 440 error = error ? error : dsl_prop_register(ds, 441 "aclinherit", acl_inherit_changed_cb, zfsvfs); 442 error = error ? error : dsl_prop_register(ds, 443 "vscan", vscan_changed_cb, zfsvfs); 444 if (error) 445 goto unregister; 446 447 /* 448 * Invoke our callbacks to restore temporary mount options. 449 */ 450 if (do_readonly) 451 readonly_changed_cb(zfsvfs, readonly); 452 if (do_setuid) 453 setuid_changed_cb(zfsvfs, setuid); 454 if (do_exec) 455 exec_changed_cb(zfsvfs, exec); 456 if (do_xattr) 457 xattr_changed_cb(zfsvfs, xattr); 458 if (do_atime) 459 atime_changed_cb(zfsvfs, atime); 460 461 nbmand_changed_cb(zfsvfs, nbmand); 462 463 return (0); 464 465unregister: 466 /* 467 * We may attempt to unregister some callbacks that are not 468 * registered, but this is OK; it will simply return ENOMSG, 469 * which we will ignore. 470 */ 471 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); 472 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); 473 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); 474 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); 475 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); 476 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); 477 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); 478 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); 479 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, 480 zfsvfs); 481 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); 482 return (error); 483 484} 485 486static int 487zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 488{ 489 int error; 490 491 error = zfs_register_callbacks(zfsvfs->z_vfs); 492 if (error) 493 return (error); 494 495 /* 496 * Set the objset user_ptr to track its zfsvfs. 497 */ 498 mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); 499 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 500 mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); 501 502 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 503 if (zil_disable) { 504 zil_destroy(zfsvfs->z_log, B_FALSE); 505 zfsvfs->z_log = NULL; 506 } 507 508 /* 509 * If we are not mounting (ie: online recv), then we don't 510 * have to worry about replaying the log as we blocked all 511 * operations out since we closed the ZIL. 512 */ 513 if (mounting) { 514 boolean_t readonly; 515 516 /* 517 * During replay we remove the read only flag to 518 * allow replays to succeed. 519 */ 520 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 521 if (readonly != 0) 522 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 523 else 524 zfs_unlinked_drain(zfsvfs); 525 526 if (zfsvfs->z_log) { 527 /* 528 * Parse and replay the intent log. 529 * 530 * Because of ziltest, this must be done after 531 * zfs_unlinked_drain(). (Further note: ziltest 532 * doesn't use readonly mounts, where 533 * zfs_unlinked_drain() isn't called.) This is because 534 * ziltest causes spa_sync() to think it's committed, 535 * but actually it is not, so the intent log contains 536 * many txg's worth of changes. 537 * 538 * In particular, if object N is in the unlinked set in 539 * the last txg to actually sync, then it could be 540 * actually freed in a later txg and then reallocated 541 * in a yet later txg. This would write a "create 542 * object N" record to the intent log. Normally, this 543 * would be fine because the spa_sync() would have 544 * written out the fact that object N is free, before 545 * we could write the "create object N" intent log 546 * record. 547 * 548 * But when we are in ziltest mode, we advance the "open 549 * txg" without actually spa_sync()-ing the changes to 550 * disk. So we would see that object N is still 551 * allocated and in the unlinked set, and there is an 552 * intent log record saying to allocate it. 553 */ 554 zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, 555 zfs_replay_vector, zfs_unlinked_drain); 556 } 557 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ 558 } 559 560 return (0); 561} 562 563static void 564zfs_freezfsvfs(zfsvfs_t *zfsvfs) 565{ 566 mutex_destroy(&zfsvfs->z_znodes_lock); 567 mutex_destroy(&zfsvfs->z_online_recv_lock); 568 list_destroy(&zfsvfs->z_all_znodes); 569 rrw_destroy(&zfsvfs->z_teardown_lock); 570 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 571 rw_destroy(&zfsvfs->z_fuid_lock); 572 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 573} 574 575static int 576zfs_domount(vfs_t *vfsp, char *osname) 577{ 578 uint64_t recordsize, readonly; 579 int error = 0; 580 int mode; 581 zfsvfs_t *zfsvfs; 582 znode_t *zp = NULL; 583 584 ASSERT(vfsp); 585 ASSERT(osname); 586 587 /* 588 * Initialize the zfs-specific filesystem structure. 589 * Should probably make this a kmem cache, shuffle fields, 590 * and just bzero up to z_hold_mtx[]. 591 */ 592 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 593 zfsvfs->z_vfs = vfsp; 594 zfsvfs->z_parent = zfsvfs; 595 zfsvfs->z_assign = TXG_NOWAIT; 596 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; 597 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 598 599 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 600 mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); 601 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 602 offsetof(znode_t, z_link_node)); 603 rrw_init(&zfsvfs->z_teardown_lock); 604 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 605 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 606 607 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 608 NULL)) 609 goto out; 610 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; 611 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; 612 613 vfsp->vfs_data = zfsvfs; 614 vfsp->mnt_flag |= MNT_LOCAL; 615 vfsp->mnt_kern_flag |= MNTK_MPSAFE; 616 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; 617 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; 618 619 if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) 620 goto out; 621 622 mode = DS_MODE_OWNER; 623 if (readonly) 624 mode |= DS_MODE_READONLY; 625 626 error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); 627 if (error == EROFS) { 628 mode = DS_MODE_OWNER | DS_MODE_READONLY; 629 error = dmu_objset_open(osname, DMU_OST_ZFS, mode, 630 &zfsvfs->z_os); 631 } 632 633 if (error) 634 goto out; 635 636 if (error = zfs_init_fs(zfsvfs, &zp)) 637 goto out; 638 639 /* 640 * Set features for file system. 641 */ 642 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 643 if (zfsvfs->z_use_fuids) { 644 vfs_set_feature(vfsp, VFSFT_XVATTR); 645 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); 646 vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); 647 vfs_set_feature(vfsp, VFSFT_ACLONCREATE); 648 } 649 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 650 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 651 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 652 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 653 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 654 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 655 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 656 } 657 658 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 659 uint64_t pval; 660 661 ASSERT(mode & DS_MODE_READONLY); 662 atime_changed_cb(zfsvfs, B_FALSE); 663 readonly_changed_cb(zfsvfs, B_TRUE); 664 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 665 goto out; 666 xattr_changed_cb(zfsvfs, pval); 667 zfsvfs->z_issnap = B_TRUE; 668 } else { 669 error = zfsvfs_setup(zfsvfs, B_TRUE); 670 } 671 672 vfs_mountedfrom(vfsp, osname); 673 674 if (!zfsvfs->z_issnap) 675 zfsctl_create(zfsvfs); 676out: 677 if (error) { 678 if (zfsvfs->z_os) 679 dmu_objset_close(zfsvfs->z_os); 680 zfs_freezfsvfs(zfsvfs); 681 } else { 682 atomic_add_32(&zfs_active_fs_count, 1); 683 } 684 685 return (error); 686} 687 688void 689zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 690{ 691 objset_t *os = zfsvfs->z_os; 692 struct dsl_dataset *ds; 693 694 /* 695 * Unregister properties. 696 */ 697 if (!dmu_objset_is_snapshot(os)) { 698 ds = dmu_objset_ds(os); 699 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, 700 zfsvfs) == 0); 701 702 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, 703 zfsvfs) == 0); 704 705 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, 706 zfsvfs) == 0); 707 708 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, 709 zfsvfs) == 0); 710 711 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, 712 zfsvfs) == 0); 713 714 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, 715 zfsvfs) == 0); 716 717 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, 718 zfsvfs) == 0); 719 720 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, 721 zfsvfs) == 0); 722 723 VERIFY(dsl_prop_unregister(ds, "aclinherit", 724 acl_inherit_changed_cb, zfsvfs) == 0); 725 726 VERIFY(dsl_prop_unregister(ds, "vscan", 727 vscan_changed_cb, zfsvfs) == 0); 728 } 729} 730 731/*ARGSUSED*/ 732static int 733zfs_mount(vfs_t *vfsp) 734{ 735 kthread_t *td = curthread; 736 vnode_t *mvp = vfsp->mnt_vnodecovered; 737 cred_t *cr = td->td_ucred; 738 char *osname; 739 int error = 0; 740 int canwrite; 741 742 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) 743 return (EINVAL); 744 745 /* 746 * If full-owner-access is enabled and delegated administration is 747 * turned on, we must set nosuid. 748 */ 749 if (zfs_super_owner && 750 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { 751 secpolicy_fs_mount_clearopts(cr, vfsp); 752 } 753 754 /* 755 * Check for mount privilege? 756 * 757 * If we don't have privilege then see if 758 * we have local permission to allow it 759 */ 760 error = secpolicy_fs_mount(cr, mvp, vfsp); 761 if (error) { 762 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); 763 if (error != 0) 764 goto out; 765 766 if (!(vfsp->vfs_flag & MS_REMOUNT)) { 767 vattr_t vattr; 768 769 /* 770 * Make sure user is the owner of the mount point 771 * or has sufficient privileges. 772 */ 773 774 vattr.va_mask = AT_UID; 775 776 vn_lock(mvp, LK_SHARED | LK_RETRY); 777 if (error = VOP_GETATTR(mvp, &vattr, cr)) { 778 VOP_UNLOCK(mvp, 0); 779 goto out; 780 } 781 782#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */ 783 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 784 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { 785 error = EPERM; 786 goto out; 787 } 788#else 789 if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) { 790 VOP_UNLOCK(mvp, 0); 791 goto out; 792 } 793 794 if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) { 795 VOP_UNLOCK(mvp, 0); 796 goto out; 797 } 798 VOP_UNLOCK(mvp, 0); 799#endif 800 } 801 802 secpolicy_fs_mount_clearopts(cr, vfsp); 803 } 804 805 /* 806 * Refuse to mount a filesystem if we are in a local zone and the 807 * dataset is not visible. 808 */ 809 if (!INGLOBALZONE(curthread) && 810 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 811 error = EPERM; 812 goto out; 813 } 814 815 /* 816 * When doing a remount, we simply refresh our temporary properties 817 * according to those options set in the current VFS options. 818 */ 819 if (vfsp->vfs_flag & MS_REMOUNT) { 820 /* refresh mount options */ 821 zfs_unregister_callbacks(vfsp->vfs_data); 822 error = zfs_register_callbacks(vfsp); 823 goto out; 824 } 825 826 DROP_GIANT(); 827 error = zfs_domount(vfsp, osname); 828 PICKUP_GIANT(); 829out: 830 return (error); 831} 832 833static int 834zfs_statfs(vfs_t *vfsp, struct statfs *statp) 835{ 836 zfsvfs_t *zfsvfs = vfsp->vfs_data; 837 uint64_t refdbytes, availbytes, usedobjs, availobjs; 838 839 statp->f_version = STATFS_VERSION; 840 841 ZFS_ENTER(zfsvfs); 842 843 dmu_objset_space(zfsvfs->z_os, 844 &refdbytes, &availbytes, &usedobjs, &availobjs); 845 846 /* 847 * The underlying storage pool actually uses multiple block sizes. 848 * We report the fragsize as the smallest block size we support, 849 * and we report our blocksize as the filesystem's maximum blocksize. 850 */ 851 statp->f_bsize = SPA_MINBLOCKSIZE; 852 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; 853 854 /* 855 * The following report "total" blocks of various kinds in the 856 * file system, but reported in terms of f_frsize - the 857 * "fragment" size. 858 */ 859 860 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 861 statp->f_bfree = availbytes / statp->f_bsize; 862 statp->f_bavail = statp->f_bfree; /* no root reservation */ 863 864 /* 865 * statvfs() should really be called statufs(), because it assumes 866 * static metadata. ZFS doesn't preallocate files, so the best 867 * we can do is report the max that could possibly fit in f_files, 868 * and that minus the number actually used in f_ffree. 869 * For f_ffree, report the smaller of the number of object available 870 * and the number of blocks (each object will take at least a block). 871 */ 872 statp->f_ffree = MIN(availobjs, statp->f_bfree); 873 statp->f_files = statp->f_ffree + usedobjs; 874 875 /* 876 * We're a zfs filesystem. 877 */ 878 (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); 879 880 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 881 sizeof(statp->f_mntfromname)); 882 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 883 sizeof(statp->f_mntonname)); 884 885 statp->f_namemax = ZFS_MAXNAMELEN; 886 887 ZFS_EXIT(zfsvfs); 888 return (0); 889} 890 891static int 892zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) 893{ 894 zfsvfs_t *zfsvfs = vfsp->vfs_data; 895 znode_t *rootzp; 896 int error; 897 898 ZFS_ENTER_NOERROR(zfsvfs); 899 900 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 901 902 ZFS_EXIT(zfsvfs); 903 904 if (error == 0) { 905 *vpp = ZTOV(rootzp); 906 error = vn_lock(*vpp, flags); 907 (*vpp)->v_vflag |= VV_ROOT; 908 } 909 910 return (error); 911} 912 913/* 914 * Teardown the zfsvfs::z_os. 915 * 916 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' 917 * and 'z_teardown_inactive_lock' held. 918 */ 919static int 920zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 921{ 922 znode_t *zp; 923 924 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 925 926 if (!unmounting) { 927 /* 928 * We purge the parent filesystem's vfsp as the parent 929 * filesystem and all of its snapshots have their vnode's 930 * v_vfsp set to the parent's filesystem's vfsp. Note, 931 * 'z_parent' is self referential for non-snapshots. 932 */ 933 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 934#ifdef FREEBSD_NAMECACHE 935 cache_purgevfs(zfsvfs->z_parent->z_vfs); 936#endif 937 } 938 939 /* 940 * Close the zil. NB: Can't close the zil while zfs_inactive 941 * threads are blocked as zil_close can call zfs_inactive. 942 */ 943 if (zfsvfs->z_log) { 944 zil_close(zfsvfs->z_log); 945 zfsvfs->z_log = NULL; 946 } 947 948 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 949 950 /* 951 * If we are not unmounting (ie: online recv) and someone already 952 * unmounted this file system while we were doing the switcheroo, 953 * or a reopen of z_os failed then just bail out now. 954 */ 955 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 956 rw_exit(&zfsvfs->z_teardown_inactive_lock); 957 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 958 return (EIO); 959 } 960 961 /* 962 * At this point there are no vops active, and any new vops will 963 * fail with EIO since we have z_teardown_lock for writer (only 964 * relavent for forced unmount). 965 * 966 * Release all holds on dbufs. 967 */ 968 mutex_enter(&zfsvfs->z_znodes_lock); 969 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 970 zp = list_next(&zfsvfs->z_all_znodes, zp)) 971 if (zp->z_dbuf) { 972 ASSERT(ZTOV(zp)->v_count >= 0); 973 zfs_znode_dmu_fini(zp); 974 } 975 mutex_exit(&zfsvfs->z_znodes_lock); 976 977 /* 978 * If we are unmounting, set the unmounted flag and let new vops 979 * unblock. zfs_inactive will have the unmounted behavior, and all 980 * other vops will fail with EIO. 981 */ 982 if (unmounting) { 983 zfsvfs->z_unmounted = B_TRUE; 984 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 985 rw_exit(&zfsvfs->z_teardown_inactive_lock); 986 987#ifdef __FreeBSD__ 988 /* 989 * Some znodes might not be fully reclaimed, wait for them. 990 */ 991 mutex_enter(&zfsvfs->z_znodes_lock); 992 while (list_head(&zfsvfs->z_all_znodes) != NULL) { 993 msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0, 994 "zteardown", 0); 995 } 996 mutex_exit(&zfsvfs->z_znodes_lock); 997#endif 998 } 999 1000 /* 1001 * z_os will be NULL if there was an error in attempting to reopen 1002 * zfsvfs, so just return as the properties had already been 1003 * unregistered and cached data had been evicted before. 1004 */ 1005 if (zfsvfs->z_os == NULL) 1006 return (0); 1007 1008 /* 1009 * Unregister properties. 1010 */ 1011 zfs_unregister_callbacks(zfsvfs); 1012 1013 /* 1014 * Evict cached data 1015 */ 1016 if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { 1017 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1018 (void) dmu_objset_evict_dbufs(zfsvfs->z_os); 1019 } 1020 1021 return (0); 1022} 1023 1024/*ARGSUSED*/ 1025static int 1026zfs_umount(vfs_t *vfsp, int fflag) 1027{ 1028 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1029 objset_t *os; 1030 cred_t *cr = curthread->td_ucred; 1031 int ret; 1032 1033 ret = secpolicy_fs_unmount(cr, vfsp); 1034 if (ret) { 1035 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 1036 ZFS_DELEG_PERM_MOUNT, cr); 1037 if (ret) 1038 return (ret); 1039 } 1040 /* 1041 * We purge the parent filesystem's vfsp as the parent filesystem 1042 * and all of its snapshots have their vnode's v_vfsp set to the 1043 * parent's filesystem's vfsp. Note, 'z_parent' is self 1044 * referential for non-snapshots. 1045 */ 1046 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1047 1048 /* 1049 * Unmount any snapshots mounted under .zfs before unmounting the 1050 * dataset itself. 1051 */ 1052 if (zfsvfs->z_ctldir != NULL) { 1053 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) 1054 return (ret); 1055 ret = vflush(vfsp, 0, 0, curthread); 1056 ASSERT(ret == EBUSY); 1057 if (!(fflag & MS_FORCE)) { 1058 if (zfsvfs->z_ctldir->v_count > 1) 1059 return (EBUSY); 1060 ASSERT(zfsvfs->z_ctldir->v_count == 1); 1061 } 1062 zfsctl_destroy(zfsvfs); 1063 ASSERT(zfsvfs->z_ctldir == NULL); 1064 } 1065 1066 if (fflag & MS_FORCE) { 1067 /* 1068 * Mark file system as unmounted before calling 1069 * vflush(FORCECLOSE). This way we ensure no future vnops 1070 * will be called and risk operating on DOOMED vnodes. 1071 */ 1072 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 1073 zfsvfs->z_unmounted = B_TRUE; 1074 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1075 } 1076 1077 /* 1078 * Flush all the files. 1079 */ 1080 ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, curthread); 1081 if (ret != 0) { 1082 if (!zfsvfs->z_issnap) { 1083 zfsctl_create(zfsvfs); 1084 ASSERT(zfsvfs->z_ctldir != NULL); 1085 } 1086 return (ret); 1087 } 1088 1089 if (!(fflag & MS_FORCE)) { 1090 /* 1091 * Check the number of active vnodes in the file system. 1092 * Our count is maintained in the vfs structure, but the 1093 * number is off by 1 to indicate a hold on the vfs 1094 * structure itself. 1095 * 1096 * The '.zfs' directory maintains a reference of its 1097 * own, and any active references underneath are 1098 * reflected in the vnode count. 1099 */ 1100 if (zfsvfs->z_ctldir == NULL) { 1101 if (vfsp->vfs_count > 1) 1102 return (EBUSY); 1103 } else { 1104 if (vfsp->vfs_count > 2 || 1105 zfsvfs->z_ctldir->v_count > 1) 1106 return (EBUSY); 1107 } 1108 } else { 1109 MNT_ILOCK(vfsp); 1110 vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; 1111 MNT_IUNLOCK(vfsp); 1112 } 1113 1114 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 1115 os = zfsvfs->z_os; 1116 1117 /* 1118 * z_os will be NULL if there was an error in 1119 * attempting to reopen zfsvfs. 1120 */ 1121 if (os != NULL) { 1122 /* 1123 * Unset the objset user_ptr. 1124 */ 1125 mutex_enter(&os->os->os_user_ptr_lock); 1126 dmu_objset_set_user(os, NULL); 1127 mutex_exit(&os->os->os_user_ptr_lock); 1128 1129 /* 1130 * Finally release the objset 1131 */ 1132 dmu_objset_close(os); 1133 } 1134 1135 /* 1136 * We can now safely destroy the '.zfs' directory node. 1137 */ 1138 if (zfsvfs->z_ctldir != NULL) 1139 zfsctl_destroy(zfsvfs); 1140 if (zfsvfs->z_issnap) { 1141 vnode_t *svp = vfsp->mnt_vnodecovered; 1142 1143 if (svp->v_count >= 2) 1144 VN_RELE(svp); 1145 } 1146 zfs_freevfs(vfsp); 1147 1148 return (0); 1149} 1150 1151static int 1152zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) 1153{ 1154 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1155 znode_t *zp; 1156 int err; 1157 1158 /* 1159 * XXXPJD: zfs_zget() can't operate on virtual entires like .zfs/ or 1160 * .zfs/snapshot/ directories, so for now just return EOPNOTSUPP. 1161 * This will make NFS to fall back to using READDIR instead of 1162 * READDIRPLUS. 1163 * Also snapshots are stored in AVL tree, but based on their names, 1164 * not inode numbers, so it will be very inefficient to iterate 1165 * over all snapshots to find the right one. 1166 * Note that OpenSolaris READDIRPLUS implementation does LOOKUP on 1167 * d_name, and not VGET on d_fileno as we do. 1168 */ 1169 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR) 1170 return (EOPNOTSUPP); 1171 1172 ZFS_ENTER(zfsvfs); 1173 err = zfs_zget(zfsvfs, ino, &zp); 1174 if (err == 0 && zp->z_unlinked) { 1175 VN_RELE(ZTOV(zp)); 1176 err = EINVAL; 1177 } 1178 ZFS_EXIT(zfsvfs); 1179 if (err != 0) 1180 *vpp = NULL; 1181 else { 1182 *vpp = ZTOV(zp); 1183 vn_lock(*vpp, flags); 1184 } 1185 return (err); 1186} 1187 1188static int 1189zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 1190 struct ucred **credanonp, int *numsecflavors, int **secflavors) 1191{ 1192 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1193 1194 /* 1195 * If this is regular file system vfsp is the same as 1196 * zfsvfs->z_parent->z_vfs, but if it is snapshot, 1197 * zfsvfs->z_parent->z_vfs represents parent file system 1198 * which we have to use here, because only this file system 1199 * has mnt_export configured. 1200 */ 1201 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, 1202 credanonp, numsecflavors, secflavors)); 1203} 1204 1205CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); 1206CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); 1207 1208static int 1209zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) 1210{ 1211 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1212 znode_t *zp; 1213 uint64_t object = 0; 1214 uint64_t fid_gen = 0; 1215 uint64_t gen_mask; 1216 uint64_t zp_gen; 1217 int i, err; 1218 1219 *vpp = NULL; 1220 1221 ZFS_ENTER(zfsvfs); 1222 1223 /* 1224 * On FreeBSD we can get snapshot's mount point or its parent file 1225 * system mount point depending if snapshot is already mounted or not. 1226 */ 1227 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 1228 zfid_long_t *zlfid = (zfid_long_t *)fidp; 1229 uint64_t objsetid = 0; 1230 uint64_t setgen = 0; 1231 1232 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 1233 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 1234 1235 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 1236 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 1237 1238 ZFS_EXIT(zfsvfs); 1239 1240 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 1241 if (err) 1242 return (EINVAL); 1243 ZFS_ENTER(zfsvfs); 1244 } 1245 1246 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 1247 zfid_short_t *zfid = (zfid_short_t *)fidp; 1248 1249 for (i = 0; i < sizeof (zfid->zf_object); i++) 1250 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 1251 1252 for (i = 0; i < sizeof (zfid->zf_gen); i++) 1253 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 1254 } else { 1255 ZFS_EXIT(zfsvfs); 1256 return (EINVAL); 1257 } 1258 1259 /* A zero fid_gen means we are in the .zfs control directories */ 1260 if (fid_gen == 0 && 1261 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 1262 *vpp = zfsvfs->z_ctldir; 1263 ASSERT(*vpp != NULL); 1264 if (object == ZFSCTL_INO_SNAPDIR) { 1265 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 1266 0, NULL, NULL, NULL, NULL, NULL) == 0); 1267 } else { 1268 VN_HOLD(*vpp); 1269 } 1270 ZFS_EXIT(zfsvfs); 1271 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 1272 return (0); 1273 } 1274 1275 gen_mask = -1ULL >> (64 - 8 * i); 1276 1277 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 1278 if (err = zfs_zget(zfsvfs, object, &zp)) { 1279 ZFS_EXIT(zfsvfs); 1280 return (err); 1281 } 1282 zp_gen = zp->z_phys->zp_gen & gen_mask; 1283 if (zp_gen == 0) 1284 zp_gen = 1; 1285 if (zp->z_unlinked || zp_gen != fid_gen) { 1286 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 1287 VN_RELE(ZTOV(zp)); 1288 ZFS_EXIT(zfsvfs); 1289 return (EINVAL); 1290 } 1291 1292 ZFS_EXIT(zfsvfs); 1293 1294 *vpp = ZTOV(zp); 1295 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 1296 vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread); 1297 return (0); 1298} 1299 1300/* 1301 * Block out VOPs and close zfsvfs_t::z_os 1302 * 1303 * Note, if successful, then we return with the 'z_teardown_lock' and 1304 * 'z_teardown_inactive_lock' write held. 1305 */ 1306int 1307zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) 1308{ 1309 int error; 1310 1311 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 1312 return (error); 1313 1314 *mode = zfsvfs->z_os->os_mode; 1315 dmu_objset_name(zfsvfs->z_os, name); 1316 dmu_objset_close(zfsvfs->z_os); 1317 1318 return (0); 1319} 1320 1321/* 1322 * Reopen zfsvfs_t::z_os and release VOPs. 1323 */ 1324int 1325zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) 1326{ 1327 int err; 1328 1329 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); 1330 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 1331 1332 err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); 1333 if (err) { 1334 zfsvfs->z_os = NULL; 1335 } else { 1336 znode_t *zp; 1337 1338 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 1339 1340 /* 1341 * Attempt to re-establish all the active znodes with 1342 * their dbufs. If a zfs_rezget() fails, then we'll let 1343 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 1344 * when they try to use their znode. 1345 */ 1346 mutex_enter(&zfsvfs->z_znodes_lock); 1347 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 1348 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 1349 (void) zfs_rezget(zp); 1350 } 1351 mutex_exit(&zfsvfs->z_znodes_lock); 1352 1353 } 1354 1355 /* release the VOPs */ 1356 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1357 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1358 1359 if (err) { 1360 /* 1361 * Since we couldn't reopen zfsvfs::z_os, force 1362 * unmount this file system. 1363 */ 1364 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 1365 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); 1366 } 1367 return (err); 1368} 1369 1370static void 1371zfs_freevfs(vfs_t *vfsp) 1372{ 1373 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1374 int i; 1375 1376 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1377 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1378 1379 zfs_fuid_destroy(zfsvfs); 1380 zfs_freezfsvfs(zfsvfs); 1381 1382 atomic_add_32(&zfs_active_fs_count, -1); 1383} 1384 1385#ifdef __i386__ 1386static int desiredvnodes_backup; 1387#endif 1388 1389static void 1390zfs_vnodes_adjust(void) 1391{ 1392#ifdef __i386__ 1393 int newdesiredvnodes; 1394 1395 desiredvnodes_backup = desiredvnodes; 1396 1397 /* 1398 * We calculate newdesiredvnodes the same way it is done in 1399 * vntblinit(). If it is equal to desiredvnodes, it means that 1400 * it wasn't tuned by the administrator and we can tune it down. 1401 */ 1402 newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * 1403 vm_kmem_size / (5 * (sizeof(struct vm_object) + 1404 sizeof(struct vnode)))); 1405 if (newdesiredvnodes == desiredvnodes) 1406 desiredvnodes = (3 * newdesiredvnodes) / 4; 1407#endif 1408} 1409 1410static void 1411zfs_vnodes_adjust_back(void) 1412{ 1413 1414#ifdef __i386__ 1415 desiredvnodes = desiredvnodes_backup; 1416#endif 1417} 1418 1419void 1420zfs_init(void) 1421{ 1422 1423 printf("ZFS filesystem version " ZPL_VERSION_STRING "\n"); 1424 1425 /* 1426 * Initialize znode cache, vnode ops, etc... 1427 */ 1428 zfs_znode_init(); 1429 1430 /* 1431 * Initialize .zfs directory structures 1432 */ 1433 zfsctl_init(); 1434 1435 /* 1436 * Reduce number of vnode. Originally number of vnodes is calculated 1437 * with UFS inode in mind. We reduce it here, because it's too big for 1438 * ZFS/i386. 1439 */ 1440 zfs_vnodes_adjust(); 1441} 1442 1443void 1444zfs_fini(void) 1445{ 1446 zfsctl_fini(); 1447 zfs_znode_fini(); 1448 zfs_vnodes_adjust_back(); 1449} 1450 1451int 1452zfs_busy(void) 1453{ 1454 return (zfs_active_fs_count != 0); 1455} 1456 1457int 1458zfs_set_version(const char *name, uint64_t newvers) 1459{ 1460 int error; 1461 objset_t *os; 1462 dmu_tx_t *tx; 1463 uint64_t curvers; 1464 1465 /* 1466 * XXX for now, require that the filesystem be unmounted. Would 1467 * be nice to find the zfsvfs_t and just update that if 1468 * possible. 1469 */ 1470 1471 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 1472 return (EINVAL); 1473 1474 error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); 1475 if (error) 1476 return (error); 1477 1478 error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 1479 8, 1, &curvers); 1480 if (error) 1481 goto out; 1482 if (newvers < curvers) { 1483 error = EINVAL; 1484 goto out; 1485 } 1486 1487 tx = dmu_tx_create(os); 1488 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); 1489 error = dmu_tx_assign(tx, TXG_WAIT); 1490 if (error) { 1491 dmu_tx_abort(tx); 1492 goto out; 1493 } 1494 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, 1495 &newvers, tx); 1496 1497 spa_history_internal_log(LOG_DS_UPGRADE, 1498 dmu_objset_spa(os), tx, CRED(), 1499 "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, 1500 dmu_objset_id(os)); 1501 dmu_tx_commit(tx); 1502 1503out: 1504 dmu_objset_close(os); 1505 return (error); 1506} 1507/* 1508 * Read a property stored within the master node. 1509 */ 1510int 1511zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 1512{ 1513 const char *pname; 1514 int error = ENOENT; 1515 1516 /* 1517 * Look up the file system's value for the property. For the 1518 * version property, we look up a slightly different string. 1519 */ 1520 if (prop == ZFS_PROP_VERSION) 1521 pname = ZPL_VERSION_STR; 1522 else 1523 pname = zfs_prop_to_name(prop); 1524 1525 if (os != NULL) 1526 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 1527 1528 if (error == ENOENT) { 1529 /* No value set, use the default value */ 1530 switch (prop) { 1531 case ZFS_PROP_VERSION: 1532 *value = ZPL_VERSION; 1533 break; 1534 case ZFS_PROP_NORMALIZE: 1535 case ZFS_PROP_UTF8ONLY: 1536 *value = 0; 1537 break; 1538 case ZFS_PROP_CASE: 1539 *value = ZFS_CASE_SENSITIVE; 1540 break; 1541 default: 1542 return (error); 1543 } 1544 error = 0; 1545 } 1546 return (error); 1547} 1548