/* zfs_vfsops.c revision 225736 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25/* Portions Copyright 2010 Robert Milkowski */ 26 27#include <sys/types.h> 28#include <sys/param.h> 29#include <sys/systm.h> 30#include <sys/kernel.h> 31#include <sys/sysmacros.h> 32#include <sys/kmem.h> 33#include <sys/acl.h> 34#include <sys/vnode.h> 35#include <sys/vfs.h> 36#include <sys/mntent.h> 37#include <sys/mount.h> 38#include <sys/cmn_err.h> 39#include <sys/zfs_znode.h> 40#include <sys/zfs_dir.h> 41#include <sys/zil.h> 42#include <sys/fs/zfs.h> 43#include <sys/dmu.h> 44#include <sys/dsl_prop.h> 45#include <sys/dsl_dataset.h> 46#include <sys/dsl_deleg.h> 47#include <sys/spa.h> 48#include <sys/zap.h> 49#include <sys/sa.h> 50#include <sys/varargs.h> 51#include <sys/policy.h> 52#include <sys/atomic.h> 53#include <sys/zfs_ioctl.h> 54#include <sys/zfs_ctldir.h> 55#include <sys/zfs_fuid.h> 56#include <sys/sunddi.h> 57#include <sys/dnlc.h> 58#include <sys/dmu_objset.h> 59#include <sys/spa_boot.h> 60#include <sys/sa.h> 61#include "zfs_comutil.h" 62 63struct mtx zfs_debug_mtx; 64MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", 
MTX_DEF); 65 66SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 67 68int zfs_super_owner; 69SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 70 "File system owner can perform privileged operation on his file systems"); 71 72int zfs_debug_level; 73TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); 74SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, 75 "Debug level"); 76 77SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 78static int zfs_version_acl = ZFS_ACL_VERSION; 79SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 80 "ZFS_ACL_VERSION"); 81static int zfs_version_spa = SPA_VERSION; 82SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 83 "SPA_VERSION"); 84static int zfs_version_zpl = ZPL_VERSION; 85SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 86 "ZPL_VERSION"); 87 88static int zfs_mount(vfs_t *vfsp); 89static int zfs_umount(vfs_t *vfsp, int fflag); 90static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 91static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 92static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 93static int zfs_sync(vfs_t *vfsp, int waitfor); 94static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 95 struct ucred **credanonp, int *numsecflavors, int **secflavors); 96static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 97static void zfs_objset_close(zfsvfs_t *zfsvfs); 98static void zfs_freevfs(vfs_t *vfsp); 99 100static struct vfsops zfs_vfsops = { 101 .vfs_mount = zfs_mount, 102 .vfs_unmount = zfs_umount, 103 .vfs_root = zfs_root, 104 .vfs_statfs = zfs_statfs, 105 .vfs_vget = zfs_vget, 106 .vfs_sync = zfs_sync, 107 .vfs_checkexp = zfs_checkexp, 108 .vfs_fhtovp = zfs_fhtovp, 109}; 110 111VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); 112 113/* 114 * We need to keep a count of active fs's. 
115 * This is necessary to prevent our module 116 * from being unloaded after a umount -f 117 */ 118static uint32_t zfs_active_fs_count = 0; 119 120/*ARGSUSED*/ 121static int 122zfs_sync(vfs_t *vfsp, int waitfor) 123{ 124 125 /* 126 * Data integrity is job one. We don't want a compromised kernel 127 * writing to the storage pool, so we never sync during panic. 128 */ 129 if (panicstr) 130 return (0); 131 132 if (vfsp != NULL) { 133 /* 134 * Sync a specific filesystem. 135 */ 136 zfsvfs_t *zfsvfs = vfsp->vfs_data; 137 dsl_pool_t *dp; 138 int error; 139 140 error = vfs_stdsync(vfsp, waitfor); 141 if (error != 0) 142 return (error); 143 144 ZFS_ENTER(zfsvfs); 145 dp = dmu_objset_pool(zfsvfs->z_os); 146 147 /* 148 * If the system is shutting down, then skip any 149 * filesystems which may exist on a suspended pool. 150 */ 151 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 152 ZFS_EXIT(zfsvfs); 153 return (0); 154 } 155 156 if (zfsvfs->z_log != NULL) 157 zil_commit(zfsvfs->z_log, 0); 158 159 ZFS_EXIT(zfsvfs); 160 } else { 161 /* 162 * Sync all ZFS filesystems. This is what happens when you 163 * run sync(1M). Unlike other filesystems, ZFS honors the 164 * request by waiting for all pools to commit all dirty data. 165 */ 166 spa_sync_allpools(); 167 } 168 169 return (0); 170} 171 172#ifndef __FreeBSD__ 173static int 174zfs_create_unique_device(dev_t *dev) 175{ 176 major_t new_major; 177 178 do { 179 ASSERT3U(zfs_minor, <=, MAXMIN32); 180 minor_t start = zfs_minor; 181 do { 182 mutex_enter(&zfs_dev_mtx); 183 if (zfs_minor >= MAXMIN32) { 184 /* 185 * If we're still using the real major 186 * keep out of /dev/zfs and /dev/zvol minor 187 * number space. If we're using a getudev()'ed 188 * major number, we can use all of its minors. 
189 */ 190 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 191 zfs_minor = ZFS_MIN_MINOR; 192 else 193 zfs_minor = 0; 194 } else { 195 zfs_minor++; 196 } 197 *dev = makedevice(zfs_major, zfs_minor); 198 mutex_exit(&zfs_dev_mtx); 199 } while (vfs_devismounted(*dev) && zfs_minor != start); 200 if (zfs_minor == start) { 201 /* 202 * We are using all ~262,000 minor numbers for the 203 * current major number. Create a new major number. 204 */ 205 if ((new_major = getudev()) == (major_t)-1) { 206 cmn_err(CE_WARN, 207 "zfs_mount: Can't get unique major " 208 "device number."); 209 return (-1); 210 } 211 mutex_enter(&zfs_dev_mtx); 212 zfs_major = new_major; 213 zfs_minor = 0; 214 215 mutex_exit(&zfs_dev_mtx); 216 } else { 217 break; 218 } 219 /* CONSTANTCONDITION */ 220 } while (1); 221 222 return (0); 223} 224#endif /* !__FreeBSD__ */ 225 226static void 227atime_changed_cb(void *arg, uint64_t newval) 228{ 229 zfsvfs_t *zfsvfs = arg; 230 231 if (newval == TRUE) { 232 zfsvfs->z_atime = TRUE; 233 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; 234 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 235 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 236 } else { 237 zfsvfs->z_atime = FALSE; 238 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; 239 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 240 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 241 } 242} 243 244static void 245xattr_changed_cb(void *arg, uint64_t newval) 246{ 247 zfsvfs_t *zfsvfs = arg; 248 249 if (newval == TRUE) { 250 /* XXX locking on vfs_flag? */ 251#ifdef TODO 252 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 253#endif 254 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 255 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 256 } else { 257 /* XXX locking on vfs_flag? 
*/ 258#ifdef TODO 259 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 260#endif 261 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 262 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 263 } 264} 265 266static void 267blksz_changed_cb(void *arg, uint64_t newval) 268{ 269 zfsvfs_t *zfsvfs = arg; 270 271 if (newval < SPA_MINBLOCKSIZE || 272 newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) 273 newval = SPA_MAXBLOCKSIZE; 274 275 zfsvfs->z_max_blksz = newval; 276 zfsvfs->z_vfs->mnt_stat.f_iosize = newval; 277} 278 279static void 280readonly_changed_cb(void *arg, uint64_t newval) 281{ 282 zfsvfs_t *zfsvfs = arg; 283 284 if (newval) { 285 /* XXX locking on vfs_flag? */ 286 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 287 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 288 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 289 } else { 290 /* XXX locking on vfs_flag? */ 291 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 292 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 293 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 294 } 295} 296 297static void 298setuid_changed_cb(void *arg, uint64_t newval) 299{ 300 zfsvfs_t *zfsvfs = arg; 301 302 if (newval == FALSE) { 303 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 304 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 305 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 306 } else { 307 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 308 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 309 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 310 } 311} 312 313static void 314exec_changed_cb(void *arg, uint64_t newval) 315{ 316 zfsvfs_t *zfsvfs = arg; 317 318 if (newval == FALSE) { 319 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 320 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 321 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 322 } else { 323 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 324 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 325 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 326 } 327} 328 329/* 330 * The nbmand mount option can be changed at mount 
time. 331 * We can't allow it to be toggled on live file systems or incorrect 332 * behavior may be seen from cifs clients 333 * 334 * This property isn't registered via dsl_prop_register(), but this callback 335 * will be called when a file system is first mounted 336 */ 337static void 338nbmand_changed_cb(void *arg, uint64_t newval) 339{ 340 zfsvfs_t *zfsvfs = arg; 341 if (newval == FALSE) { 342 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 343 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 344 } else { 345 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 346 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 347 } 348} 349 350static void 351snapdir_changed_cb(void *arg, uint64_t newval) 352{ 353 zfsvfs_t *zfsvfs = arg; 354 355 zfsvfs->z_show_ctldir = newval; 356} 357 358static void 359vscan_changed_cb(void *arg, uint64_t newval) 360{ 361 zfsvfs_t *zfsvfs = arg; 362 363 zfsvfs->z_vscan = newval; 364} 365 366static void 367acl_mode_changed_cb(void *arg, uint64_t newval) 368{ 369 zfsvfs_t *zfsvfs = arg; 370 371 zfsvfs->z_acl_mode = newval; 372} 373 374static void 375acl_inherit_changed_cb(void *arg, uint64_t newval) 376{ 377 zfsvfs_t *zfsvfs = arg; 378 379 zfsvfs->z_acl_inherit = newval; 380} 381 382static int 383zfs_register_callbacks(vfs_t *vfsp) 384{ 385 struct dsl_dataset *ds = NULL; 386 objset_t *os = NULL; 387 zfsvfs_t *zfsvfs = NULL; 388 uint64_t nbmand; 389 int readonly, do_readonly = B_FALSE; 390 int setuid, do_setuid = B_FALSE; 391 int exec, do_exec = B_FALSE; 392 int xattr, do_xattr = B_FALSE; 393 int atime, do_atime = B_FALSE; 394 int error = 0; 395 396 ASSERT(vfsp); 397 zfsvfs = vfsp->vfs_data; 398 ASSERT(zfsvfs); 399 os = zfsvfs->z_os; 400 401 /* 402 * This function can be called for a snapshot when we update snapshot's 403 * mount point, which isn't really supported. 404 */ 405 if (dmu_objset_is_snapshot(os)) 406 return (EOPNOTSUPP); 407 408 /* 409 * The act of registering our callbacks will destroy any mount 410 * options we may have. 
In order to enable temporary overrides 411 * of mount options, we stash away the current values and 412 * restore them after we register the callbacks. 413 */ 414 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 415 !spa_writeable(dmu_objset_spa(os))) { 416 readonly = B_TRUE; 417 do_readonly = B_TRUE; 418 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 419 readonly = B_FALSE; 420 do_readonly = B_TRUE; 421 } 422 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 423 setuid = B_FALSE; 424 do_setuid = B_TRUE; 425 } else { 426 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 427 setuid = B_FALSE; 428 do_setuid = B_TRUE; 429 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 430 setuid = B_TRUE; 431 do_setuid = B_TRUE; 432 } 433 } 434 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 435 exec = B_FALSE; 436 do_exec = B_TRUE; 437 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 438 exec = B_TRUE; 439 do_exec = B_TRUE; 440 } 441 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 442 xattr = B_FALSE; 443 do_xattr = B_TRUE; 444 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 445 xattr = B_TRUE; 446 do_xattr = B_TRUE; 447 } 448 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 449 atime = B_FALSE; 450 do_atime = B_TRUE; 451 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 452 atime = B_TRUE; 453 do_atime = B_TRUE; 454 } 455 456 /* 457 * nbmand is a special property. It can only be changed at 458 * mount time. 459 * 460 * This is weird, but it is documented to only be changeable 461 * at mount time. 462 */ 463 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 464 nbmand = B_FALSE; 465 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 466 nbmand = B_TRUE; 467 } else { 468 char osname[MAXNAMELEN]; 469 470 dmu_objset_name(os, osname); 471 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 472 NULL)) { 473 return (error); 474 } 475 } 476 477 /* 478 * Register property callbacks. 
479 * 480 * It would probably be fine to just check for i/o error from 481 * the first prop_register(), but I guess I like to go 482 * overboard... 483 */ 484 ds = dmu_objset_ds(os); 485 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); 486 error = error ? error : dsl_prop_register(ds, 487 "xattr", xattr_changed_cb, zfsvfs); 488 error = error ? error : dsl_prop_register(ds, 489 "recordsize", blksz_changed_cb, zfsvfs); 490 error = error ? error : dsl_prop_register(ds, 491 "readonly", readonly_changed_cb, zfsvfs); 492 error = error ? error : dsl_prop_register(ds, 493 "setuid", setuid_changed_cb, zfsvfs); 494 error = error ? error : dsl_prop_register(ds, 495 "exec", exec_changed_cb, zfsvfs); 496 error = error ? error : dsl_prop_register(ds, 497 "snapdir", snapdir_changed_cb, zfsvfs); 498 error = error ? error : dsl_prop_register(ds, 499 "aclmode", acl_mode_changed_cb, zfsvfs); 500 error = error ? error : dsl_prop_register(ds, 501 "aclinherit", acl_inherit_changed_cb, zfsvfs); 502 error = error ? error : dsl_prop_register(ds, 503 "vscan", vscan_changed_cb, zfsvfs); 504 if (error) 505 goto unregister; 506 507 /* 508 * Invoke our callbacks to restore temporary mount options. 509 */ 510 if (do_readonly) 511 readonly_changed_cb(zfsvfs, readonly); 512 if (do_setuid) 513 setuid_changed_cb(zfsvfs, setuid); 514 if (do_exec) 515 exec_changed_cb(zfsvfs, exec); 516 if (do_xattr) 517 xattr_changed_cb(zfsvfs, xattr); 518 if (do_atime) 519 atime_changed_cb(zfsvfs, atime); 520 521 nbmand_changed_cb(zfsvfs, nbmand); 522 523 return (0); 524 525unregister: 526 /* 527 * We may attempt to unregister some callbacks that are not 528 * registered, but this is OK; it will simply return ENOMSG, 529 * which we will ignore. 
530 */ 531 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); 532 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); 533 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); 534 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); 535 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); 536 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); 537 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); 538 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); 539 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, 540 zfsvfs); 541 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); 542 return (error); 543 544} 545 546static int 547zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, 548 uint64_t *userp, uint64_t *groupp) 549{ 550 znode_phys_t *znp = data; 551 int error = 0; 552 553 /* 554 * Is it a valid type of object to track? 555 */ 556 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) 557 return (ENOENT); 558 559 /* 560 * If we have a NULL data pointer 561 * then assume the id's aren't changing and 562 * return EEXIST to the dmu to let it know to 563 * use the same ids 564 */ 565 if (data == NULL) 566 return (EEXIST); 567 568 if (bonustype == DMU_OT_ZNODE) { 569 *userp = znp->zp_uid; 570 *groupp = znp->zp_gid; 571 } else { 572 int hdrsize; 573 574 ASSERT(bonustype == DMU_OT_SA); 575 hdrsize = sa_hdrsize(data); 576 577 if (hdrsize != 0) { 578 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + 579 SA_UID_OFFSET)); 580 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + 581 SA_GID_OFFSET)); 582 } else { 583 /* 584 * This should only happen for newly created 585 * files that haven't had the znode data filled 586 * in yet. 
587 */ 588 *userp = 0; 589 *groupp = 0; 590 } 591 } 592 return (error); 593} 594 595static void 596fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 597 char *domainbuf, int buflen, uid_t *ridp) 598{ 599 uint64_t fuid; 600 const char *domain; 601 602 fuid = strtonum(fuidstr, NULL); 603 604 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 605 if (domain) 606 (void) strlcpy(domainbuf, domain, buflen); 607 else 608 domainbuf[0] = '\0'; 609 *ridp = FUID_RID(fuid); 610} 611 612static uint64_t 613zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 614{ 615 switch (type) { 616 case ZFS_PROP_USERUSED: 617 return (DMU_USERUSED_OBJECT); 618 case ZFS_PROP_GROUPUSED: 619 return (DMU_GROUPUSED_OBJECT); 620 case ZFS_PROP_USERQUOTA: 621 return (zfsvfs->z_userquota_obj); 622 case ZFS_PROP_GROUPQUOTA: 623 return (zfsvfs->z_groupquota_obj); 624 } 625 return (0); 626} 627 628int 629zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 630 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 631{ 632 int error; 633 zap_cursor_t zc; 634 zap_attribute_t za; 635 zfs_useracct_t *buf = vbuf; 636 uint64_t obj; 637 638 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 639 return (ENOTSUP); 640 641 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 642 if (obj == 0) { 643 *bufsizep = 0; 644 return (0); 645 } 646 647 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 648 (error = zap_cursor_retrieve(&zc, &za)) == 0; 649 zap_cursor_advance(&zc)) { 650 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 651 *bufsizep) 652 break; 653 654 fuidstr_to_sid(zfsvfs, za.za_name, 655 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 656 657 buf->zu_space = za.za_first_integer; 658 buf++; 659 } 660 if (error == ENOENT) 661 error = 0; 662 663 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 664 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 665 *cookiep = zap_cursor_serialize(&zc); 666 zap_cursor_fini(&zc); 667 return (error); 
668} 669 670/* 671 * buf must be big enough (eg, 32 bytes) 672 */ 673static int 674id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 675 char *buf, boolean_t addok) 676{ 677 uint64_t fuid; 678 int domainid = 0; 679 680 if (domain && domain[0]) { 681 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 682 if (domainid == -1) 683 return (ENOENT); 684 } 685 fuid = FUID_ENCODE(domainid, rid); 686 (void) sprintf(buf, "%llx", (longlong_t)fuid); 687 return (0); 688} 689 690int 691zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 692 const char *domain, uint64_t rid, uint64_t *valp) 693{ 694 char buf[32]; 695 int err; 696 uint64_t obj; 697 698 *valp = 0; 699 700 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 701 return (ENOTSUP); 702 703 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 704 if (obj == 0) 705 return (0); 706 707 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); 708 if (err) 709 return (err); 710 711 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 712 if (err == ENOENT) 713 err = 0; 714 return (err); 715} 716 717int 718zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 719 const char *domain, uint64_t rid, uint64_t quota) 720{ 721 char buf[32]; 722 int err; 723 dmu_tx_t *tx; 724 uint64_t *objp; 725 boolean_t fuid_dirtied; 726 727 if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) 728 return (EINVAL); 729 730 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 731 return (ENOTSUP); 732 733 objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : 734 &zfsvfs->z_groupquota_obj; 735 736 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); 737 if (err) 738 return (err); 739 fuid_dirtied = zfsvfs->z_fuid_dirty; 740 741 tx = dmu_tx_create(zfsvfs->z_os); 742 dmu_tx_hold_zap(tx, *objp ? 
*objp : DMU_NEW_OBJECT, B_TRUE, NULL); 743 if (*objp == 0) { 744 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 745 zfs_userquota_prop_prefixes[type]); 746 } 747 if (fuid_dirtied) 748 zfs_fuid_txhold(zfsvfs, tx); 749 err = dmu_tx_assign(tx, TXG_WAIT); 750 if (err) { 751 dmu_tx_abort(tx); 752 return (err); 753 } 754 755 mutex_enter(&zfsvfs->z_lock); 756 if (*objp == 0) { 757 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 758 DMU_OT_NONE, 0, tx); 759 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 760 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 761 } 762 mutex_exit(&zfsvfs->z_lock); 763 764 if (quota == 0) { 765 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 766 if (err == ENOENT) 767 err = 0; 768 } else { 769 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 770 } 771 ASSERT(err == 0); 772 if (fuid_dirtied) 773 zfs_fuid_sync(zfsvfs, tx); 774 dmu_tx_commit(tx); 775 return (err); 776} 777 778boolean_t 779zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) 780{ 781 char buf[32]; 782 uint64_t used, quota, usedobj, quotaobj; 783 int err; 784 785 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 786 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 787 788 if (quotaobj == 0 || zfsvfs->z_replay) 789 return (B_FALSE); 790 791 (void) sprintf(buf, "%llx", (longlong_t)fuid); 792 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 793 if (err != 0) 794 return (B_FALSE); 795 796 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 797 if (err != 0) 798 return (B_FALSE); 799 return (used >= quota); 800} 801 802boolean_t 803zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) 804{ 805 uint64_t fuid; 806 uint64_t quotaobj; 807 808 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 809 810 fuid = isgroup ? 
zp->z_gid : zp->z_uid; 811 812 if (quotaobj == 0 || zfsvfs->z_replay) 813 return (B_FALSE); 814 815 return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); 816} 817 818int 819zfsvfs_create(const char *osname, zfsvfs_t **zfvp) 820{ 821 objset_t *os; 822 zfsvfs_t *zfsvfs; 823 uint64_t zval; 824 int i, error; 825 uint64_t sa_obj; 826 827 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 828 829 /* 830 * We claim to always be readonly so we can open snapshots; 831 * other ZPL code will prevent us from writing to snapshots. 832 */ 833 error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); 834 if (error) { 835 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 836 return (error); 837 } 838 839 /* 840 * Initialize the zfs-specific filesystem structure. 841 * Should probably make this a kmem cache, shuffle fields, 842 * and just bzero up to z_hold_mtx[]. 843 */ 844 zfsvfs->z_vfs = NULL; 845 zfsvfs->z_parent = zfsvfs; 846 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; 847 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 848 zfsvfs->z_os = os; 849 850 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 851 if (error) { 852 goto out; 853 } else if (zfsvfs->z_version > 854 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 855 (void) printf("Can't mount a version %lld file system " 856 "on a version %lld pool\n. Pool must be upgraded to mount " 857 "this file system.", (u_longlong_t)zfsvfs->z_version, 858 (u_longlong_t)spa_version(dmu_objset_spa(os))); 859 error = ENOTSUP; 860 goto out; 861 } 862 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 863 goto out; 864 zfsvfs->z_norm = (int)zval; 865 866 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 867 goto out; 868 zfsvfs->z_utf8 = (zval != 0); 869 870 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 871 goto out; 872 zfsvfs->z_case = (uint_t)zval; 873 874 /* 875 * Fold case on file systems that are always or sometimes case 876 * insensitive. 
877 */ 878 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 879 zfsvfs->z_case == ZFS_CASE_MIXED) 880 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 881 882 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 883 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 884 885 if (zfsvfs->z_use_sa) { 886 /* should either have both of these objects or none */ 887 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 888 &sa_obj); 889 if (error) 890 return (error); 891 } else { 892 /* 893 * Pre SA versions file systems should never touch 894 * either the attribute registration or layout objects. 895 */ 896 sa_obj = 0; 897 } 898 899 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 900 &zfsvfs->z_attr_table); 901 if (error) 902 goto out; 903 904 if (zfsvfs->z_version >= ZPL_VERSION_SA) 905 sa_register_update_callback(os, zfs_sa_upgrade); 906 907 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 908 &zfsvfs->z_root); 909 if (error) 910 goto out; 911 ASSERT(zfsvfs->z_root != 0); 912 913 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 914 &zfsvfs->z_unlinkedobj); 915 if (error) 916 goto out; 917 918 error = zap_lookup(os, MASTER_NODE_OBJ, 919 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 920 8, 1, &zfsvfs->z_userquota_obj); 921 if (error && error != ENOENT) 922 goto out; 923 924 error = zap_lookup(os, MASTER_NODE_OBJ, 925 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 926 8, 1, &zfsvfs->z_groupquota_obj); 927 if (error && error != ENOENT) 928 goto out; 929 930 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 931 &zfsvfs->z_fuid_obj); 932 if (error && error != ENOENT) 933 goto out; 934 935 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 936 &zfsvfs->z_shares_dir); 937 if (error && error != ENOENT) 938 goto out; 939 940 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 941 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 942 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 
943 offsetof(znode_t, z_link_node)); 944 rrw_init(&zfsvfs->z_teardown_lock); 945 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 946 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 947 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 948 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 949 950 *zfvp = zfsvfs; 951 return (0); 952 953out: 954 dmu_objset_disown(os, zfsvfs); 955 *zfvp = NULL; 956 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 957 return (error); 958} 959 960static int 961zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 962{ 963 int error; 964 965 error = zfs_register_callbacks(zfsvfs->z_vfs); 966 if (error) 967 return (error); 968 969 /* 970 * Set the objset user_ptr to track its zfsvfs. 971 */ 972 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 973 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 974 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 975 976 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 977 978 /* 979 * If we are not mounting (ie: online recv), then we don't 980 * have to worry about replaying the log as we blocked all 981 * operations out since we closed the ZIL. 982 */ 983 if (mounting) { 984 boolean_t readonly; 985 986 /* 987 * During replay we remove the read only flag to 988 * allow replays to succeed. 989 */ 990 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 991 if (readonly != 0) 992 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 993 else 994 zfs_unlinked_drain(zfsvfs); 995 996 /* 997 * Parse and replay the intent log. 998 * 999 * Because of ziltest, this must be done after 1000 * zfs_unlinked_drain(). (Further note: ziltest 1001 * doesn't use readonly mounts, where 1002 * zfs_unlinked_drain() isn't called.) This is because 1003 * ziltest causes spa_sync() to think it's committed, 1004 * but actually it is not, so the intent log contains 1005 * many txg's worth of changes. 
1006 * 1007 * In particular, if object N is in the unlinked set in 1008 * the last txg to actually sync, then it could be 1009 * actually freed in a later txg and then reallocated 1010 * in a yet later txg. This would write a "create 1011 * object N" record to the intent log. Normally, this 1012 * would be fine because the spa_sync() would have 1013 * written out the fact that object N is free, before 1014 * we could write the "create object N" intent log 1015 * record. 1016 * 1017 * But when we are in ziltest mode, we advance the "open 1018 * txg" without actually spa_sync()-ing the changes to 1019 * disk. So we would see that object N is still 1020 * allocated and in the unlinked set, and there is an 1021 * intent log record saying to allocate it. 1022 */ 1023 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1024 if (zil_replay_disable) { 1025 zil_destroy(zfsvfs->z_log, B_FALSE); 1026 } else { 1027 zfsvfs->z_replay = B_TRUE; 1028 zil_replay(zfsvfs->z_os, zfsvfs, 1029 zfs_replay_vector); 1030 zfsvfs->z_replay = B_FALSE; 1031 } 1032 } 1033 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ 1034 } 1035 1036 return (0); 1037} 1038 1039extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1040 1041void 1042zfsvfs_free(zfsvfs_t *zfsvfs) 1043{ 1044 int i; 1045 1046 /* 1047 * This is a barrier to prevent the filesystem from going away in 1048 * zfs_znode_move() until we can safely ensure that the filesystem is 1049 * not unmounted. We consider the filesystem valid before the barrier 1050 * and invalid after the barrier. 
1051 */ 1052 rw_enter(&zfsvfs_lock, RW_READER); 1053 rw_exit(&zfsvfs_lock); 1054 1055 zfs_fuid_destroy(zfsvfs); 1056 1057 mutex_destroy(&zfsvfs->z_znodes_lock); 1058 mutex_destroy(&zfsvfs->z_lock); 1059 list_destroy(&zfsvfs->z_all_znodes); 1060 rrw_destroy(&zfsvfs->z_teardown_lock); 1061 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1062 rw_destroy(&zfsvfs->z_fuid_lock); 1063 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1064 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1065 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1066} 1067 1068static void 1069zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1070{ 1071 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1072 if (zfsvfs->z_vfs) { 1073 if (zfsvfs->z_use_fuids) { 1074 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1075 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1076 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1077 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1078 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1079 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1080 } else { 1081 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1082 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1083 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1084 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1085 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1086 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1087 } 1088 } 1089 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1090} 1091 1092static int 1093zfs_domount(vfs_t *vfsp, char *osname) 1094{ 1095 uint64_t recordsize, fsid_guid; 1096 int error = 0; 1097 zfsvfs_t *zfsvfs; 1098 vnode_t *vp; 1099 1100 ASSERT(vfsp); 1101 ASSERT(osname); 1102 1103 error = zfsvfs_create(osname, &zfsvfs); 1104 if (error) 1105 return (error); 1106 zfsvfs->z_vfs = vfsp; 1107 1108 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1109 NULL)) 1110 goto out; 1111 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; 1112 
zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; 1113 1114 vfsp->vfs_data = zfsvfs; 1115 vfsp->mnt_flag |= MNT_LOCAL; 1116 vfsp->mnt_kern_flag |= MNTK_MPSAFE; 1117 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; 1118 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; 1119 1120 /* 1121 * The fsid is 64 bits, composed of an 8-bit fs type, which 1122 * separates our fsid from any other filesystem types, and a 1123 * 56-bit objset unique ID. The objset unique ID is unique to 1124 * all objsets open on this system, provided by unique_create(). 1125 * The 8-bit fs type must be put in the low bits of fsid[1] 1126 * because that's where other Solaris filesystems put it. 1127 */ 1128 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1129 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1130 vfsp->vfs_fsid.val[0] = fsid_guid; 1131 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1132 vfsp->mnt_vfc->vfc_typenum & 0xFF; 1133 1134 /* 1135 * Set features for file system. 1136 */ 1137 zfs_set_fuid_feature(zfsvfs); 1138 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1139 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1140 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1141 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1142 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1143 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1144 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1145 } 1146 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1147 1148 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1149 uint64_t pval; 1150 1151 atime_changed_cb(zfsvfs, B_FALSE); 1152 readonly_changed_cb(zfsvfs, B_TRUE); 1153 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1154 goto out; 1155 xattr_changed_cb(zfsvfs, pval); 1156 zfsvfs->z_issnap = B_TRUE; 1157 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1158 1159 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1160 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1161 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1162 } else { 1163 error = zfsvfs_setup(zfsvfs, B_TRUE); 1164 } 1165 1166 
vfs_mountedfrom(vfsp, osname); 1167 /* Grab extra reference. */ 1168 VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0); 1169 VOP_UNLOCK(vp, 0); 1170 1171 if (!zfsvfs->z_issnap) 1172 zfsctl_create(zfsvfs); 1173out: 1174 if (error) { 1175 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 1176 zfsvfs_free(zfsvfs); 1177 } else { 1178 atomic_add_32(&zfs_active_fs_count, 1); 1179 } 1180 1181 return (error); 1182} 1183 1184void 1185zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1186{ 1187 objset_t *os = zfsvfs->z_os; 1188 struct dsl_dataset *ds; 1189 1190 /* 1191 * Unregister properties. 1192 */ 1193 if (!dmu_objset_is_snapshot(os)) { 1194 ds = dmu_objset_ds(os); 1195 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, 1196 zfsvfs) == 0); 1197 1198 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, 1199 zfsvfs) == 0); 1200 1201 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, 1202 zfsvfs) == 0); 1203 1204 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, 1205 zfsvfs) == 0); 1206 1207 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, 1208 zfsvfs) == 0); 1209 1210 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, 1211 zfsvfs) == 0); 1212 1213 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, 1214 zfsvfs) == 0); 1215 1216 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, 1217 zfsvfs) == 0); 1218 1219 VERIFY(dsl_prop_unregister(ds, "aclinherit", 1220 acl_inherit_changed_cb, zfsvfs) == 0); 1221 1222 VERIFY(dsl_prop_unregister(ds, "vscan", 1223 vscan_changed_cb, zfsvfs) == 0); 1224 } 1225} 1226 1227#ifdef SECLABEL 1228/* 1229 * Convert a decimal digit string to a uint64_t integer. 
1230 */ 1231static int 1232str_to_uint64(char *str, uint64_t *objnum) 1233{ 1234 uint64_t num = 0; 1235 1236 while (*str) { 1237 if (*str < '0' || *str > '9') 1238 return (EINVAL); 1239 1240 num = num*10 + *str++ - '0'; 1241 } 1242 1243 *objnum = num; 1244 return (0); 1245} 1246 1247/* 1248 * The boot path passed from the boot loader is in the form of 1249 * "rootpool-name/root-filesystem-object-number'. Convert this 1250 * string to a dataset name: "rootpool-name/root-filesystem-name". 1251 */ 1252static int 1253zfs_parse_bootfs(char *bpath, char *outpath) 1254{ 1255 char *slashp; 1256 uint64_t objnum; 1257 int error; 1258 1259 if (*bpath == 0 || *bpath == '/') 1260 return (EINVAL); 1261 1262 (void) strcpy(outpath, bpath); 1263 1264 slashp = strchr(bpath, '/'); 1265 1266 /* if no '/', just return the pool name */ 1267 if (slashp == NULL) { 1268 return (0); 1269 } 1270 1271 /* if not a number, just return the root dataset name */ 1272 if (str_to_uint64(slashp+1, &objnum)) { 1273 return (0); 1274 } 1275 1276 *slashp = '\0'; 1277 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1278 *slashp = '/'; 1279 1280 return (error); 1281} 1282 1283/* 1284 * zfs_check_global_label: 1285 * Check that the hex label string is appropriate for the dataset 1286 * being mounted into the global_zone proper. 1287 * 1288 * Return an error if the hex label string is not default or 1289 * admin_low/admin_high. For admin_low labels, the corresponding 1290 * dataset must be readonly. 1291 */ 1292int 1293zfs_check_global_label(const char *dsname, const char *hexsl) 1294{ 1295 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1296 return (0); 1297 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1298 return (0); 1299 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1300 /* must be readonly */ 1301 uint64_t rdonly; 1302 1303 if (dsl_prop_get_integer(dsname, 1304 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1305 return (EACCES); 1306 return (rdonly ? 
0 : EACCES); 1307 } 1308 return (EACCES); 1309} 1310 1311/* 1312 * zfs_mount_label_policy: 1313 * Determine whether the mount is allowed according to MAC check. 1314 * by comparing (where appropriate) label of the dataset against 1315 * the label of the zone being mounted into. If the dataset has 1316 * no label, create one. 1317 * 1318 * Returns: 1319 * 0 : access allowed 1320 * >0 : error code, such as EACCES 1321 */ 1322static int 1323zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1324{ 1325 int error, retv; 1326 zone_t *mntzone = NULL; 1327 ts_label_t *mnt_tsl; 1328 bslabel_t *mnt_sl; 1329 bslabel_t ds_sl; 1330 char ds_hexsl[MAXNAMELEN]; 1331 1332 retv = EACCES; /* assume the worst */ 1333 1334 /* 1335 * Start by getting the dataset label if it exists. 1336 */ 1337 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1338 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1339 if (error) 1340 return (EACCES); 1341 1342 /* 1343 * If labeling is NOT enabled, then disallow the mount of datasets 1344 * which have a non-default label already. No other label checks 1345 * are needed. 1346 */ 1347 if (!is_system_labeled()) { 1348 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1349 return (0); 1350 return (EACCES); 1351 } 1352 1353 /* 1354 * Get the label of the mountpoint. If mounting into the global 1355 * zone (i.e. mountpoint is not within an active zone and the 1356 * zoned property is off), the label must be default or 1357 * admin_low/admin_high only; no other checks are needed. 
1358 */ 1359 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1360 if (mntzone->zone_id == GLOBAL_ZONEID) { 1361 uint64_t zoned; 1362 1363 zone_rele(mntzone); 1364 1365 if (dsl_prop_get_integer(osname, 1366 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1367 return (EACCES); 1368 if (!zoned) 1369 return (zfs_check_global_label(osname, ds_hexsl)); 1370 else 1371 /* 1372 * This is the case of a zone dataset being mounted 1373 * initially, before the zone has been fully created; 1374 * allow this mount into global zone. 1375 */ 1376 return (0); 1377 } 1378 1379 mnt_tsl = mntzone->zone_slabel; 1380 ASSERT(mnt_tsl != NULL); 1381 label_hold(mnt_tsl); 1382 mnt_sl = label2bslabel(mnt_tsl); 1383 1384 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1385 /* 1386 * The dataset doesn't have a real label, so fabricate one. 1387 */ 1388 char *str = NULL; 1389 1390 if (l_to_str_internal(mnt_sl, &str) == 0 && 1391 dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1392 ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) 1393 retv = 0; 1394 if (str != NULL) 1395 kmem_free(str, strlen(str) + 1); 1396 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1397 /* 1398 * Now compare labels to complete the MAC check. If the 1399 * labels are equal then allow access. If the mountpoint 1400 * label dominates the dataset label, allow readonly access. 1401 * Otherwise, access is denied. 
1402 */ 1403 if (blequal(mnt_sl, &ds_sl)) 1404 retv = 0; 1405 else if (bldominates(mnt_sl, &ds_sl)) { 1406 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1407 retv = 0; 1408 } 1409 } 1410 1411 label_rele(mnt_tsl); 1412 zone_rele(mntzone); 1413 return (retv); 1414} 1415#endif /* SECLABEL */ 1416 1417#ifdef OPENSOLARIS_MOUNTROOT 1418static int 1419zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1420{ 1421 int error = 0; 1422 static int zfsrootdone = 0; 1423 zfsvfs_t *zfsvfs = NULL; 1424 znode_t *zp = NULL; 1425 vnode_t *vp = NULL; 1426 char *zfs_bootfs; 1427 char *zfs_devid; 1428 1429 ASSERT(vfsp); 1430 1431 /* 1432 * The filesystem that we mount as root is defined in the 1433 * boot property "zfs-bootfs" with a format of 1434 * "poolname/root-dataset-objnum". 1435 */ 1436 if (why == ROOT_INIT) { 1437 if (zfsrootdone++) 1438 return (EBUSY); 1439 /* 1440 * the process of doing a spa_load will require the 1441 * clock to be set before we could (for example) do 1442 * something better by looking at the timestamp on 1443 * an uberblock, so just set it to -1. 
1444 */ 1445 clkset(-1); 1446 1447 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1448 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1449 "bootfs name"); 1450 return (EINVAL); 1451 } 1452 zfs_devid = spa_get_bootprop("diskdevid"); 1453 error = spa_import_rootpool(rootfs.bo_name, zfs_devid); 1454 if (zfs_devid) 1455 spa_free_bootprop(zfs_devid); 1456 if (error) { 1457 spa_free_bootprop(zfs_bootfs); 1458 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1459 error); 1460 return (error); 1461 } 1462 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1463 spa_free_bootprop(zfs_bootfs); 1464 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1465 error); 1466 return (error); 1467 } 1468 1469 spa_free_bootprop(zfs_bootfs); 1470 1471 if (error = vfs_lock(vfsp)) 1472 return (error); 1473 1474 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1475 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1476 goto out; 1477 } 1478 1479 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1480 ASSERT(zfsvfs); 1481 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { 1482 cmn_err(CE_NOTE, "zfs_zget: error %d", error); 1483 goto out; 1484 } 1485 1486 vp = ZTOV(zp); 1487 mutex_enter(&vp->v_lock); 1488 vp->v_flag |= VROOT; 1489 mutex_exit(&vp->v_lock); 1490 rootvp = vp; 1491 1492 /* 1493 * Leave rootvp held. The root file system is never unmounted. 1494 */ 1495 1496 vfs_add((struct vnode *)0, vfsp, 1497 (vfsp->vfs_flag & VFS_RDONLY) ? 
MS_RDONLY : 0); 1498out: 1499 vfs_unlock(vfsp); 1500 return (error); 1501 } else if (why == ROOT_REMOUNT) { 1502 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1503 vfsp->vfs_flag |= VFS_REMOUNT; 1504 1505 /* refresh mount options */ 1506 zfs_unregister_callbacks(vfsp->vfs_data); 1507 return (zfs_register_callbacks(vfsp)); 1508 1509 } else if (why == ROOT_UNMOUNT) { 1510 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1511 (void) zfs_sync(vfsp, 0, 0); 1512 return (0); 1513 } 1514 1515 /* 1516 * if "why" is equal to anything else other than ROOT_INIT, 1517 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1518 */ 1519 return (ENOTSUP); 1520} 1521#endif /* OPENSOLARIS_MOUNTROOT */ 1522 1523/*ARGSUSED*/ 1524static int 1525zfs_mount(vfs_t *vfsp) 1526{ 1527 kthread_t *td = curthread; 1528 vnode_t *mvp = vfsp->mnt_vnodecovered; 1529 cred_t *cr = td->td_ucred; 1530 char *osname; 1531 int error = 0; 1532 int canwrite; 1533 1534 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) 1535 return (EINVAL); 1536 1537 /* 1538 * If full-owner-access is enabled and delegated administration is 1539 * turned on, we must set nosuid. 1540 */ 1541 if (zfs_super_owner && 1542 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { 1543 secpolicy_fs_mount_clearopts(cr, vfsp); 1544 } 1545 1546 /* 1547 * Check for mount privilege? 1548 * 1549 * If we don't have privilege then see if 1550 * we have local permission to allow it 1551 */ 1552 error = secpolicy_fs_mount(cr, mvp, vfsp); 1553 if (error) { 1554 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) 1555 goto out; 1556 1557 if (!(vfsp->vfs_flag & MS_REMOUNT)) { 1558 vattr_t vattr; 1559 1560 /* 1561 * Make sure user is the owner of the mount point 1562 * or has sufficient privileges. 
1563 */ 1564 1565 vattr.va_mask = AT_UID; 1566 1567 vn_lock(mvp, LK_SHARED | LK_RETRY); 1568 if (VOP_GETATTR(mvp, &vattr, cr)) { 1569 VOP_UNLOCK(mvp, 0); 1570 goto out; 1571 } 1572 1573 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 1574 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { 1575 VOP_UNLOCK(mvp, 0); 1576 goto out; 1577 } 1578 VOP_UNLOCK(mvp, 0); 1579 } 1580 1581 secpolicy_fs_mount_clearopts(cr, vfsp); 1582 } 1583 1584 /* 1585 * Refuse to mount a filesystem if we are in a local zone and the 1586 * dataset is not visible. 1587 */ 1588 if (!INGLOBALZONE(curthread) && 1589 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1590 error = EPERM; 1591 goto out; 1592 } 1593 1594#ifdef SECLABEL 1595 error = zfs_mount_label_policy(vfsp, osname); 1596 if (error) 1597 goto out; 1598#endif 1599 1600 vfsp->vfs_flag |= MNT_NFS4ACLS; 1601 1602 /* 1603 * When doing a remount, we simply refresh our temporary properties 1604 * according to those options set in the current VFS options. 1605 */ 1606 if (vfsp->vfs_flag & MS_REMOUNT) { 1607 /* refresh mount options */ 1608 zfs_unregister_callbacks(vfsp->vfs_data); 1609 error = zfs_register_callbacks(vfsp); 1610 goto out; 1611 } 1612 1613 DROP_GIANT(); 1614 error = zfs_domount(vfsp, osname); 1615 PICKUP_GIANT(); 1616 1617#ifdef sun 1618 /* 1619 * Add an extra VFS_HOLD on our parent vfs so that it can't 1620 * disappear due to a forced unmount. 1621 */ 1622 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 1623 VFS_HOLD(mvp->v_vfsp); 1624#endif /* sun */ 1625 1626out: 1627 return (error); 1628} 1629 1630static int 1631zfs_statfs(vfs_t *vfsp, struct statfs *statp) 1632{ 1633 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1634 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1635 1636 statp->f_version = STATFS_VERSION; 1637 1638 ZFS_ENTER(zfsvfs); 1639 1640 dmu_objset_space(zfsvfs->z_os, 1641 &refdbytes, &availbytes, &usedobjs, &availobjs); 1642 1643 /* 1644 * The underlying storage pool actually uses multiple block sizes. 
1645 * We report the fragsize as the smallest block size we support, 1646 * and we report our blocksize as the filesystem's maximum blocksize. 1647 */ 1648 statp->f_bsize = SPA_MINBLOCKSIZE; 1649 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; 1650 1651 /* 1652 * The following report "total" blocks of various kinds in the 1653 * file system, but reported in terms of f_frsize - the 1654 * "fragment" size. 1655 */ 1656 1657 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1658 statp->f_bfree = availbytes / statp->f_bsize; 1659 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1660 1661 /* 1662 * statvfs() should really be called statufs(), because it assumes 1663 * static metadata. ZFS doesn't preallocate files, so the best 1664 * we can do is report the max that could possibly fit in f_files, 1665 * and that minus the number actually used in f_ffree. 1666 * For f_ffree, report the smaller of the number of object available 1667 * and the number of blocks (each object will take at least a block). 1668 */ 1669 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1670 statp->f_files = statp->f_ffree + usedobjs; 1671 1672 /* 1673 * We're a zfs filesystem. 1674 */ 1675 (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); 1676 1677 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 1678 sizeof(statp->f_mntfromname)); 1679 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 1680 sizeof(statp->f_mntonname)); 1681 1682 statp->f_namemax = ZFS_MAXNAMELEN; 1683 1684 ZFS_EXIT(zfsvfs); 1685 return (0); 1686} 1687 1688int 1689zfs_vnode_lock(vnode_t *vp, int flags) 1690{ 1691 int error; 1692 1693 ASSERT(vp != NULL); 1694 1695 /* 1696 * Check if the file system wasn't forcibly unmounted in the meantime. 
1697 */ 1698 error = vn_lock(vp, flags); 1699 if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { 1700 VOP_UNLOCK(vp, 0); 1701 error = ENOENT; 1702 } 1703 1704 return (error); 1705} 1706 1707static int 1708zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) 1709{ 1710 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1711 znode_t *rootzp; 1712 int error; 1713 1714 ZFS_ENTER_NOERROR(zfsvfs); 1715 1716 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1717 if (error == 0) 1718 *vpp = ZTOV(rootzp); 1719 1720 ZFS_EXIT(zfsvfs); 1721 1722 if (error == 0) { 1723 error = zfs_vnode_lock(*vpp, flags); 1724 if (error == 0) 1725 (*vpp)->v_vflag |= VV_ROOT; 1726 } 1727 if (error != 0) 1728 *vpp = NULL; 1729 1730 return (error); 1731} 1732 1733/* 1734 * Teardown the zfsvfs::z_os. 1735 * 1736 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' 1737 * and 'z_teardown_inactive_lock' held. 1738 */ 1739static int 1740zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1741{ 1742 znode_t *zp; 1743 1744 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 1745 1746 if (!unmounting) { 1747 /* 1748 * We purge the parent filesystem's vfsp as the parent 1749 * filesystem and all of its snapshots have their vnode's 1750 * v_vfsp set to the parent's filesystem's vfsp. Note, 1751 * 'z_parent' is self referential for non-snapshots. 1752 */ 1753 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1754#ifdef FREEBSD_NAMECACHE 1755 cache_purgevfs(zfsvfs->z_parent->z_vfs); 1756#endif 1757 } 1758 1759 /* 1760 * Close the zil. NB: Can't close the zil while zfs_inactive 1761 * threads are blocked as zil_close can call zfs_inactive. 
1762 */ 1763 if (zfsvfs->z_log) { 1764 zil_close(zfsvfs->z_log); 1765 zfsvfs->z_log = NULL; 1766 } 1767 1768 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 1769 1770 /* 1771 * If we are not unmounting (ie: online recv) and someone already 1772 * unmounted this file system while we were doing the switcheroo, 1773 * or a reopen of z_os failed then just bail out now. 1774 */ 1775 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1776 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1777 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1778 return (EIO); 1779 } 1780 1781 /* 1782 * At this point there are no vops active, and any new vops will 1783 * fail with EIO since we have z_teardown_lock for writer (only 1784 * relavent for forced unmount). 1785 * 1786 * Release all holds on dbufs. 1787 */ 1788 mutex_enter(&zfsvfs->z_znodes_lock); 1789 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1790 zp = list_next(&zfsvfs->z_all_znodes, zp)) 1791 if (zp->z_sa_hdl) { 1792 ASSERT(ZTOV(zp)->v_count >= 0); 1793 zfs_znode_dmu_fini(zp); 1794 } 1795 mutex_exit(&zfsvfs->z_znodes_lock); 1796 1797 /* 1798 * If we are unmounting, set the unmounted flag and let new vops 1799 * unblock. zfs_inactive will have the unmounted behavior, and all 1800 * other vops will fail with EIO. 1801 */ 1802 if (unmounting) { 1803 zfsvfs->z_unmounted = B_TRUE; 1804 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1805 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1806 1807#ifdef __FreeBSD__ 1808 /* 1809 * Some znodes might not be fully reclaimed, wait for them. 
1810 */ 1811 mutex_enter(&zfsvfs->z_znodes_lock); 1812 while (list_head(&zfsvfs->z_all_znodes) != NULL) { 1813 msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0, 1814 "zteardown", 0); 1815 } 1816 mutex_exit(&zfsvfs->z_znodes_lock); 1817#endif 1818 } 1819 1820 /* 1821 * z_os will be NULL if there was an error in attempting to reopen 1822 * zfsvfs, so just return as the properties had already been 1823 * unregistered and cached data had been evicted before. 1824 */ 1825 if (zfsvfs->z_os == NULL) 1826 return (0); 1827 1828 /* 1829 * Unregister properties. 1830 */ 1831 zfs_unregister_callbacks(zfsvfs); 1832 1833 /* 1834 * Evict cached data 1835 */ 1836 if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os)) 1837 if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) 1838 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1839 (void) dmu_objset_evict_dbufs(zfsvfs->z_os); 1840 1841 return (0); 1842} 1843 1844/*ARGSUSED*/ 1845static int 1846zfs_umount(vfs_t *vfsp, int fflag) 1847{ 1848 kthread_t *td = curthread; 1849 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1850 objset_t *os; 1851 cred_t *cr = td->td_ucred; 1852 int ret; 1853 1854 ret = secpolicy_fs_unmount(cr, vfsp); 1855 if (ret) { 1856 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 1857 ZFS_DELEG_PERM_MOUNT, cr)) 1858 return (ret); 1859 } 1860 1861 /* 1862 * We purge the parent filesystem's vfsp as the parent filesystem 1863 * and all of its snapshots have their vnode's v_vfsp set to the 1864 * parent's filesystem's vfsp. Note, 'z_parent' is self 1865 * referential for non-snapshots. 1866 */ 1867 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1868 1869 /* 1870 * Unmount any snapshots mounted under .zfs before unmounting the 1871 * dataset itself. 
1872 */ 1873 if (zfsvfs->z_ctldir != NULL) { 1874 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) 1875 return (ret); 1876 ret = vflush(vfsp, 0, 0, td); 1877 ASSERT(ret == EBUSY); 1878 if (!(fflag & MS_FORCE)) { 1879 if (zfsvfs->z_ctldir->v_count > 1) 1880 return (EBUSY); 1881 ASSERT(zfsvfs->z_ctldir->v_count == 1); 1882 } 1883 zfsctl_destroy(zfsvfs); 1884 ASSERT(zfsvfs->z_ctldir == NULL); 1885 } 1886 1887 if (fflag & MS_FORCE) { 1888 /* 1889 * Mark file system as unmounted before calling 1890 * vflush(FORCECLOSE). This way we ensure no future vnops 1891 * will be called and risk operating on DOOMED vnodes. 1892 */ 1893 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 1894 zfsvfs->z_unmounted = B_TRUE; 1895 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1896 } 1897 1898 /* 1899 * Flush all the files. 1900 */ 1901 ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); 1902 if (ret != 0) { 1903 if (!zfsvfs->z_issnap) { 1904 zfsctl_create(zfsvfs); 1905 ASSERT(zfsvfs->z_ctldir != NULL); 1906 } 1907 return (ret); 1908 } 1909 1910 if (!(fflag & MS_FORCE)) { 1911 /* 1912 * Check the number of active vnodes in the file system. 1913 * Our count is maintained in the vfs structure, but the 1914 * number is off by 1 to indicate a hold on the vfs 1915 * structure itself. 1916 * 1917 * The '.zfs' directory maintains a reference of its 1918 * own, and any active references underneath are 1919 * reflected in the vnode count. 1920 */ 1921 if (zfsvfs->z_ctldir == NULL) { 1922 if (vfsp->vfs_count > 1) 1923 return (EBUSY); 1924 } else { 1925 if (vfsp->vfs_count > 2 || 1926 zfsvfs->z_ctldir->v_count > 1) 1927 return (EBUSY); 1928 } 1929 } else { 1930 MNT_ILOCK(vfsp); 1931 vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; 1932 MNT_IUNLOCK(vfsp); 1933 } 1934 1935 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 1936 os = zfsvfs->z_os; 1937 1938 /* 1939 * z_os will be NULL if there was an error in 1940 * attempting to reopen zfsvfs. 
1941 */ 1942 if (os != NULL) { 1943 /* 1944 * Unset the objset user_ptr. 1945 */ 1946 mutex_enter(&os->os_user_ptr_lock); 1947 dmu_objset_set_user(os, NULL); 1948 mutex_exit(&os->os_user_ptr_lock); 1949 1950 /* 1951 * Finally release the objset 1952 */ 1953 dmu_objset_disown(os, zfsvfs); 1954 } 1955 1956 /* 1957 * We can now safely destroy the '.zfs' directory node. 1958 */ 1959 if (zfsvfs->z_ctldir != NULL) 1960 zfsctl_destroy(zfsvfs); 1961 if (zfsvfs->z_issnap) { 1962 vnode_t *svp = vfsp->mnt_vnodecovered; 1963 1964 if (svp->v_count >= 2) 1965 VN_RELE(svp); 1966 } 1967 zfs_freevfs(vfsp); 1968 1969 return (0); 1970} 1971 1972static int 1973zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) 1974{ 1975 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1976 znode_t *zp; 1977 int err; 1978 1979 /* 1980 * zfs_zget() can't operate on virtual entries like .zfs/ or 1981 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. 1982 * This will make NFS to switch to LOOKUP instead of using VGET. 1983 */ 1984 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR) 1985 return (EOPNOTSUPP); 1986 1987 ZFS_ENTER(zfsvfs); 1988 err = zfs_zget(zfsvfs, ino, &zp); 1989 if (err == 0 && zp->z_unlinked) { 1990 VN_RELE(ZTOV(zp)); 1991 err = EINVAL; 1992 } 1993 if (err == 0) 1994 *vpp = ZTOV(zp); 1995 ZFS_EXIT(zfsvfs); 1996 if (err == 0) 1997 err = zfs_vnode_lock(*vpp, flags); 1998 if (err != 0) 1999 *vpp = NULL; 2000 return (err); 2001} 2002 2003static int 2004zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 2005 struct ucred **credanonp, int *numsecflavors, int **secflavors) 2006{ 2007 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2008 2009 /* 2010 * If this is regular file system vfsp is the same as 2011 * zfsvfs->z_parent->z_vfs, but if it is snapshot, 2012 * zfsvfs->z_parent->z_vfs represents parent file system 2013 * which we have to use here, because only this file system 2014 * has mnt_export configured. 
2015 */ 2016 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, 2017 credanonp, numsecflavors, secflavors)); 2018} 2019 2020CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); 2021CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); 2022 2023static int 2024zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) 2025{ 2026 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2027 znode_t *zp; 2028 uint64_t object = 0; 2029 uint64_t fid_gen = 0; 2030 uint64_t gen_mask; 2031 uint64_t zp_gen; 2032 int i, err; 2033 2034 *vpp = NULL; 2035 2036 ZFS_ENTER(zfsvfs); 2037 2038 /* 2039 * On FreeBSD we can get snapshot's mount point or its parent file 2040 * system mount point depending if snapshot is already mounted or not. 2041 */ 2042 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 2043 zfid_long_t *zlfid = (zfid_long_t *)fidp; 2044 uint64_t objsetid = 0; 2045 uint64_t setgen = 0; 2046 2047 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 2048 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 2049 2050 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 2051 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 2052 2053 ZFS_EXIT(zfsvfs); 2054 2055 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 2056 if (err) 2057 return (EINVAL); 2058 ZFS_ENTER(zfsvfs); 2059 } 2060 2061 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 2062 zfid_short_t *zfid = (zfid_short_t *)fidp; 2063 2064 for (i = 0; i < sizeof (zfid->zf_object); i++) 2065 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 2066 2067 for (i = 0; i < sizeof (zfid->zf_gen); i++) 2068 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 2069 } else { 2070 ZFS_EXIT(zfsvfs); 2071 return (EINVAL); 2072 } 2073 2074 /* A zero fid_gen means we are in the .zfs control directories */ 2075 if (fid_gen == 0 && 2076 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 2077 *vpp = zfsvfs->z_ctldir; 2078 ASSERT(*vpp != NULL); 2079 if (object == ZFSCTL_INO_SNAPDIR) { 2080 
VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 2081 0, NULL, NULL, NULL, NULL, NULL) == 0); 2082 } else { 2083 VN_HOLD(*vpp); 2084 } 2085 ZFS_EXIT(zfsvfs); 2086 err = zfs_vnode_lock(*vpp, flags | LK_RETRY); 2087 if (err != 0) 2088 *vpp = NULL; 2089 return (err); 2090 } 2091 2092 gen_mask = -1ULL >> (64 - 8 * i); 2093 2094 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2095 if (err = zfs_zget(zfsvfs, object, &zp)) { 2096 ZFS_EXIT(zfsvfs); 2097 return (err); 2098 } 2099 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 2100 sizeof (uint64_t)); 2101 zp_gen = zp_gen & gen_mask; 2102 if (zp_gen == 0) 2103 zp_gen = 1; 2104 if (zp->z_unlinked || zp_gen != fid_gen) { 2105 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2106 VN_RELE(ZTOV(zp)); 2107 ZFS_EXIT(zfsvfs); 2108 return (EINVAL); 2109 } 2110 2111 *vpp = ZTOV(zp); 2112 ZFS_EXIT(zfsvfs); 2113 err = zfs_vnode_lock(*vpp, flags | LK_RETRY); 2114 if (err == 0) 2115 vnode_create_vobject(*vpp, zp->z_size, curthread); 2116 else 2117 *vpp = NULL; 2118 return (err); 2119} 2120 2121/* 2122 * Block out VOPs and close zfsvfs_t::z_os 2123 * 2124 * Note, if successful, then we return with the 'z_teardown_lock' and 2125 * 'z_teardown_inactive_lock' write held. 2126 */ 2127int 2128zfs_suspend_fs(zfsvfs_t *zfsvfs) 2129{ 2130 int error; 2131 2132 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2133 return (error); 2134 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 2135 2136 return (0); 2137} 2138 2139/* 2140 * Reopen zfsvfs_t::z_os and release VOPs. 
2141 */ 2142int 2143zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) 2144{ 2145 int err; 2146 2147 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2148 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2149 2150 err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, 2151 &zfsvfs->z_os); 2152 if (err) { 2153 zfsvfs->z_os = NULL; 2154 } else { 2155 znode_t *zp; 2156 uint64_t sa_obj = 0; 2157 2158 /* 2159 * Make sure version hasn't changed 2160 */ 2161 2162 err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION, 2163 &zfsvfs->z_version); 2164 2165 if (err) 2166 goto bail; 2167 2168 err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, 2169 ZFS_SA_ATTRS, 8, 1, &sa_obj); 2170 2171 if (err && zfsvfs->z_version >= ZPL_VERSION_SA) 2172 goto bail; 2173 2174 if ((err = sa_setup(zfsvfs->z_os, sa_obj, 2175 zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0) 2176 goto bail; 2177 2178 if (zfsvfs->z_version >= ZPL_VERSION_SA) 2179 sa_register_update_callback(zfsvfs->z_os, 2180 zfs_sa_upgrade); 2181 2182 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2183 2184 zfs_set_fuid_feature(zfsvfs); 2185 2186 /* 2187 * Attempt to re-establish all the active znodes with 2188 * their dbufs. If a zfs_rezget() fails, then we'll let 2189 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2190 * when they try to use their znode. 2191 */ 2192 mutex_enter(&zfsvfs->z_znodes_lock); 2193 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2194 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2195 (void) zfs_rezget(zp); 2196 } 2197 mutex_exit(&zfsvfs->z_znodes_lock); 2198 } 2199 2200bail: 2201 /* release the VOPs */ 2202 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2203 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 2204 2205 if (err) { 2206 /* 2207 * Since we couldn't reopen zfsvfs::z_os, or 2208 * setup the sa framework force unmount this file system. 
2209 */ 2210 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 2211 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); 2212 } 2213 return (err); 2214} 2215 2216static void 2217zfs_freevfs(vfs_t *vfsp) 2218{ 2219 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2220 2221#ifdef sun 2222 /* 2223 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2224 * from zfs_mount(). Release it here. If we came through 2225 * zfs_mountroot() instead, we didn't grab an extra hold, so 2226 * skip the VFS_RELE for rootvfs. 2227 */ 2228 if (zfsvfs->z_issnap && (vfsp != rootvfs)) 2229 VFS_RELE(zfsvfs->z_parent->z_vfs); 2230#endif /* sun */ 2231 2232 zfsvfs_free(zfsvfs); 2233 2234 atomic_add_32(&zfs_active_fs_count, -1); 2235} 2236 2237#ifdef __i386__ 2238static int desiredvnodes_backup; 2239#endif 2240 2241static void 2242zfs_vnodes_adjust(void) 2243{ 2244#ifdef __i386__ 2245 int newdesiredvnodes; 2246 2247 desiredvnodes_backup = desiredvnodes; 2248 2249 /* 2250 * We calculate newdesiredvnodes the same way it is done in 2251 * vntblinit(). If it is equal to desiredvnodes, it means that 2252 * it wasn't tuned by the administrator and we can tune it down. 2253 */ 2254 newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * 2255 vm_kmem_size / (5 * (sizeof(struct vm_object) + 2256 sizeof(struct vnode)))); 2257 if (newdesiredvnodes == desiredvnodes) 2258 desiredvnodes = (3 * newdesiredvnodes) / 4; 2259#endif 2260} 2261 2262static void 2263zfs_vnodes_adjust_back(void) 2264{ 2265 2266#ifdef __i386__ 2267 desiredvnodes = desiredvnodes_backup; 2268#endif 2269} 2270 2271void 2272zfs_init(void) 2273{ 2274 2275 printf("ZFS filesystem version " ZPL_VERSION_STRING "\n"); 2276 2277 /* 2278 * Initialize .zfs directory structures 2279 */ 2280 zfsctl_init(); 2281 2282 /* 2283 * Initialize znode cache, vnode ops, etc... 2284 */ 2285 zfs_znode_init(); 2286 2287 /* 2288 * Reduce number of vnodes. Originally number of vnodes is calculated 2289 * with UFS inode in mind. 
We reduce it here, because it's too big for 2290 * ZFS/i386. 2291 */ 2292 zfs_vnodes_adjust(); 2293 2294 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2295} 2296 2297void 2298zfs_fini(void) 2299{ 2300 zfsctl_fini(); 2301 zfs_znode_fini(); 2302 zfs_vnodes_adjust_back(); 2303} 2304 2305int 2306zfs_busy(void) 2307{ 2308 return (zfs_active_fs_count != 0); 2309} 2310 2311int 2312zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2313{ 2314 int error; 2315 objset_t *os = zfsvfs->z_os; 2316 dmu_tx_t *tx; 2317 2318 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2319 return (EINVAL); 2320 2321 if (newvers < zfsvfs->z_version) 2322 return (EINVAL); 2323 2324 if (zfs_spa_version_map(newvers) > 2325 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2326 return (ENOTSUP); 2327 2328 tx = dmu_tx_create(os); 2329 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2330 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2331 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2332 ZFS_SA_ATTRS); 2333 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2334 } 2335 error = dmu_tx_assign(tx, TXG_WAIT); 2336 if (error) { 2337 dmu_tx_abort(tx); 2338 return (error); 2339 } 2340 2341 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2342 8, 1, &newvers, tx); 2343 2344 if (error) { 2345 dmu_tx_commit(tx); 2346 return (error); 2347 } 2348 2349 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2350 uint64_t sa_obj; 2351 2352 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2353 SPA_VERSION_SA); 2354 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2355 DMU_OT_NONE, 0, tx); 2356 2357 error = zap_add(os, MASTER_NODE_OBJ, 2358 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2359 ASSERT3U(error, ==, 0); 2360 2361 VERIFY(0 == sa_set_sa_object(os, sa_obj)); 2362 sa_register_update_callback(os, zfs_sa_upgrade); 2363 } 2364 2365 spa_history_log_internal(LOG_DS_UPGRADE, 2366 dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", 2367 zfsvfs->z_version, newvers, 
dmu_objset_id(os)); 2368 2369 dmu_tx_commit(tx); 2370 2371 zfsvfs->z_version = newvers; 2372 2373 zfs_set_fuid_feature(zfsvfs); 2374 2375 return (0); 2376} 2377 2378/* 2379 * Read a property stored within the master node. 2380 */ 2381int 2382zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2383{ 2384 const char *pname; 2385 int error = ENOENT; 2386 2387 /* 2388 * Look up the file system's value for the property. For the 2389 * version property, we look up a slightly different string. 2390 */ 2391 if (prop == ZFS_PROP_VERSION) 2392 pname = ZPL_VERSION_STR; 2393 else 2394 pname = zfs_prop_to_name(prop); 2395 2396 if (os != NULL) 2397 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2398 2399 if (error == ENOENT) { 2400 /* No value set, use the default value */ 2401 switch (prop) { 2402 case ZFS_PROP_VERSION: 2403 *value = ZPL_VERSION; 2404 break; 2405 case ZFS_PROP_NORMALIZE: 2406 case ZFS_PROP_UTF8ONLY: 2407 *value = 0; 2408 break; 2409 case ZFS_PROP_CASE: 2410 *value = ZFS_CASE_SENSITIVE; 2411 break; 2412 default: 2413 return (error); 2414 } 2415 error = 0; 2416 } 2417 return (error); 2418} 2419