1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26 */ 27 28/* Portions Copyright 2010 Robert Milkowski */ 29 30#include <sys/cred.h> 31#include <sys/zfs_context.h> 32#include <sys/dmu_objset.h> 33#include <sys/dsl_dir.h> 34#include <sys/dsl_dataset.h> 35#include <sys/dsl_prop.h> 36#include <sys/dsl_pool.h> 37#include <sys/dsl_synctask.h> 38#include <sys/dsl_deleg.h> 39#include <sys/dnode.h> 40#include <sys/dbuf.h> 41#include <sys/zvol.h> 42#include <sys/dmu_tx.h> 43#include <sys/zap.h> 44#include <sys/zil.h> 45#include <sys/dmu_impl.h> 46#include <sys/zfs_ioctl.h> 47#include <sys/sa.h> 48#include <sys/zfs_onexit.h> 49#include <sys/dsl_destroy.h> 50 51/* 52 * Needed to close a window in dnode_move() that allows the objset to be freed 53 * before it can be safely accessed. 54 */ 55krwlock_t os_lock; 56 57void 58dmu_objset_init(void) 59{ 60 rw_init(&os_lock, NULL, RW_DEFAULT, NULL); 61} 62 63void 64dmu_objset_fini(void) 65{ 66 rw_destroy(&os_lock); 67} 68 69spa_t * 70dmu_objset_spa(objset_t *os) 71{ 72 return (os->os_spa); 73} 74 75zilog_t * 76dmu_objset_zil(objset_t *os) 77{ 78 return (os->os_zil); 79} 80 81dsl_pool_t * 82dmu_objset_pool(objset_t *os) 83{ 84 dsl_dataset_t *ds; 85 86 if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) 87 return (ds->ds_dir->dd_pool); 88 else 89 return (spa_get_dsl(os->os_spa)); 90} 91 92dsl_dataset_t * 93dmu_objset_ds(objset_t *os) 94{ 95 return (os->os_dsl_dataset); 96} 97 98dmu_objset_type_t 99dmu_objset_type(objset_t *os) 100{ 101 return (os->os_phys->os_type); 102} 103 104void 105dmu_objset_name(objset_t *os, char *buf) 106{ 107 dsl_dataset_name(os->os_dsl_dataset, buf); 108} 109 110uint64_t 111dmu_objset_id(objset_t *os) 112{ 113 dsl_dataset_t *ds = os->os_dsl_dataset; 114 115 return (ds ? ds->ds_object : 0); 116} 117 118uint64_t 119dmu_objset_syncprop(objset_t *os) 120{ 121 return (os->os_sync); 122} 123 124uint64_t 125dmu_objset_logbias(objset_t *os) 126{ 127 return (os->os_logbias); 128} 129 130static void 131checksum_changed_cb(void *arg, uint64_t newval) 132{ 133 objset_t *os = arg; 134 135 /* 136 * Inheritance should have been done by now. 137 */ 138 ASSERT(newval != ZIO_CHECKSUM_INHERIT); 139 140 os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); 141} 142 143static void 144compression_changed_cb(void *arg, uint64_t newval) 145{ 146 objset_t *os = arg; 147 148 /* 149 * Inheritance and range checking should have been done by now. 150 */ 151 ASSERT(newval != ZIO_COMPRESS_INHERIT); 152 153 os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); 154} 155 156static void 157copies_changed_cb(void *arg, uint64_t newval) 158{ 159 objset_t *os = arg; 160 161 /* 162 * Inheritance and range checking should have been done by now. 163 */ 164 ASSERT(newval > 0); 165 ASSERT(newval <= spa_max_replication(os->os_spa)); 166 167 os->os_copies = newval; 168} 169 170static void 171dedup_changed_cb(void *arg, uint64_t newval) 172{ 173 objset_t *os = arg; 174 spa_t *spa = os->os_spa; 175 enum zio_checksum checksum; 176 177 /* 178 * Inheritance should have been done by now. 179 */ 180 ASSERT(newval != ZIO_CHECKSUM_INHERIT); 181 182 checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); 183 184 os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; 185 os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); 186} 187 188static void 189primary_cache_changed_cb(void *arg, uint64_t newval) 190{ 191 objset_t *os = arg; 192 193 /* 194 * Inheritance and range checking should have been done by now. 195 */ 196 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || 197 newval == ZFS_CACHE_METADATA); 198 199 os->os_primary_cache = newval; 200} 201 202static void 203secondary_cache_changed_cb(void *arg, uint64_t newval) 204{ 205 objset_t *os = arg; 206 207 /* 208 * Inheritance and range checking should have been done by now. 209 */ 210 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || 211 newval == ZFS_CACHE_METADATA); 212 213 os->os_secondary_cache = newval; 214} 215 216static void 217sync_changed_cb(void *arg, uint64_t newval) 218{ 219 objset_t *os = arg; 220 221 /* 222 * Inheritance and range checking should have been done by now. 223 */ 224 ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || 225 newval == ZFS_SYNC_DISABLED); 226 227 os->os_sync = newval; 228 if (os->os_zil) 229 zil_set_sync(os->os_zil, newval); 230} 231 232static void 233logbias_changed_cb(void *arg, uint64_t newval) 234{ 235 objset_t *os = arg; 236 237 ASSERT(newval == ZFS_LOGBIAS_LATENCY || 238 newval == ZFS_LOGBIAS_THROUGHPUT); 239 os->os_logbias = newval; 240 if (os->os_zil) 241 zil_set_logbias(os->os_zil, newval); 242} 243 244void 245dmu_objset_byteswap(void *buf, size_t size) 246{ 247 objset_phys_t *osp = buf; 248 249 ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); 250 dnode_byteswap(&osp->os_meta_dnode); 251 byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); 252 osp->os_type = BSWAP_64(osp->os_type); 253 osp->os_flags = BSWAP_64(osp->os_flags); 254 if (size == sizeof (objset_phys_t)) { 255 dnode_byteswap(&osp->os_userused_dnode); 256 dnode_byteswap(&osp->os_groupused_dnode); 257 } 258} 259 260int 261dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, 262 objset_t **osp) 263{ 264 objset_t *os; 265 int i, err; 266 267 ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); 268 269 os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); 270 os->os_dsl_dataset = ds; 271 os->os_spa = spa; 272 os->os_rootbp = bp; 273 if (!BP_IS_HOLE(os->os_rootbp)) { 274 uint32_t aflags = ARC_WAIT; 275 zbookmark_t zb; 276 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 277 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 278 279 if (DMU_OS_IS_L2CACHEABLE(os)) 280 aflags |= ARC_L2CACHE; 281 if (DMU_OS_IS_L2COMPRESSIBLE(os)) 282 aflags |= ARC_L2COMPRESS; 283 284 dprintf_bp(os->os_rootbp, "reading %s", ""); 285 err = arc_read(NULL, spa, os->os_rootbp, 286 arc_getbuf_func, &os->os_phys_buf, 287 ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); 288 if (err != 0) { 289 kmem_free(os, sizeof (objset_t)); 290 /* convert checksum errors into IO errors */ 291 if (err == ECKSUM) 292 err = SET_ERROR(EIO); 293 return (err); 294 } 295 296 /* Increase the blocksize if we are permitted. */ 297 if (spa_version(spa) >= SPA_VERSION_USERSPACE && 298 arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { 299 arc_buf_t *buf = arc_buf_alloc(spa, 300 sizeof (objset_phys_t), &os->os_phys_buf, 301 ARC_BUFC_METADATA); 302 bzero(buf->b_data, sizeof (objset_phys_t)); 303 bcopy(os->os_phys_buf->b_data, buf->b_data, 304 arc_buf_size(os->os_phys_buf)); 305 (void) arc_buf_remove_ref(os->os_phys_buf, 306 &os->os_phys_buf); 307 os->os_phys_buf = buf; 308 } 309 310 os->os_phys = os->os_phys_buf->b_data; 311 os->os_flags = os->os_phys->os_flags; 312 } else { 313 int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 314 sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; 315 os->os_phys_buf = arc_buf_alloc(spa, size, 316 &os->os_phys_buf, ARC_BUFC_METADATA); 317 os->os_phys = os->os_phys_buf->b_data; 318 bzero(os->os_phys, size); 319 } 320 321 /* 322 * Note: the changed_cb will be called once before the register 323 * func returns, thus changing the checksum/compression from the 324 * default (fletcher2/off). Snapshots don't need to know about 325 * checksum/compression/copies. 326 */ 327 if (ds) { 328 err = dsl_prop_register(ds, 329 zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), 330 primary_cache_changed_cb, os); 331 if (err == 0) { 332 err = dsl_prop_register(ds, 333 zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), 334 secondary_cache_changed_cb, os); 335 } 336 if (!dsl_dataset_is_snapshot(ds)) { 337 if (err == 0) { 338 err = dsl_prop_register(ds, 339 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 340 checksum_changed_cb, os); 341 } 342 if (err == 0) { 343 err = dsl_prop_register(ds, 344 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 345 compression_changed_cb, os); 346 } 347 if (err == 0) { 348 err = dsl_prop_register(ds, 349 zfs_prop_to_name(ZFS_PROP_COPIES), 350 copies_changed_cb, os); 351 } 352 if (err == 0) { 353 err = dsl_prop_register(ds, 354 zfs_prop_to_name(ZFS_PROP_DEDUP), 355 dedup_changed_cb, os); 356 } 357 if (err == 0) { 358 err = dsl_prop_register(ds, 359 zfs_prop_to_name(ZFS_PROP_LOGBIAS), 360 logbias_changed_cb, os); 361 } 362 if (err == 0) { 363 err = dsl_prop_register(ds, 364 zfs_prop_to_name(ZFS_PROP_SYNC), 365 sync_changed_cb, os); 366 } 367 } 368 if (err != 0) { 369 VERIFY(arc_buf_remove_ref(os->os_phys_buf, 370 &os->os_phys_buf)); 371 kmem_free(os, sizeof (objset_t)); 372 return (err); 373 } 374 } else if (ds == NULL) { 375 /* It's the meta-objset. */ 376 os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; 377 os->os_compress = ZIO_COMPRESS_LZJB; 378 os->os_copies = spa_max_replication(spa); 379 os->os_dedup_checksum = ZIO_CHECKSUM_OFF; 380 os->os_dedup_verify = 0; 381 os->os_logbias = 0; 382 os->os_sync = 0; 383 os->os_primary_cache = ZFS_CACHE_ALL; 384 os->os_secondary_cache = ZFS_CACHE_ALL; 385 } 386 387 if (ds == NULL || !dsl_dataset_is_snapshot(ds)) 388 os->os_zil_header = os->os_phys->os_zil_header; 389 os->os_zil = zil_alloc(os, &os->os_zil_header); 390 391 for (i = 0; i < TXG_SIZE; i++) { 392 list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), 393 offsetof(dnode_t, dn_dirty_link[i])); 394 list_create(&os->os_free_dnodes[i], sizeof (dnode_t), 395 offsetof(dnode_t, dn_dirty_link[i])); 396 } 397 list_create(&os->os_dnodes, sizeof (dnode_t), 398 offsetof(dnode_t, dn_link)); 399 list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), 400 offsetof(dmu_buf_impl_t, db_link)); 401 402 mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); 403 mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); 404 mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); 405 406 DMU_META_DNODE(os) = dnode_special_open(os, 407 &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, 408 &os->os_meta_dnode); 409 if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { 410 DMU_USERUSED_DNODE(os) = dnode_special_open(os, 411 &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, 412 &os->os_userused_dnode); 413 DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, 414 &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, 415 &os->os_groupused_dnode); 416 } 417 418 /* 419 * We should be the only thread trying to do this because we 420 * have ds_opening_lock 421 */ 422 if (ds) { 423 mutex_enter(&ds->ds_lock); 424 ASSERT(ds->ds_objset == NULL); 425 ds->ds_objset = os; 426 mutex_exit(&ds->ds_lock); 427 } 428 429 *osp = os; 430 return (0); 431} 432 433int 434dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) 435{ 436 int err = 0; 437 438 mutex_enter(&ds->ds_opening_lock); 439 *osp = ds->ds_objset; 440 if (*osp == NULL) { 441 err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), 442 ds, dsl_dataset_get_blkptr(ds), osp); 443 } 444 mutex_exit(&ds->ds_opening_lock); 445 return (err); 446} 447 448/* 449 * Holds the pool while the objset is held. Therefore only one objset 450 * can be held at a time. 451 */ 452int 453dmu_objset_hold(const char *name, void *tag, objset_t **osp) 454{ 455 dsl_pool_t *dp; 456 dsl_dataset_t *ds; 457 int err; 458 459 err = dsl_pool_hold(name, tag, &dp); 460 if (err != 0) 461 return (err); 462 err = dsl_dataset_hold(dp, name, tag, &ds); 463 if (err != 0) { 464 dsl_pool_rele(dp, tag); 465 return (err); 466 } 467 468 err = dmu_objset_from_ds(ds, osp); 469 if (err != 0) { 470 dsl_dataset_rele(ds, tag); 471 dsl_pool_rele(dp, tag); 472 } 473 474 return (err); 475} 476 477/* 478 * dsl_pool must not be held when this is called. 479 * Upon successful return, there will be a longhold on the dataset, 480 * and the dsl_pool will not be held. 481 */ 482int 483dmu_objset_own(const char *name, dmu_objset_type_t type, 484 boolean_t readonly, void *tag, objset_t **osp) 485{ 486 dsl_pool_t *dp; 487 dsl_dataset_t *ds; 488 int err; 489 490 err = dsl_pool_hold(name, FTAG, &dp); 491 if (err != 0) 492 return (err); 493 err = dsl_dataset_own(dp, name, tag, &ds); 494 if (err != 0) { 495 dsl_pool_rele(dp, FTAG); 496 return (err); 497 } 498 499 err = dmu_objset_from_ds(ds, osp); 500 dsl_pool_rele(dp, FTAG); 501 if (err != 0) { 502 dsl_dataset_disown(ds, tag); 503 } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { 504 dsl_dataset_disown(ds, tag); 505 return (SET_ERROR(EINVAL)); 506 } else if (!readonly && dsl_dataset_is_snapshot(ds)) { 507 dsl_dataset_disown(ds, tag); 508 return (SET_ERROR(EROFS)); 509 } 510 return (err); 511} 512 513void 514dmu_objset_rele(objset_t *os, void *tag) 515{ 516 dsl_pool_t *dp = dmu_objset_pool(os); 517 dsl_dataset_rele(os->os_dsl_dataset, tag); 518 dsl_pool_rele(dp, tag); 519} 520 521/* 522 * When we are called, os MUST refer to an objset associated with a dataset 523 * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner 524 * == tag. We will then release and reacquire ownership of the dataset while 525 * holding the pool config_rwlock to avoid intervening namespace or ownership 526 * changes may occur. 527 * 528 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to 529 * release the hold on its dataset and acquire a new one on the dataset of the 530 * same name so that it can be partially torn down and reconstructed. 531 */ 532void 533dmu_objset_refresh_ownership(objset_t *os, void *tag) 534{ 535 dsl_pool_t *dp; 536 dsl_dataset_t *ds, *newds; 537 char name[MAXNAMELEN]; 538 539 ds = os->os_dsl_dataset; 540 VERIFY3P(ds, !=, NULL); 541 VERIFY3P(ds->ds_owner, ==, tag); 542 VERIFY(dsl_dataset_long_held(ds)); 543 544 dsl_dataset_name(ds, name); 545 dp = dmu_objset_pool(os); 546 dsl_pool_config_enter(dp, FTAG); 547 dmu_objset_disown(os, tag); 548 VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); 549 VERIFY3P(newds, ==, os->os_dsl_dataset); 550 dsl_pool_config_exit(dp, FTAG); 551} 552 553void 554dmu_objset_disown(objset_t *os, void *tag) 555{ 556 dsl_dataset_disown(os->os_dsl_dataset, tag); 557} 558 559void 560dmu_objset_evict_dbufs(objset_t *os) 561{ 562 dnode_t *dn; 563 564 mutex_enter(&os->os_lock); 565 566 /* process the mdn last, since the other dnodes have holds on it */ 567 list_remove(&os->os_dnodes, DMU_META_DNODE(os)); 568 list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); 569 570 /* 571 * Find the first dnode with holds. We have to do this dance 572 * because dnode_add_ref() only works if you already have a 573 * hold. If there are no holds then it has no dbufs so OK to 574 * skip. 575 */ 576 for (dn = list_head(&os->os_dnodes); 577 dn && !dnode_add_ref(dn, FTAG); 578 dn = list_next(&os->os_dnodes, dn)) 579 continue; 580 581 while (dn) { 582 dnode_t *next_dn = dn; 583 584 do { 585 next_dn = list_next(&os->os_dnodes, next_dn); 586 } while (next_dn && !dnode_add_ref(next_dn, FTAG)); 587 588 mutex_exit(&os->os_lock); 589 dnode_evict_dbufs(dn); 590 dnode_rele(dn, FTAG); 591 mutex_enter(&os->os_lock); 592 dn = next_dn; 593 } 594 mutex_exit(&os->os_lock); 595} 596 597void 598dmu_objset_evict(objset_t *os) 599{ 600 dsl_dataset_t *ds = os->os_dsl_dataset; 601 602 for (int t = 0; t < TXG_SIZE; t++) 603 ASSERT(!dmu_objset_is_dirty(os, t)); 604 605 if (ds) { 606 if (!dsl_dataset_is_snapshot(ds)) { 607 VERIFY0(dsl_prop_unregister(ds, 608 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 609 checksum_changed_cb, os)); 610 VERIFY0(dsl_prop_unregister(ds, 611 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 612 compression_changed_cb, os)); 613 VERIFY0(dsl_prop_unregister(ds, 614 zfs_prop_to_name(ZFS_PROP_COPIES), 615 copies_changed_cb, os)); 616 VERIFY0(dsl_prop_unregister(ds, 617 zfs_prop_to_name(ZFS_PROP_DEDUP), 618 dedup_changed_cb, os)); 619 VERIFY0(dsl_prop_unregister(ds, 620 zfs_prop_to_name(ZFS_PROP_LOGBIAS), 621 logbias_changed_cb, os)); 622 VERIFY0(dsl_prop_unregister(ds, 623 zfs_prop_to_name(ZFS_PROP_SYNC), 624 sync_changed_cb, os)); 625 } 626 VERIFY0(dsl_prop_unregister(ds, 627 zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), 628 primary_cache_changed_cb, os)); 629 VERIFY0(dsl_prop_unregister(ds, 630 zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), 631 secondary_cache_changed_cb, os)); 632 } 633 634 if (os->os_sa) 635 sa_tear_down(os); 636 637 dmu_objset_evict_dbufs(os); 638 639 dnode_special_close(&os->os_meta_dnode); 640 if (DMU_USERUSED_DNODE(os)) { 641 dnode_special_close(&os->os_userused_dnode); 642 dnode_special_close(&os->os_groupused_dnode); 643 } 644 zil_free(os->os_zil); 645 646 ASSERT3P(list_head(&os->os_dnodes), ==, NULL); 647 648 VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); 649 650 /* 651 * This is a barrier to prevent the objset from going away in 652 * dnode_move() until we can safely ensure that the objset is still in 653 * use. We consider the objset valid before the barrier and invalid 654 * after the barrier. 655 */ 656 rw_enter(&os_lock, RW_READER); 657 rw_exit(&os_lock); 658 659 mutex_destroy(&os->os_lock); 660 mutex_destroy(&os->os_obj_lock); 661 mutex_destroy(&os->os_user_ptr_lock); 662 kmem_free(os, sizeof (objset_t)); 663} 664 665timestruc_t 666dmu_objset_snap_cmtime(objset_t *os) 667{ 668 return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); 669} 670 671/* called from dsl for meta-objset */ 672objset_t * 673dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, 674 dmu_objset_type_t type, dmu_tx_t *tx) 675{ 676 objset_t *os; 677 dnode_t *mdn; 678 679 ASSERT(dmu_tx_is_syncing(tx)); 680 681 if (ds != NULL) 682 VERIFY0(dmu_objset_from_ds(ds, &os)); 683 else 684 VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); 685 686 mdn = DMU_META_DNODE(os); 687 688 dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, 689 DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); 690 691 /* 692 * We don't want to have to increase the meta-dnode's nlevels 693 * later, because then we could do it in quescing context while 694 * we are also accessing it in open context. 695 * 696 * This precaution is not necessary for the MOS (ds == NULL), 697 * because the MOS is only updated in syncing context. 698 * This is most fortunate: the MOS is the only objset that 699 * needs to be synced multiple times as spa_sync() iterates 700 * to convergence, so minimizing its dn_nlevels matters. 701 */ 702 if (ds != NULL) { 703 int levels = 1; 704 705 /* 706 * Determine the number of levels necessary for the meta-dnode 707 * to contain DN_MAX_OBJECT dnodes. 708 */ 709 while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + 710 (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < 711 DN_MAX_OBJECT * sizeof (dnode_phys_t)) 712 levels++; 713 714 mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = 715 mdn->dn_nlevels = levels; 716 } 717 718 ASSERT(type != DMU_OST_NONE); 719 ASSERT(type != DMU_OST_ANY); 720 ASSERT(type < DMU_OST_NUMTYPES); 721 os->os_phys->os_type = type; 722 if (dmu_objset_userused_enabled(os)) { 723 os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; 724 os->os_flags = os->os_phys->os_flags; 725 } 726 727 dsl_dataset_dirty(ds, tx); 728 729 return (os); 730} 731 732typedef struct dmu_objset_create_arg { 733 const char *doca_name; 734 cred_t *doca_cred; 735 void (*doca_userfunc)(objset_t *os, void *arg, 736 cred_t *cr, dmu_tx_t *tx); 737 void *doca_userarg; 738 dmu_objset_type_t doca_type; 739 uint64_t doca_flags; 740} dmu_objset_create_arg_t; 741 742/*ARGSUSED*/ 743static int 744dmu_objset_create_check(void *arg, dmu_tx_t *tx) 745{ 746 dmu_objset_create_arg_t *doca = arg; 747 dsl_pool_t *dp = dmu_tx_pool(tx); 748 dsl_dir_t *pdd; 749 const char *tail; 750 int error; 751 752 if (strchr(doca->doca_name, '@') != NULL) 753 return (SET_ERROR(EINVAL)); 754 755 error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); 756 if (error != 0) 757 return (error); 758 if (tail == NULL) { 759 dsl_dir_rele(pdd, FTAG); 760 return (SET_ERROR(EEXIST)); 761 } 762 error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, 763 doca->doca_cred); 764 dsl_dir_rele(pdd, FTAG); 765 766 return (error); 767} 768 769static void 770dmu_objset_create_sync(void *arg, dmu_tx_t *tx) 771{ 772 dmu_objset_create_arg_t *doca = arg; 773 dsl_pool_t *dp = dmu_tx_pool(tx); 774 dsl_dir_t *pdd; 775 const char *tail; 776 dsl_dataset_t *ds; 777 uint64_t obj; 778 blkptr_t *bp; 779 objset_t *os; 780 781 VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); 782 783 obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, 784 doca->doca_cred, tx); 785 786 VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); 787 bp = dsl_dataset_get_blkptr(ds); 788 os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, 789 ds, bp, doca->doca_type, tx); 790 791 if (doca->doca_userfunc != NULL) { 792 doca->doca_userfunc(os, doca->doca_userarg, 793 doca->doca_cred, tx); 794 } 795 796 spa_history_log_internal_ds(ds, "create", tx, ""); 797 dsl_dataset_rele(ds, FTAG); 798 dsl_dir_rele(pdd, FTAG); 799} 800 801int 802dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, 803 void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) 804{ 805 dmu_objset_create_arg_t doca; 806 807 doca.doca_name = name; 808 doca.doca_cred = CRED(); 809 doca.doca_flags = flags; 810 doca.doca_userfunc = func; 811 doca.doca_userarg = arg; 812 doca.doca_type = type; 813 814 return (dsl_sync_task(name, 815 dmu_objset_create_check, dmu_objset_create_sync, &doca, 5)); 816} 817 818typedef struct dmu_objset_clone_arg { 819 const char *doca_clone; 820 const char *doca_origin; 821 cred_t *doca_cred; 822} dmu_objset_clone_arg_t; 823 824/*ARGSUSED*/ 825static int 826dmu_objset_clone_check(void *arg, dmu_tx_t *tx) 827{ 828 dmu_objset_clone_arg_t *doca = arg; 829 dsl_dir_t *pdd; 830 const char *tail; 831 int error; 832 dsl_dataset_t *origin; 833 dsl_pool_t *dp = dmu_tx_pool(tx); 834 835 if (strchr(doca->doca_clone, '@') != NULL) 836 return (SET_ERROR(EINVAL)); 837 838 error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); 839 if (error != 0) 840 return (error); 841 if (tail == NULL) { 842 dsl_dir_rele(pdd, FTAG); 843 return (SET_ERROR(EEXIST)); 844 } 845 /* You can't clone across pools. */ 846 if (pdd->dd_pool != dp) { 847 dsl_dir_rele(pdd, FTAG); 848 return (SET_ERROR(EXDEV)); 849 } 850 error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, 851 doca->doca_cred); 852 if (error != 0) { 853 dsl_dir_rele(pdd, FTAG); 854 return (SET_ERROR(EDQUOT)); 855 } 856 dsl_dir_rele(pdd, FTAG); 857 858 error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); 859 if (error != 0) 860 return (error); 861 862 /* You can't clone across pools. */ 863 if (origin->ds_dir->dd_pool != dp) { 864 dsl_dataset_rele(origin, FTAG); 865 return (SET_ERROR(EXDEV)); 866 } 867 868 /* You can only clone snapshots, not the head datasets. */ 869 if (!dsl_dataset_is_snapshot(origin)) { 870 dsl_dataset_rele(origin, FTAG); 871 return (SET_ERROR(EINVAL)); 872 } 873 dsl_dataset_rele(origin, FTAG); 874 875 return (0); 876} 877 878static void 879dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) 880{ 881 dmu_objset_clone_arg_t *doca = arg; 882 dsl_pool_t *dp = dmu_tx_pool(tx); 883 dsl_dir_t *pdd; 884 const char *tail; 885 dsl_dataset_t *origin, *ds; 886 uint64_t obj; 887 char namebuf[MAXNAMELEN]; 888 889 VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); 890 VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); 891 892 obj = dsl_dataset_create_sync(pdd, tail, origin, 0, 893 doca->doca_cred, tx); 894 895 VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); 896 dsl_dataset_name(origin, namebuf); 897 spa_history_log_internal_ds(ds, "clone", tx, 898 "origin=%s (%llu)", namebuf, origin->ds_object); 899 dsl_dataset_rele(ds, FTAG); 900 dsl_dataset_rele(origin, FTAG); 901 dsl_dir_rele(pdd, FTAG); 902} 903 904int 905dmu_objset_clone(const char *clone, const char *origin) 906{ 907 dmu_objset_clone_arg_t doca; 908 909 doca.doca_clone = clone; 910 doca.doca_origin = origin; 911 doca.doca_cred = CRED(); 912 913 return (dsl_sync_task(clone, 914 dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5)); 915} 916 917int 918dmu_objset_snapshot_one(const char *fsname, const char *snapname) 919{ 920 int err; 921 char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); 922 nvlist_t *snaps = fnvlist_alloc(); 923 924 fnvlist_add_boolean(snaps, longsnap); 925 strfree(longsnap); 926 err = dsl_dataset_snapshot(snaps, NULL, NULL); 927 fnvlist_free(snaps); 928 return (err); 929} 930 931static void 932dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) 933{ 934 dnode_t *dn; 935 936 while (dn = list_head(list)) { 937 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 938 ASSERT(dn->dn_dbuf->db_data_pending); 939 /* 940 * Initialize dn_zio outside dnode_sync() because the 941 * meta-dnode needs to set it ouside dnode_sync(). 942 */ 943 dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; 944 ASSERT(dn->dn_zio); 945 946 ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); 947 list_remove(list, dn); 948 949 if (newlist) { 950 (void) dnode_add_ref(dn, newlist); 951 list_insert_tail(newlist, dn); 952 } 953 954 dnode_sync(dn, tx); 955 } 956} 957 958/* ARGSUSED */ 959static void 960dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) 961{ 962 blkptr_t *bp = zio->io_bp; 963 objset_t *os = arg; 964 dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; 965 966 ASSERT3P(bp, ==, os->os_rootbp); 967 ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); 968 ASSERT0(BP_GET_LEVEL(bp)); 969 970 /* 971 * Update rootbp fill count: it should be the number of objects 972 * allocated in the object set (not counting the "special" 973 * objects that are stored in the objset_phys_t -- the meta 974 * dnode and user/group accounting objects). 975 */ 976 bp->blk_fill = 0; 977 for (int i = 0; i < dnp->dn_nblkptr; i++) 978 bp->blk_fill += dnp->dn_blkptr[i].blk_fill; 979} 980 981/* ARGSUSED */ 982static void 983dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) 984{ 985 blkptr_t *bp = zio->io_bp; 986 blkptr_t *bp_orig = &zio->io_bp_orig; 987 objset_t *os = arg; 988 989 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 990 ASSERT(BP_EQUAL(bp, bp_orig)); 991 } else { 992 dsl_dataset_t *ds = os->os_dsl_dataset; 993 dmu_tx_t *tx = os->os_synctx; 994 995 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 996 dsl_dataset_block_born(ds, bp, tx); 997 } 998} 999 1000/* called from dsl */ 1001void 1002dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) 1003{ 1004 int txgoff; 1005 zbookmark_t zb; 1006 zio_prop_t zp; 1007 zio_t *zio; 1008 list_t *list; 1009 list_t *newlist = NULL; 1010 dbuf_dirty_record_t *dr; 1011 1012 dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); 1013 1014 ASSERT(dmu_tx_is_syncing(tx)); 1015 /* XXX the write_done callback should really give us the tx... */ 1016 os->os_synctx = tx; 1017 1018 if (os->os_dsl_dataset == NULL) { 1019 /* 1020 * This is the MOS. If we have upgraded, 1021 * spa_max_replication() could change, so reset 1022 * os_copies here. 1023 */ 1024 os->os_copies = spa_max_replication(os->os_spa); 1025 } 1026 1027 /* 1028 * Create the root block IO 1029 */ 1030 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 1031 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 1032 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 1033 arc_release(os->os_phys_buf, &os->os_phys_buf); 1034 1035 dmu_write_policy(os, NULL, 0, 0, &zp); 1036 1037 zio = arc_write(pio, os->os_spa, tx->tx_txg, 1038 os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), 1039 DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, 1040 NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, 1041 ZIO_FLAG_MUSTSUCCEED, &zb); 1042 1043 /* 1044 * Sync special dnodes - the parent IO for the sync is the root block 1045 */ 1046 DMU_META_DNODE(os)->dn_zio = zio; 1047 dnode_sync(DMU_META_DNODE(os), tx); 1048 1049 os->os_phys->os_flags = os->os_flags; 1050 1051 if (DMU_USERUSED_DNODE(os) && 1052 DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { 1053 DMU_USERUSED_DNODE(os)->dn_zio = zio; 1054 dnode_sync(DMU_USERUSED_DNODE(os), tx); 1055 DMU_GROUPUSED_DNODE(os)->dn_zio = zio; 1056 dnode_sync(DMU_GROUPUSED_DNODE(os), tx); 1057 } 1058 1059 txgoff = tx->tx_txg & TXG_MASK; 1060 1061 if (dmu_objset_userused_enabled(os)) { 1062 newlist = &os->os_synced_dnodes; 1063 /* 1064 * We must create the list here because it uses the 1065 * dn_dirty_link[] of this txg. 1066 */ 1067 list_create(newlist, sizeof (dnode_t), 1068 offsetof(dnode_t, dn_dirty_link[txgoff])); 1069 } 1070 1071 dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); 1072 dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); 1073 1074 list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; 1075 while (dr = list_head(list)) { 1076 ASSERT0(dr->dr_dbuf->db_level); 1077 list_remove(list, dr); 1078 if (dr->dr_zio) 1079 zio_nowait(dr->dr_zio); 1080 } 1081 /* 1082 * Free intent log blocks up to this tx. 1083 */ 1084 zil_sync(os->os_zil, tx); 1085 os->os_phys->os_zil_header = os->os_zil_header; 1086 zio_nowait(zio); 1087} 1088 1089boolean_t 1090dmu_objset_is_dirty(objset_t *os, uint64_t txg) 1091{ 1092 return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || 1093 !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); 1094} 1095 1096static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; 1097 1098void 1099dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) 1100{ 1101 used_cbs[ost] = cb; 1102} 1103 1104boolean_t 1105dmu_objset_userused_enabled(objset_t *os) 1106{ 1107 return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && 1108 used_cbs[os->os_phys->os_type] != NULL && 1109 DMU_USERUSED_DNODE(os) != NULL); 1110} 1111 1112static void 1113do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, 1114 uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) 1115{ 1116 if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { 1117 int64_t delta = DNODE_SIZE + used; 1118 if (subtract) 1119 delta = -delta; 1120 VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, 1121 user, delta, tx)); 1122 VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, 1123 group, delta, tx)); 1124 } 1125} 1126 1127void 1128dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) 1129{ 1130 dnode_t *dn; 1131 list_t *list = &os->os_synced_dnodes; 1132 1133 ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); 1134 1135 while (dn = list_head(list)) { 1136 int flags; 1137 ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); 1138 ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || 1139 dn->dn_phys->dn_flags & 1140 DNODE_FLAG_USERUSED_ACCOUNTED); 1141 1142 /* Allocate the user/groupused objects if necessary. */ 1143 if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { 1144 VERIFY(0 == zap_create_claim(os, 1145 DMU_USERUSED_OBJECT, 1146 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1147 VERIFY(0 == zap_create_claim(os, 1148 DMU_GROUPUSED_OBJECT, 1149 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1150 } 1151 1152 /* 1153 * We intentionally modify the zap object even if the 1154 * net delta is zero. Otherwise 1155 * the block of the zap obj could be shared between 1156 * datasets but need to be different between them after 1157 * a bprewrite. 1158 */ 1159 1160 flags = dn->dn_id_flags; 1161 ASSERT(flags); 1162 if (flags & DN_ID_OLD_EXIST) { 1163 do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, 1164 dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); 1165 } 1166 if (flags & DN_ID_NEW_EXIST) { 1167 do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), 1168 dn->dn_phys->dn_flags, dn->dn_newuid, 1169 dn->dn_newgid, B_FALSE, tx); 1170 } 1171 1172 mutex_enter(&dn->dn_mtx); 1173 dn->dn_oldused = 0; 1174 dn->dn_oldflags = 0; 1175 if (dn->dn_id_flags & DN_ID_NEW_EXIST) { 1176 dn->dn_olduid = dn->dn_newuid; 1177 dn->dn_oldgid = dn->dn_newgid; 1178 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1179 if (dn->dn_bonuslen == 0) 1180 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1181 else 1182 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1183 } 1184 dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); 1185 mutex_exit(&dn->dn_mtx); 1186 1187 list_remove(list, dn); 1188 dnode_rele(dn, list); 1189 } 1190} 1191 1192/* 1193 * Returns a pointer to data to find uid/gid from 1194 * 1195 * If a dirty record for transaction group that is syncing can't 1196 * be found then NULL is returned. In the NULL case it is assumed 1197 * the uid/gid aren't changing. 1198 */ 1199static void * 1200dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) 1201{ 1202 dbuf_dirty_record_t *dr, **drp; 1203 void *data; 1204 1205 if (db->db_dirtycnt == 0) 1206 return (db->db.db_data); /* Nothing is changing */ 1207 1208 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1209 if (dr->dr_txg == tx->tx_txg) 1210 break; 1211 1212 if (dr == NULL) { 1213 data = NULL; 1214 } else { 1215 dnode_t *dn; 1216 1217 DB_DNODE_ENTER(dr->dr_dbuf); 1218 dn = DB_DNODE(dr->dr_dbuf); 1219 1220 if (dn->dn_bonuslen == 0 && 1221 dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) 1222 data = dr->dt.dl.dr_data->b_data; 1223 else 1224 data = dr->dt.dl.dr_data; 1225 1226 DB_DNODE_EXIT(dr->dr_dbuf); 1227 } 1228 1229 return (data); 1230} 1231 1232void 1233dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) 1234{ 1235 objset_t *os = dn->dn_objset; 1236 void *data = NULL; 1237 dmu_buf_impl_t *db = NULL; 1238 uint64_t *user = NULL; 1239 uint64_t *group = NULL; 1240 int flags = dn->dn_id_flags; 1241 int error; 1242 boolean_t have_spill = B_FALSE; 1243 1244 if (!dmu_objset_userused_enabled(dn->dn_objset)) 1245 return; 1246 1247 if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| 1248 DN_ID_CHKED_SPILL))) 1249 return; 1250 1251 if (before && dn->dn_bonuslen != 0) 1252 data = DN_BONUS(dn->dn_phys); 1253 else if (!before && dn->dn_bonuslen != 0) { 1254 if (dn->dn_bonus) { 1255 db = dn->dn_bonus; 1256 mutex_enter(&db->db_mtx); 1257 data = dmu_objset_userquota_find_data(db, tx); 1258 } else { 1259 data = DN_BONUS(dn->dn_phys); 1260 } 1261 } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { 1262 int rf = 0; 1263 1264 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) 1265 rf |= DB_RF_HAVESTRUCT; 1266 error = dmu_spill_hold_by_dnode(dn, 1267 rf | DB_RF_MUST_SUCCEED, 1268 FTAG, (dmu_buf_t **)&db); 1269 ASSERT(error == 0); 1270 mutex_enter(&db->db_mtx); 1271 data = (before) ? db->db.db_data : 1272 dmu_objset_userquota_find_data(db, tx); 1273 have_spill = B_TRUE; 1274 } else { 1275 mutex_enter(&dn->dn_mtx); 1276 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1277 mutex_exit(&dn->dn_mtx); 1278 return; 1279 } 1280 1281 if (before) { 1282 ASSERT(data); 1283 user = &dn->dn_olduid; 1284 group = &dn->dn_oldgid; 1285 } else if (data) { 1286 user = &dn->dn_newuid; 1287 group = &dn->dn_newgid; 1288 } 1289 1290 /* 1291 * Must always call the callback in case the object 1292 * type has changed and that type isn't an object type to track 1293 */ 1294 error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, 1295 user, group); 1296 1297 /* 1298 * Preserve existing uid/gid when the callback can't determine 1299 * what the new uid/gid are and the callback returned EEXIST. 1300 * The EEXIST error tells us to just use the existing uid/gid. 1301 * If we don't know what the old values are then just assign 1302 * them to 0, since that is a new file being created. 1303 */ 1304 if (!before && data == NULL && error == EEXIST) { 1305 if (flags & DN_ID_OLD_EXIST) { 1306 dn->dn_newuid = dn->dn_olduid; 1307 dn->dn_newgid = dn->dn_oldgid; 1308 } else { 1309 dn->dn_newuid = 0; 1310 dn->dn_newgid = 0; 1311 } 1312 error = 0; 1313 } 1314 1315 if (db) 1316 mutex_exit(&db->db_mtx); 1317 1318 mutex_enter(&dn->dn_mtx); 1319 if (error == 0 && before) 1320 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1321 if (error == 0 && !before) 1322 dn->dn_id_flags |= DN_ID_NEW_EXIST; 1323 1324 if (have_spill) { 1325 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1326 } else { 1327 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1328 } 1329 mutex_exit(&dn->dn_mtx); 1330 if (have_spill) 1331 dmu_buf_rele((dmu_buf_t *)db, FTAG); 1332} 1333 1334boolean_t 1335dmu_objset_userspace_present(objset_t *os) 1336{ 1337 return (os->os_phys->os_flags & 1338 OBJSET_FLAG_USERACCOUNTING_COMPLETE); 1339} 1340 1341int 1342dmu_objset_userspace_upgrade(objset_t *os) 1343{ 1344 uint64_t obj; 1345 int err = 0; 1346 1347 if (dmu_objset_userspace_present(os)) 1348 return (0); 1349 if (!dmu_objset_userused_enabled(os)) 1350 return (SET_ERROR(ENOTSUP)); 1351 if (dmu_objset_is_snapshot(os)) 1352 return (SET_ERROR(EINVAL)); 1353 1354 /* 1355 * We simply need to mark every object dirty, so that it will be 1356 * synced out and now accounted. If this is called 1357 * concurrently, or if we already did some work before crashing, 1358 * that's fine, since we track each object's accounted state 1359 * independently. 1360 */ 1361 1362 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 1363 dmu_tx_t *tx; 1364 dmu_buf_t *db; 1365 int objerr; 1366 1367 if (issig(JUSTLOOKING) && issig(FORREAL)) 1368 return (SET_ERROR(EINTR)); 1369 1370 objerr = dmu_bonus_hold(os, obj, FTAG, &db); 1371 if (objerr != 0) 1372 continue; 1373 tx = dmu_tx_create(os); 1374 dmu_tx_hold_bonus(tx, obj); 1375 objerr = dmu_tx_assign(tx, TXG_WAIT); 1376 if (objerr != 0) { 1377 dmu_tx_abort(tx); 1378 continue; 1379 } 1380 dmu_buf_will_dirty(db, tx); 1381 dmu_buf_rele(db, FTAG); 1382 dmu_tx_commit(tx); 1383 } 1384 1385 os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; 1386 txg_wait_synced(dmu_objset_pool(os), 0); 1387 return (0); 1388} 1389 1390void 1391dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, 1392 uint64_t *usedobjsp, uint64_t *availobjsp) 1393{ 1394 dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, 1395 usedobjsp, availobjsp); 1396} 1397 1398uint64_t 1399dmu_objset_fsid_guid(objset_t *os) 1400{ 1401 return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); 1402} 1403 1404void 1405dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) 1406{ 1407 stat->dds_type = os->os_phys->os_type; 1408 if (os->os_dsl_dataset) 1409 dsl_dataset_fast_stat(os->os_dsl_dataset, stat); 1410} 1411 1412void 1413dmu_objset_stats(objset_t *os, nvlist_t *nv) 1414{ 1415 ASSERT(os->os_dsl_dataset || 1416 os->os_phys->os_type == DMU_OST_META); 1417 1418 if (os->os_dsl_dataset != NULL) 1419 dsl_dataset_stats(os->os_dsl_dataset, nv); 1420 1421 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, 1422 os->os_phys->os_type); 1423 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, 1424 dmu_objset_userspace_present(os)); 1425} 1426 1427int 1428dmu_objset_is_snapshot(objset_t *os) 1429{ 1430 if (os->os_dsl_dataset != NULL) 1431 return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); 1432 else 1433 return (B_FALSE); 1434} 1435 1436int 1437dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, 1438 boolean_t *conflict) 1439{ 1440 dsl_dataset_t *ds = os->os_dsl_dataset; 1441 uint64_t ignored; 1442 1443 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1444 return (SET_ERROR(ENOENT)); 1445 1446 return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, 1447 ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, 1448 real, maxlen, conflict)); 1449} 1450 1451int 1452dmu_snapshot_list_next(objset_t *os, int namelen, char *name, 1453 uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) 1454{ 1455 dsl_dataset_t *ds = os->os_dsl_dataset; 1456 zap_cursor_t cursor; 1457 zap_attribute_t attr; 1458 1459 ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); 1460 1461 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1462 return (SET_ERROR(ENOENT)); 1463 1464 zap_cursor_init_serialized(&cursor, 1465 ds->ds_dir->dd_pool->dp_meta_objset, 1466 ds->ds_phys->ds_snapnames_zapobj, *offp); 1467 1468 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1469 zap_cursor_fini(&cursor); 1470 return (SET_ERROR(ENOENT)); 1471 } 1472 1473 if (strlen(attr.za_name) + 1 > namelen) { 1474 zap_cursor_fini(&cursor); 1475 return (SET_ERROR(ENAMETOOLONG)); 1476 } 1477 1478 (void) strcpy(name, attr.za_name); 1479 if (idp) 1480 *idp = attr.za_first_integer; 1481 if (case_conflict) 1482 *case_conflict = attr.za_normalization_conflict; 1483 zap_cursor_advance(&cursor); 1484 *offp = zap_cursor_serialize(&cursor); 1485 zap_cursor_fini(&cursor); 1486 1487 return (0); 1488} 1489 1490int 1491dmu_dir_list_next(objset_t *os, int namelen, char *name, 1492 uint64_t *idp, uint64_t *offp) 1493{ 1494 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; 1495 zap_cursor_t cursor; 1496 zap_attribute_t attr; 1497 1498 /* there is no next dir on a snapshot! */ 1499 if (os->os_dsl_dataset->ds_object != 1500 dd->dd_phys->dd_head_dataset_obj) 1501 return (SET_ERROR(ENOENT)); 1502 1503 zap_cursor_init_serialized(&cursor, 1504 dd->dd_pool->dp_meta_objset, 1505 dd->dd_phys->dd_child_dir_zapobj, *offp); 1506 1507 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1508 zap_cursor_fini(&cursor); 1509 return (SET_ERROR(ENOENT)); 1510 } 1511 1512 if (strlen(attr.za_name) + 1 > namelen) { 1513 zap_cursor_fini(&cursor); 1514 return (SET_ERROR(ENAMETOOLONG)); 1515 } 1516 1517 (void) strcpy(name, attr.za_name); 1518 if (idp) 1519 *idp = attr.za_first_integer; 1520 zap_cursor_advance(&cursor); 1521 *offp = zap_cursor_serialize(&cursor); 1522 zap_cursor_fini(&cursor); 1523 1524 return (0); 1525} 1526 1527/* 1528 * Find objsets under and including ddobj, call func(ds) on each. 1529 */ 1530int 1531dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, 1532 int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) 1533{ 1534 dsl_dir_t *dd; 1535 dsl_dataset_t *ds; 1536 zap_cursor_t zc; 1537 zap_attribute_t *attr; 1538 uint64_t thisobj; 1539 int err; 1540 1541 ASSERT(dsl_pool_config_held(dp)); 1542 1543 err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); 1544 if (err != 0) 1545 return (err); 1546 1547 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ 1548 if (dd->dd_myname[0] == '$') { 1549 dsl_dir_rele(dd, FTAG); 1550 return (0); 1551 } 1552 1553 thisobj = dd->dd_phys->dd_head_dataset_obj; 1554 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 1555 1556 /* 1557 * Iterate over all children. 1558 */ 1559 if (flags & DS_FIND_CHILDREN) { 1560 for (zap_cursor_init(&zc, dp->dp_meta_objset, 1561 dd->dd_phys->dd_child_dir_zapobj); 1562 zap_cursor_retrieve(&zc, attr) == 0; 1563 (void) zap_cursor_advance(&zc)) { 1564 ASSERT3U(attr->za_integer_length, ==, 1565 sizeof (uint64_t)); 1566 ASSERT3U(attr->za_num_integers, ==, 1); 1567 1568 err = dmu_objset_find_dp(dp, attr->za_first_integer, 1569 func, arg, flags); 1570 if (err != 0) 1571 break; 1572 } 1573 zap_cursor_fini(&zc); 1574 1575 if (err != 0) { 1576 dsl_dir_rele(dd, FTAG); 1577 kmem_free(attr, sizeof (zap_attribute_t)); 1578 return (err); 1579 } 1580 } 1581 1582 /* 1583 * Iterate over all snapshots. 1584 */ 1585 if (flags & DS_FIND_SNAPSHOTS) { 1586 dsl_dataset_t *ds; 1587 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1588 1589 if (err == 0) { 1590 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 1591 dsl_dataset_rele(ds, FTAG); 1592 1593 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); 1594 zap_cursor_retrieve(&zc, attr) == 0; 1595 (void) zap_cursor_advance(&zc)) { 1596 ASSERT3U(attr->za_integer_length, ==, 1597 sizeof (uint64_t)); 1598 ASSERT3U(attr->za_num_integers, ==, 1); 1599 1600 err = dsl_dataset_hold_obj(dp, 1601 attr->za_first_integer, FTAG, &ds); 1602 if (err != 0) 1603 break; 1604 err = func(dp, ds, arg); 1605 dsl_dataset_rele(ds, FTAG); 1606 if (err != 0) 1607 break; 1608 } 1609 zap_cursor_fini(&zc); 1610 } 1611 } 1612 1613 dsl_dir_rele(dd, FTAG); 1614 kmem_free(attr, sizeof (zap_attribute_t)); 1615 1616 if (err != 0) 1617 return (err); 1618 1619 /* 1620 * Apply to self. 1621 */ 1622 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1623 if (err != 0) 1624 return (err); 1625 err = func(dp, ds, arg); 1626 dsl_dataset_rele(ds, FTAG); 1627 return (err); 1628} 1629 1630/* 1631 * Find all objsets under name, and for each, call 'func(child_name, arg)'. 1632 * The dp_config_rwlock must not be held when this is called, and it 1633 * will not be held when the callback is called. 1634 * Therefore this function should only be used when the pool is not changing 1635 * (e.g. in syncing context), or the callback can deal with the possible races. 1636 */ 1637static int 1638dmu_objset_find_impl(spa_t *spa, const char *name, 1639 int func(const char *, void *), void *arg, int flags) 1640{ 1641 dsl_dir_t *dd; 1642 dsl_pool_t *dp = spa_get_dsl(spa); 1643 dsl_dataset_t *ds; 1644 zap_cursor_t zc; 1645 zap_attribute_t *attr; 1646 char *child; 1647 uint64_t thisobj; 1648 int err; 1649 1650 dsl_pool_config_enter(dp, FTAG); 1651 1652 err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); 1653 if (err != 0) { 1654 dsl_pool_config_exit(dp, FTAG); 1655 return (err); 1656 } 1657 1658 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ 1659 if (dd->dd_myname[0] == '$') { 1660 dsl_dir_rele(dd, FTAG); 1661 dsl_pool_config_exit(dp, FTAG); 1662 return (0); 1663 } 1664 1665 thisobj = dd->dd_phys->dd_head_dataset_obj; 1666 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 1667 1668 /* 1669 * Iterate over all children. 1670 */ 1671 if (flags & DS_FIND_CHILDREN) { 1672 for (zap_cursor_init(&zc, dp->dp_meta_objset, 1673 dd->dd_phys->dd_child_dir_zapobj); 1674 zap_cursor_retrieve(&zc, attr) == 0; 1675 (void) zap_cursor_advance(&zc)) { 1676 ASSERT3U(attr->za_integer_length, ==, 1677 sizeof (uint64_t)); 1678 ASSERT3U(attr->za_num_integers, ==, 1); 1679 1680 child = kmem_asprintf("%s/%s", name, attr->za_name); 1681 dsl_pool_config_exit(dp, FTAG); 1682 err = dmu_objset_find_impl(spa, child, 1683 func, arg, flags); 1684 dsl_pool_config_enter(dp, FTAG); 1685 strfree(child); 1686 if (err != 0) 1687 break; 1688 } 1689 zap_cursor_fini(&zc); 1690 1691 if (err != 0) { 1692 dsl_dir_rele(dd, FTAG); 1693 dsl_pool_config_exit(dp, FTAG); 1694 kmem_free(attr, sizeof (zap_attribute_t)); 1695 return (err); 1696 } 1697 } 1698 1699 /* 1700 * Iterate over all snapshots. 1701 */ 1702 if (flags & DS_FIND_SNAPSHOTS) { 1703 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1704 1705 if (err == 0) { 1706 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 1707 dsl_dataset_rele(ds, FTAG); 1708 1709 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); 1710 zap_cursor_retrieve(&zc, attr) == 0; 1711 (void) zap_cursor_advance(&zc)) { 1712 ASSERT3U(attr->za_integer_length, ==, 1713 sizeof (uint64_t)); 1714 ASSERT3U(attr->za_num_integers, ==, 1); 1715 1716 child = kmem_asprintf("%s@%s", 1717 name, attr->za_name); 1718 dsl_pool_config_exit(dp, FTAG); 1719 err = func(child, arg); 1720 dsl_pool_config_enter(dp, FTAG); 1721 strfree(child); 1722 if (err != 0) 1723 break; 1724 } 1725 zap_cursor_fini(&zc); 1726 } 1727 } 1728 1729 dsl_dir_rele(dd, FTAG); 1730 kmem_free(attr, sizeof (zap_attribute_t)); 1731 dsl_pool_config_exit(dp, FTAG); 1732 1733 if (err != 0) 1734 return (err); 1735 1736 /* Apply to self. */ 1737 return (func(name, arg)); 1738} 1739 1740/* 1741 * See comment above dmu_objset_find_impl(). 1742 */ 1743int 1744dmu_objset_find(char *name, int func(const char *, void *), void *arg, 1745 int flags) 1746{ 1747 spa_t *spa; 1748 int error; 1749 1750 error = spa_open(name, &spa, FTAG); 1751 if (error != 0) 1752 return (error); 1753 error = dmu_objset_find_impl(spa, name, func, arg, flags); 1754 spa_close(spa, FTAG); 1755 return (error); 1756} 1757 1758void 1759dmu_objset_set_user(objset_t *os, void *user_ptr) 1760{ 1761 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1762 os->os_user_ptr = user_ptr; 1763} 1764 1765void * 1766dmu_objset_get_user(objset_t *os) 1767{ 1768 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1769 return (os->os_user_ptr); 1770} 1771 1772/* 1773 * Determine name of filesystem, given name of snapshot. 1774 * buf must be at least MAXNAMELEN bytes 1775 */ 1776int 1777dmu_fsname(const char *snapname, char *buf) 1778{ 1779 char *atp = strchr(snapname, '@'); 1780 if (atp == NULL) 1781 return (SET_ERROR(EINVAL)); 1782 if (atp - snapname >= MAXNAMELEN) 1783 return (SET_ERROR(ENAMETOOLONG)); 1784 (void) strlcpy(buf, snapname, atp - snapname + 1); 1785 return (0); 1786} 1787