dmu_objset.c revision 219089
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/sa.h>
#include <sys/zfs_onexit.h>

/*
 * Needed to close a window in dnode_move() that allows the objset to be freed
 * before it can be safely accessed.
 */
krwlock_t os_lock;

void
dmu_objset_init(void)
{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}

void
dmu_objset_fini(void)
{
	rw_destroy(&os_lock);
}
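/*
 * Sketch of the pattern this lock supports (the mover side lives in
 * dnode.c, so the exact caller is outside this file): a thread that
 * needs dn->dn_objset to stay valid pins all objsets by taking os_lock,
 * and dmu_objset_evict() drains such threads with an empty
 * rw_enter()/rw_exit() pair before freeing the objset:
 *
 *	rw_enter(&os_lock, RW_WRITER);	// assumed mover side; see dnode.c
 *	// ... dereference dn->dn_objset safely ...
 *	rw_exit(&os_lock);
 */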
spa_t *
dmu_objset_spa(objset_t *os)
{
	return (os->os_spa);
}

zilog_t *
dmu_objset_zil(objset_t *os)
{
	return (os->os_zil);
}

dsl_pool_t *
dmu_objset_pool(objset_t *os)
{
	dsl_dataset_t *ds;

	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
		return (ds->ds_dir->dd_pool);
	else
		return (spa_get_dsl(os->os_spa));
}

dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
	return (os->os_dsl_dataset);
}

dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
	return (os->os_phys->os_type);
}

void
dmu_objset_name(objset_t *os, char *buf)
{
	dsl_dataset_name(os->os_dsl_dataset, buf);
}

uint64_t
dmu_objset_id(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	return (ds ? ds->ds_object : 0);
}

uint64_t
dmu_objset_syncprop(objset_t *os)
{
	return (os->os_sync);
}

uint64_t
dmu_objset_logbias(objset_t *os)
{
	return (os->os_logbias);
}

static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}

static void
compression_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
}

static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}

static void
dedup_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;
	spa_t *spa = os->os_spa;
	enum zio_checksum checksum;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}
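/*
 * Illustration (a sketch, not code from this file): the value produced by
 * zio_checksum_dedup_select() packs a checksum enum plus an optional
 * ZIO_CHECKSUM_VERIFY flag bit, so a hypothetical "dedup=sha256,verify"
 * setting would decode through the two masks above as:
 *
 *	os->os_dedup_checksum = ZIO_CHECKSUM_SHA256;	// low bits
 *	os->os_dedup_verify = 1;			// VERIFY flag was set
 */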
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
}

static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
}

static void
sync_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
	    newval == ZFS_SYNC_DISABLED);

	os->os_sync = newval;
	if (os->os_zil)
		zil_set_sync(os->os_zil, newval);
}

static void
logbias_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
	    newval == ZFS_LOGBIAS_THROUGHPUT);
	os->os_logbias = newval;
	if (os->os_zil)
		zil_set_logbias(os->os_zil, newval);
}

void
dmu_objset_byteswap(void *buf, size_t size)
{
	objset_phys_t *osp = buf;

	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
	dnode_byteswap(&osp->os_meta_dnode);
	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
	osp->os_type = BSWAP_64(osp->os_type);
	osp->os_flags = BSWAP_64(osp->os_flags);
	if (size == sizeof (objset_phys_t)) {
		dnode_byteswap(&osp->os_userused_dnode);
		dnode_byteswap(&osp->os_groupused_dnode);
	}
}
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));

	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	if (!BP_IS_HOLE(os->os_rootbp)) {
		uint32_t aflags = ARC_WAIT;
		zbookmark_t zb;
		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

		if (DMU_OS_IS_L2CACHEABLE(os))
			aflags |= ARC_L2CACHE;

		dprintf_bp(os->os_rootbp, "reading %s", "");
		/*
		 * XXX when bprewrite scrub can change the bp,
		 * and this is called from dmu_objset_open_ds_os, the bp
		 * could change, and we'll need a lock.
		 */
		err = dsl_read_nolock(NULL, spa, os->os_rootbp,
		    arc_getbuf_func, &os->os_phys_buf,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
		if (err) {
			kmem_free(os, sizeof (objset_t));
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = EIO;
			return (err);
		}

		/* Increase the blocksize if we are permitted. */
		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
			arc_buf_t *buf = arc_buf_alloc(spa,
			    sizeof (objset_phys_t), &os->os_phys_buf,
			    ARC_BUFC_METADATA);
			bzero(buf->b_data, sizeof (objset_phys_t));
			bcopy(os->os_phys_buf->b_data, buf->b_data,
			    arc_buf_size(os->os_phys_buf));
			(void) arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf);
			os->os_phys_buf = buf;
		}

		os->os_phys = os->os_phys_buf->b_data;
		os->os_flags = os->os_phys->os_flags;
	} else {
		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
		os->os_phys_buf = arc_buf_alloc(spa, size,
		    &os->os_phys_buf, ARC_BUFC_METADATA);
		os->os_phys = os->os_phys_buf->b_data;
		bzero(os->os_phys, size);
	}

	/*
	 * Note: the changed_cb will be called once before the register
	 * func returns, thus changing the checksum/compression from the
	 * default (fletcher2/off).  Snapshots don't need to know about
	 * checksum/compression/copies.
	 */
	if (ds) {
		err = dsl_prop_register(ds, "primarycache",
		    primary_cache_changed_cb, os);
		if (err == 0)
			err = dsl_prop_register(ds, "secondarycache",
			    secondary_cache_changed_cb, os);
		if (!dsl_dataset_is_snapshot(ds)) {
			if (err == 0)
				err = dsl_prop_register(ds, "checksum",
				    checksum_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "compression",
				    compression_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "copies",
				    copies_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "dedup",
				    dedup_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "logbias",
				    logbias_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "sync",
				    sync_changed_cb, os);
		}
		if (err) {
			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf) == 1);
			kmem_free(os, sizeof (objset_t));
			return (err);
		}
	} else if (ds == NULL) {
		/* It's the meta-objset. */
		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
		os->os_compress = ZIO_COMPRESS_LZJB;
		os->os_copies = spa_max_replication(spa);
		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
		os->os_dedup_verify = 0;
		os->os_logbias = 0;
		os->os_sync = 0;
		os->os_primary_cache = ZFS_CACHE_ALL;
		os->os_secondary_cache = ZFS_CACHE_ALL;
	}

	if (ds == NULL || !dsl_dataset_is_snapshot(ds))
		os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	for (i = 0; i < TXG_SIZE; i++) {
		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	    offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

	DMU_META_DNODE(os) = dnode_special_open(os,
	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
	    &os->os_meta_dnode);
	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
		    &os->os_userused_dnode);
		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
		    &os->os_groupused_dnode);
	}

	/*
	 * We should be the only thread trying to do this because we
	 * have ds_opening_lock
	 */
	if (ds) {
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_objset == NULL);
		ds->ds_objset = os;
		mutex_exit(&ds->ds_lock);
	}

	*osp = os;
	return (0);
}

int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
	int err = 0;

	mutex_enter(&ds->ds_opening_lock);
	*osp = ds->ds_objset;
	if (*osp == NULL) {
		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
		    ds, dsl_dataset_get_blkptr(ds), osp);
	}
	mutex_exit(&ds->ds_opening_lock);
	return (err);
}

/* called from zpl */
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(name, tag, &ds);
	if (err)
		return (err);

	err = dmu_objset_from_ds(ds, osp);
	if (err)
		dsl_dataset_rele(ds, tag);

	return (err);
}

/* called from zpl */
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, void *tag, objset_t **osp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_own(name, B_FALSE, tag, &ds);
	if (err)
		return (err);

	err = dmu_objset_from_ds(ds, osp);
	if (err) {
		dsl_dataset_disown(ds, tag);
	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
		dmu_objset_disown(*osp, tag);
		return (EINVAL);
	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
		dmu_objset_disown(*osp, tag);
		return (EROFS);
	}
	return (err);
}

void
dmu_objset_rele(objset_t *os, void *tag)
{
	dsl_dataset_rele(os->os_dsl_dataset, tag);
}

void
dmu_objset_disown(objset_t *os, void *tag)
{
	dsl_dataset_disown(os->os_dsl_dataset, tag);
}
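/*
 * Usage sketch (illustrative only, not part of this file): a short-lived
 * reader uses the hold/rele pair, while a long-lived consumer such as a
 * mounted filesystem uses own/disown, which additionally enforces the
 * expected objset type and the snapshot read-only rule:
 *
 *	objset_t *os;
 *	if (dmu_objset_hold("pool/fs", FTAG, &os) == 0) {
 *		// ... brief read-only inspection of os ...
 *		dmu_objset_rele(os, FTAG);
 *	}
 *
 *	if (dmu_objset_own("pool/fs", DMU_OST_ZFS, B_FALSE, FTAG, &os) == 0) {
 *		// long-lived use; fails with EINVAL on a type mismatch
 *		// and EROFS for a writable (readonly = B_FALSE) hold on
 *		// a snapshot
 *		dmu_objset_disown(os, FTAG);
 *	}
 */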
int
dmu_objset_evict_dbufs(objset_t *os)
{
	dnode_t *dn;

	mutex_enter(&os->os_lock);

	/* process the mdn last, since the other dnodes have holds on it */
	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));

	/*
	 * Find the first dnode with holds.  We have to do this dance
	 * because dnode_add_ref() only works if you already have a
	 * hold.  If there are no holds then it has no dbufs so OK to
	 * skip.
	 */
	for (dn = list_head(&os->os_dnodes);
	    dn && !dnode_add_ref(dn, FTAG);
	    dn = list_next(&os->os_dnodes, dn))
		continue;

	while (dn) {
		dnode_t *next_dn = dn;

		do {
			next_dn = list_next(&os->os_dnodes, next_dn);
		} while (next_dn && !dnode_add_ref(next_dn, FTAG));

		mutex_exit(&os->os_lock);
		dnode_evict_dbufs(dn);
		dnode_rele(dn, FTAG);
		mutex_enter(&os->os_lock);
		dn = next_dn;
	}
	dn = list_head(&os->os_dnodes);
	mutex_exit(&os->os_lock);
	return (dn != DMU_META_DNODE(os));
}
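/*
 * Note on the walk above: the next dnode is pinned with dnode_add_ref()
 * before os_lock is dropped to call dnode_evict_dbufs(), so the walker's
 * list position cannot be freed out from under it while the lock is
 * released.  The return value reports whether any dnode other than the
 * meta-dnode survived, i.e. whether eviction is incomplete.
 */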
void
dmu_objset_evict(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	for (int t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds) {
		if (!dsl_dataset_is_snapshot(ds)) {
			VERIFY(0 == dsl_prop_unregister(ds, "checksum",
			    checksum_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "compression",
			    compression_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "copies",
			    copies_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "dedup",
			    dedup_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "logbias",
			    logbias_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "sync",
			    sync_changed_cb, os));
		}
		VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
		    primary_cache_changed_cb, os));
		VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
		    secondary_cache_changed_cb, os));
	}

	if (os->os_sa)
		sa_tear_down(os);

	/*
	 * We should need only a single pass over the dnode list, since
	 * nothing can be added to the list at this point.
	 */
	(void) dmu_objset_evict_dbufs(os);

	dnode_special_close(&os->os_meta_dnode);
	if (DMU_USERUSED_DNODE(os)) {
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use.  We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	kmem_free(os, sizeof (objset_t));
}

timestruc_t
dmu_objset_snap_cmtime(objset_t *os)
{
	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}

/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
	objset_t *os;
	dnode_t *mdn;

	ASSERT(dmu_tx_is_syncing(tx));
	if (ds != NULL)
		VERIFY(0 == dmu_objset_from_ds(ds, &os));
	else
		VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));

	mdn = DMU_META_DNODE(os);

	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);

	/*
	 * We don't want to have to increase the meta-dnode's nlevels
	 * later, because then we could do it in quiescing context while
	 * we are also accessing it in open context.
	 *
	 * This precaution is not necessary for the MOS (ds == NULL),
	 * because the MOS is only updated in syncing context.
	 * This is most fortunate: the MOS is the only objset that
	 * needs to be synced multiple times as spa_sync() iterates
	 * to convergence, so minimizing its dn_nlevels matters.
	 */
	if (ds != NULL) {
		int levels = 1;

		/*
		 * Determine the number of levels necessary for the meta-dnode
		 * to contain DN_MAX_OBJECT dnodes.
		 */
		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
			levels++;

		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
		    mdn->dn_nlevels = levels;
	}

	ASSERT(type != DMU_OST_NONE);
	ASSERT(type != DMU_OST_ANY);
	ASSERT(type < DMU_OST_NUMTYPES);
	os->os_phys->os_type = type;
	if (dmu_objset_userused_enabled(os)) {
		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		os->os_flags = os->os_phys->os_flags;
	}

	dsl_dataset_dirty(ds, tx);

	return (os);
}
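/*
 * Worked example for the levels loop above (constants as of this era of
 * the code, so treat the numbers as illustrative): with dn_nblkptr = 3,
 * 16K meta-dnode blocks (dn_datablkshift = 14), 16K indirect blocks
 * (dn_indblkshift = 14) of 128-byte block pointers (SPA_BLKPTRSHIFT = 7),
 * each indirect level multiplies capacity by 2^(14 - 7) = 128.  The
 * target is DN_MAX_OBJECT (2^48) dnodes of 512 bytes each, i.e. 2^57
 * bytes; 3 << (14 + 6 * 7) = 3 * 2^56 is the first value to reach that,
 * so the loop settles on levels = 7.
 */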
struct oscarg {
	void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
	void *userarg;
	dsl_dataset_t *clone_origin;
	const char *lastname;
	dmu_objset_type_t type;
	uint64_t flags;
	cred_t *cr;
};

/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct oscarg *oa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	int err;
	uint64_t ddobj;

	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    oa->lastname, sizeof (uint64_t), 1, &ddobj);
	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (oa->clone_origin != NULL) {
		/* You can't clone across pools. */
		if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);

		/* You can only clone snapshots, not the head datasets. */
		if (!dsl_dataset_is_snapshot(oa->clone_origin))
			return (EINVAL);
	}

	return (0);
}

static void
dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	spa_t *spa = dd->dd_pool->dp_spa;
	struct oscarg *oa = arg2;
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	obj = dsl_dataset_create_sync(dd, oa->lastname,
	    oa->clone_origin, oa->flags, oa->cr, tx);

	if (oa->clone_origin == NULL) {
		dsl_pool_t *dp = dd->dd_pool;
		dsl_dataset_t *ds;
		blkptr_t *bp;
		objset_t *os;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
		bp = dsl_dataset_get_blkptr(ds);
		ASSERT(BP_IS_HOLE(bp));

		os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx);

		if (oa->userfunc)
			oa->userfunc(os, oa->userarg, oa->cr, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj);
}

int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
	dsl_dir_t *pdd;
	const char *tail;
	int err = 0;
	struct oscarg oa = { 0 };

	ASSERT(strchr(name, '@') == NULL);
	err = dsl_dir_open(name, FTAG, &pdd, &tail);
	if (err)
		return (err);
	if (tail == NULL) {
		dsl_dir_close(pdd, FTAG);
		return (EEXIST);
	}

	oa.userfunc = func;
	oa.userarg = arg;
	oa.lastname = tail;
	oa.type = type;
	oa.flags = flags;
	oa.cr = CRED();

	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
	    dmu_objset_create_sync, pdd, &oa, 5);
	dsl_dir_close(pdd, FTAG);
	return (err);
}

int
dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
{
	dsl_dir_t *pdd;
	const char *tail;
	int err = 0;
	struct oscarg oa = { 0 };

	ASSERT(strchr(name, '@') == NULL);
	err = dsl_dir_open(name, FTAG, &pdd, &tail);
	if (err)
		return (err);
	if (tail == NULL) {
		dsl_dir_close(pdd, FTAG);
		return (EEXIST);
	}

	oa.lastname = tail;
	oa.clone_origin = clone_origin;
	oa.flags = flags;
	oa.cr = CRED();

	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
	    dmu_objset_create_sync, pdd, &oa, 5);
	dsl_dir_close(pdd, FTAG);
	return (err);
}

int
dmu_objset_destroy(const char *name, boolean_t defer)
{
	dsl_dataset_t *ds;
	int error;

	error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
	if (error == 0) {
		error = dsl_dataset_destroy(ds, FTAG, defer);
		/* dsl_dataset_destroy() closes the ds. */
	}

	return (error);
}
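/*
 * Usage sketch (hypothetical caller, not from this file): a consumer
 * passes a callback that populates the new objset inside the same
 * syncing tx that creates it:
 *
 *	static void
 *	myfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 *	{
 *		// allocate root objects, fill in bonus data, etc., using tx
 *	}
 *
 *	err = dmu_objset_create("pool/newfs", DMU_OST_ZFS, 0,
 *	    myfs_create_cb, NULL);
 */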
struct snaparg {
	dsl_sync_task_group_t *dstg;
	char *snapname;
	char *htag;
	char failed[MAXPATHLEN];
	boolean_t recursive;
	boolean_t needsuspend;
	boolean_t temporary;
	nvlist_t *props;
	struct dsl_ds_holdarg *ha;	/* only needed in the temporary case */
	dsl_dataset_t *newds;
};

static int
snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	objset_t *os = arg1;
	struct snaparg *sn = arg2;
	int error;

	/* The props have already been checked by zfs_check_userprops(). */

	error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
	    sn->snapname, tx);
	if (error)
		return (error);

	if (sn->temporary) {
		/*
		 * Ideally we would just call
		 * dsl_dataset_user_hold_check() and
		 * dsl_dataset_destroy_check() here.  However the
		 * dataset we want to hold and destroy is the snapshot
		 * that we just confirmed we can create, but it won't
		 * exist until after these checks are run.  Do any
		 * checks we can here and if more checks are added to
		 * those routines in the future, similar checks may be
		 * necessary here.
		 */
		if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
			return (ENOTSUP);
		/*
		 * Not checking number of tags because the tag will be
		 * unique, as it will be the only tag.
		 */
		if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
			return (E2BIG);

		sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
		sn->ha->temphold = B_TRUE;
		sn->ha->htag = sn->htag;
	}
	return (error);
}

static void
snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	objset_t *os = arg1;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	struct snaparg *sn = arg2;

	dsl_dataset_snapshot_sync(ds, sn->snapname, tx);

	if (sn->props) {
		dsl_props_arg_t pa;
		pa.pa_props = sn->props;
		pa.pa_source = ZPROP_SRC_LOCAL;
		dsl_props_set_sync(ds->ds_prev, &pa, tx);
	}

	if (sn->temporary) {
		struct dsl_ds_destroyarg da;

		dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx);
		kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg));
		sn->ha = NULL;
		sn->newds = ds->ds_prev;

		da.ds = ds->ds_prev;
		da.defer = B_TRUE;
		dsl_dataset_destroy_sync(&da, FTAG, tx);
	}
}

static int
dmu_objset_snapshot_one(const char *name, void *arg)
{
	struct snaparg *sn = arg;
	objset_t *os;
	int err;
	char *cp;

	/*
	 * If the objset starts with a '%', then ignore it unless it was
	 * explicitly named (ie, not recursive).  These hidden datasets
	 * are always inconsistent, and by not opening them here, we can
	 * avoid a race with dsl_dir_destroy_check().
	 */
	cp = strrchr(name, '/');
	if (cp && cp[1] == '%' && sn->recursive)
		return (0);

	(void) strcpy(sn->failed, name);

	/*
	 * Check permissions if we are doing a recursive snapshot.  The
	 * permission checks for the starting dataset have already been
	 * performed in zfs_secpolicy_snapshot()
	 */
	if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED())))
		return (err);

	err = dmu_objset_hold(name, sn, &os);
	if (err != 0)
		return (err);

	/*
	 * If the objset is in an inconsistent state (eg, in the process
	 * of being destroyed), don't snapshot it.  As with %hidden
	 * datasets, we return EBUSY if this name was explicitly
	 * requested (ie, not recursive), and otherwise ignore it.
	 */
	if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
		dmu_objset_rele(os, sn);
		return (sn->recursive ? 0 : EBUSY);
	}

	if (sn->needsuspend) {
		err = zil_suspend(dmu_objset_zil(os));
		if (err) {
			dmu_objset_rele(os, sn);
			return (err);
		}
	}
	dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync,
	    os, sn, 3);

	return (0);
}
int
dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
    nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd)
{
	dsl_sync_task_t *dst;
	struct snaparg sn;
	spa_t *spa;
	minor_t minor;
	int err;

	(void) strcpy(sn.failed, fsname);

	err = spa_open(fsname, &spa, FTAG);
	if (err)
		return (err);

	if (temporary) {
		if (cleanup_fd < 0) {
			spa_close(spa, FTAG);
			return (EINVAL);
		}
		if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
			spa_close(spa, FTAG);
			return (err);
		}
	}

	sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	sn.snapname = snapname;
	sn.htag = tag;
	sn.props = props;
	sn.recursive = recursive;
	sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
	sn.temporary = temporary;
	sn.ha = NULL;
	sn.newds = NULL;

	if (recursive) {
		err = dmu_objset_find(fsname,
		    dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
	} else {
		err = dmu_objset_snapshot_one(fsname, &sn);
	}

	if (err == 0)
		err = dsl_sync_task_group_wait(sn.dstg);

	for (dst = list_head(&sn.dstg->dstg_tasks); dst;
	    dst = list_next(&sn.dstg->dstg_tasks, dst)) {
		objset_t *os = dst->dst_arg1;
		dsl_dataset_t *ds = os->os_dsl_dataset;
		if (dst->dst_err) {
			dsl_dataset_name(ds, sn.failed);
		} else if (temporary) {
			dsl_register_onexit_hold_cleanup(sn.newds, tag, minor);
		}
		if (sn.needsuspend)
			zil_resume(dmu_objset_zil(os));
#ifdef __FreeBSD__
#ifdef _KERNEL
		if (dst->dst_err == 0 && dmu_objset_type(os) == DMU_OST_ZVOL) {
			char name[MAXNAMELEN];

			dmu_objset_name(os, name);
			strlcat(name, "@", sizeof(name));
			strlcat(name, snapname, sizeof(name));
			zvol_create_minors(name);
		}
#endif
#endif
		dmu_objset_rele(os, &sn);
	}

	if (err)
		(void) strcpy(fsname, sn.failed);
	if (temporary)
		zfs_onexit_fd_rele(cleanup_fd);
	dsl_sync_task_group_destroy(sn.dstg);
	spa_close(spa, FTAG);
	return (err);
}
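/*
 * Usage sketch (illustrative): a recursive "zfs snapshot -r pool@today"
 * bottoms out in a call shaped like the one below; a temporary snapshot
 * instead passes temporary = B_TRUE plus a cleanup_fd so the hold is
 * released automatically if the caller's process dies:
 *
 *	err = dmu_objset_snapshot("pool", "today", NULL, NULL,
 *	    B_TRUE, B_FALSE, -1);
 */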
static void
dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
{
	dnode_t *dn;

	while (dn = list_head(list)) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		ASSERT(dn->dn_dbuf->db_data_pending);
		/*
		 * Initialize dn_zio outside dnode_sync() because the
		 * meta-dnode needs to set it outside dnode_sync().
		 */
		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
		ASSERT(dn->dn_zio);

		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
		list_remove(list, dn);

		if (newlist) {
			(void) dnode_add_ref(dn, newlist);
			list_insert_tail(newlist, dn);
		}

		dnode_sync(dn, tx);
	}
}

/* ARGSUSED */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	objset_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;

	ASSERT(bp == os->os_rootbp);
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
	ASSERT(BP_GET_LEVEL(bp) == 0);

	/*
	 * Update rootbp fill count: it should be the number of objects
	 * allocated in the object set (not counting the "special"
	 * objects that are stored in the objset_phys_t -- the meta
	 * dnode and user/group accounting objects).
	 */
	bp->blk_fill = 0;
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
}

/* ARGSUSED */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	objset_t *os = arg;

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}
}
/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	list_t *list;
	list_t *newlist = NULL;
	dbuf_dirty_record_t *dr;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf,
	    os->os_rootbp, os->os_spa, &zb));

	dmu_write_policy(os, NULL, 0, 0, &zp);

	zio = arc_write(pio, os->os_spa, tx->tx_txg,
	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
	    dmu_objset_write_ready, dmu_objset_write_done, os,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	DMU_META_DNODE(os)->dn_zio = zio;
	dnode_sync(DMU_META_DNODE(os), tx);

	os->os_phys->os_flags = os->os_flags;

	if (DMU_USERUSED_DNODE(os) &&
	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_USERUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_USERUSED_DNODE(os), tx);
		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	if (dmu_objset_userused_enabled(os)) {
		newlist = &os->os_synced_dnodes;
		/*
		 * We must create the list here because it uses the
		 * dn_dirty_link[] of this txg.
		 */
		list_create(newlist, sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[txgoff]));
	}

	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
	while (dr = list_head(list)) {
		ASSERT(dr->dr_dbuf->db_level == 0);
		list_remove(list, dr);
		if (dr->dr_zio)
			zio_nowait(dr->dr_zio);
	}
	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(zio);
}

boolean_t
dmu_objset_is_dirty(objset_t *os, uint64_t txg)
{
	return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
}

boolean_t
dmu_objset_is_dirty_anywhere(objset_t *os)
{
	for (int t = 0; t < TXG_SIZE; t++)
		if (dmu_objset_is_dirty(os, t))
			return (B_TRUE);
	return (B_FALSE);
}

static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];

void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
{
	used_cbs[ost] = cb;
}

boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
	    used_cbs[os->os_phys->os_type] != NULL &&
	    DMU_USERUSED_DNODE(os) != NULL);
}

static void
do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
{
	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
		int64_t delta = DNODE_SIZE + used;
		if (subtract)
			delta = -delta;
		VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
		    user, delta, tx));
		VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
		    group, delta, tx));
	}
}
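/*
 * Worked example for the update above (illustrative numbers, assuming
 * flags has DNODE_FLAG_USERUSED_ACCOUNTED set): when an object charging
 * 1536 bytes to uid 100 / gid 10 is resynced with a new owner, the old
 * charge is subtracted and the new one added, each as DNODE_SIZE plus
 * the object's used bytes:
 *
 *	do_userquota_update(os, 1536, flags, 100, 10, B_TRUE, tx);
 *	    // uid 100 and gid 10 each decremented by DNODE_SIZE + 1536
 *	do_userquota_update(os, 1536, flags, 101, 10, B_FALSE, tx);
 *	    // uid 101 and gid 10 each incremented by the same amount
 */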
void
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
	dnode_t *dn;
	list_t *list = &os->os_synced_dnodes;

	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));

	while (dn = list_head(list)) {
		int flags;
		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
		    dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED);

		/* Allocate the user/groupused objects if necessary. */
		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
			VERIFY(0 == zap_create_claim(os,
			    DMU_USERUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
			VERIFY(0 == zap_create_claim(os,
			    DMU_GROUPUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
		}

		/*
		 * We intentionally modify the zap object even if the
		 * net delta is zero.  Otherwise
		 * the block of the zap obj could be shared between
		 * datasets but need to be different between them after
		 * a bprewrite.
		 */

		flags = dn->dn_id_flags;
		ASSERT(flags);
		if (flags & DN_ID_OLD_EXIST) {
			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
		}
		if (flags & DN_ID_NEW_EXIST) {
			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
			    dn->dn_phys->dn_flags, dn->dn_newuid,
			    dn->dn_newgid, B_FALSE, tx);
		}

		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = 0;
		dn->dn_oldflags = 0;
		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
			dn->dn_olduid = dn->dn_newuid;
			dn->dn_oldgid = dn->dn_newgid;
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
			if (dn->dn_bonuslen == 0)
				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
			else
				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
		mutex_exit(&dn->dn_mtx);

		list_remove(list, dn);
		dnode_rele(dn, list);
	}
}
/*
 * Returns a pointer to data to find uid/gid from
 *
 * If a dirty record for transaction group that is syncing can't
 * be found then NULL is returned.  In the NULL case it is assumed
 * the uid/gid aren't changing.
 */
static void *
dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr, **drp;
	void *data;

	if (db->db_dirtycnt == 0)
		return (db->db.db_data);  /* Nothing is changing */

	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg == tx->tx_txg)
			break;

	if (dr == NULL) {
		data = NULL;
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(dr->dr_dbuf);
		dn = DB_DNODE(dr->dr_dbuf);

		if (dn->dn_bonuslen == 0 &&
		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
			data = dr->dt.dl.dr_data->b_data;
		else
			data = dr->dt.dl.dr_data;

		DB_DNODE_EXIT(dr->dr_dbuf);
	}

	return (data);
}

void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	void *data = NULL;
	dmu_buf_impl_t *db = NULL;
	uint64_t *user, *group;
	int flags = dn->dn_id_flags;
	int error;
	boolean_t have_spill = B_FALSE;

	if (!dmu_objset_userused_enabled(dn->dn_objset))
		return;

	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
	    DN_ID_CHKED_SPILL)))
		return;

	if (before && dn->dn_bonuslen != 0)
		data = DN_BONUS(dn->dn_phys);
	else if (!before && dn->dn_bonuslen != 0) {
		if (dn->dn_bonus) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
			data = dmu_objset_userquota_find_data(db, tx);
		} else {
			data = DN_BONUS(dn->dn_phys);
		}
	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
		int rf = 0;

		if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
			rf |= DB_RF_HAVESTRUCT;
		error = dmu_spill_hold_by_dnode(dn,
		    rf | DB_RF_MUST_SUCCEED,
		    FTAG, (dmu_buf_t **)&db);
		ASSERT(error == 0);
		mutex_enter(&db->db_mtx);
		data = (before) ? db->db.db_data :
		    dmu_objset_userquota_find_data(db, tx);
		have_spill = B_TRUE;
	} else {
		mutex_enter(&dn->dn_mtx);
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		mutex_exit(&dn->dn_mtx);
		return;
	}

	if (before) {
		ASSERT(data);
		user = &dn->dn_olduid;
		group = &dn->dn_oldgid;
	} else if (data) {
		user = &dn->dn_newuid;
		group = &dn->dn_newgid;
	}

	/*
	 * Must always call the callback in case the object
	 * type has changed and that type isn't an object type to track
	 */
	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
	    user, group);

	/*
	 * Preserve existing uid/gid when the callback can't determine
	 * what the new uid/gid are and the callback returned EEXIST.
	 * The EEXIST error tells us to just use the existing uid/gid.
	 * If we don't know what the old values are then just assign
	 * them to 0, since that is a new file being created.
	 */
	if (!before && data == NULL && error == EEXIST) {
		if (flags & DN_ID_OLD_EXIST) {
			dn->dn_newuid = dn->dn_olduid;
			dn->dn_newgid = dn->dn_oldgid;
		} else {
			dn->dn_newuid = 0;
			dn->dn_newgid = 0;
		}
		error = 0;
	}

	if (db)
		mutex_exit(&db->db_mtx);

	mutex_enter(&dn->dn_mtx);
	if (error == 0 && before)
		dn->dn_id_flags |= DN_ID_OLD_EXIST;
	if (error == 0 && !before)
		dn->dn_id_flags |= DN_ID_NEW_EXIST;

	if (have_spill) {
		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
	} else {
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
	}
	mutex_exit(&dn->dn_mtx);
	if (have_spill)
		dmu_buf_rele((dmu_buf_t *)db, FTAG);
}
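/*
 * Note on the before/after protocol above: the function captures ids at
 * two points -- with before = B_TRUE it reads the old uid/gid from the
 * on-disk bonus or spill data, and with before = B_FALSE it reads the
 * pending values from the dirty record -- and the DN_ID_* flags record
 * which captures have happened, so that the pair of
 * do_userquota_update() calls in dmu_objset_do_userquota_updates() can
 * subtract the old charge and add the new one.
 */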
boolean_t
dmu_objset_userspace_present(objset_t *os)
{
	return (os->os_phys->os_flags &
	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}

int
dmu_objset_userspace_upgrade(objset_t *os)
{
	uint64_t obj;
	int err = 0;

	if (dmu_objset_userspace_present(os))
		return (0);
	if (!dmu_objset_userused_enabled(os))
		return (ENOTSUP);
	if (dmu_objset_is_snapshot(os))
		return (EINVAL);

	/*
	 * We simply need to mark every object dirty, so that it will be
	 * synced out and now accounted.  If this is called
	 * concurrently, or if we already did some work before crashing,
	 * that's fine, since we track each object's accounted state
	 * independently.
	 */

	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
		dmu_tx_t *tx;
		dmu_buf_t *db;
		int objerr;

		if (issig(JUSTLOOKING) && issig(FORREAL))
			return (EINTR);

		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
		if (objerr)
			continue;
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		objerr = dmu_tx_assign(tx, TXG_WAIT);
		if (objerr) {
			dmu_tx_abort(tx);
			continue;
		}
		dmu_buf_will_dirty(db, tx);
		dmu_buf_rele(db, FTAG);
		dmu_tx_commit(tx);
	}

	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
	    usedobjsp, availobjsp);
}

uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}

void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
	stat->dds_type = os->os_phys->os_type;
	if (os->os_dsl_dataset)
		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}

void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
	ASSERT(os->os_dsl_dataset ||
	    os->os_phys->os_type == DMU_OST_META);

	if (os->os_dsl_dataset != NULL)
		dsl_dataset_stats(os->os_dsl_dataset, nv);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
	    os->os_phys->os_type);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
	    dmu_objset_userspace_present(os));
}

int
dmu_objset_is_snapshot(objset_t *os)
{
	if (os->os_dsl_dataset != NULL)
		return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
	else
		return (B_FALSE);
}
int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
    boolean_t *conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	uint64_t ignored;

	if (ds->ds_phys->ds_snapnames_zapobj == 0)
		return (ENOENT);

	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
	    real, maxlen, conflict));
}

int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	if (ds->ds_phys->ds_snapnames_zapobj == 0)
		return (ENOENT);

	zap_cursor_init_serialized(&cursor,
	    ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (ENOENT);
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (ENAMETOOLONG);
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	if (case_conflict)
		*case_conflict = attr.za_normalization_conflict;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}

int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	/* there is no next dir on a snapshot! */
	if (os->os_dsl_dataset->ds_object !=
	    dd->dd_phys->dd_head_dataset_obj)
		return (ENOENT);

	zap_cursor_init_serialized(&cursor,
	    dd->dd_pool->dp_meta_objset,
	    dd->dd_phys->dd_child_dir_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (ENOENT);
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (ENAMETOOLONG);
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}
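/*
 * Usage sketch (illustrative): both list routines above are resumable --
 * the caller keeps the opaque *offp cookie between calls -- so a
 * dataset's snapshots can be walked one entry at a time:
 *
 *	uint64_t off = 0, id;
 *	char snapname[MAXNAMELEN];
 *	while (dmu_snapshot_list_next(os, sizeof (snapname), snapname,
 *	    &id, &off, NULL) == 0) {
 *		// consume snapname / id
 *	}
 */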
struct findarg {
	int (*func)(const char *, void *);
	void *arg;
};

/* ARGSUSED */
static int
findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	struct findarg *fa = arg;
	return (fa->func(dsname, fa->arg));
}

/*
 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
 * Perhaps change all callers to use dmu_objset_find_spa()?
 */
int
dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
    int flags)
{
	struct findarg fa;
	fa.func = func;
	fa.arg = arg;
	return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
}

/*
 * Find all objsets under name, call func on each
 */
int
dmu_objset_find_spa(spa_t *spa, const char *name,
    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	char *child;
	uint64_t thisobj;
	int err;

	if (name == NULL)
		name = spa_name(spa);
	err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
	if (err)
		return (err);

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_close(dd, FTAG);
		return (0);
	}

	thisobj = dd->dd_phys->dd_head_dataset_obj;
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
	dp = dd->dd_pool;

	/*
	 * Iterate over all children.
	 */
	if (flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dd->dd_phys->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT(attr->za_integer_length == sizeof (uint64_t));
			ASSERT(attr->za_num_integers == 1);

			child = kmem_asprintf("%s/%s", name, attr->za_name);
			err = dmu_objset_find_spa(spa, child, func, arg, flags);
			strfree(child);
			if (err)
				break;
		}
		zap_cursor_fini(&zc);

		if (err) {
			dsl_dir_close(dd, FTAG);
			kmem_free(attr, sizeof (zap_attribute_t));
			return (err);
		}
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (flags & DS_FIND_SNAPSHOTS) {
		if (!dsl_pool_sync_context(dp))
			rw_enter(&dp->dp_config_rwlock, RW_READER);
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
		if (!dsl_pool_sync_context(dp))
			rw_exit(&dp->dp_config_rwlock);

		if (err == 0) {
			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT(attr->za_integer_length ==
				    sizeof (uint64_t));
				ASSERT(attr->za_num_integers == 1);

				child = kmem_asprintf("%s@%s",
				    name, attr->za_name);
				err = func(spa, attr->za_first_integer,
				    child, arg);
				strfree(child);
				if (err)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	dsl_dir_close(dd, FTAG);
	kmem_free(attr, sizeof (zap_attribute_t));

	if (err)
		return (err);

	/*
	 * Apply to self if appropriate.
	 */
	err = func(spa, thisobj, name, arg);
	return (err);
}

/* ARGSUSED */
int
dmu_objset_prefetch(const char *name, void *arg)
{
	dsl_dataset_t *ds;

	if (dsl_dataset_hold(name, FTAG, &ds))
		return (0);

	if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
		mutex_enter(&ds->ds_opening_lock);
		if (ds->ds_objset == NULL) {
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;

			SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
			    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

			(void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds),
			    &ds->ds_phys->ds_bp, NULL, NULL,
			    ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		mutex_exit(&ds->ds_opening_lock);
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	os->os_user_ptr = user_ptr;
}

void *
dmu_objset_get_user(objset_t *os)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
}
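/*
 * Usage sketch (hypothetical callback, not from this file): the find
 * routines drive per-dataset callbacks such as dmu_objset_snapshot_one()
 * and dmu_objset_prefetch(); a caller-supplied function works the same
 * way:
 *
 *	static int
 *	count_cb(const char *name, void *arg)
 *	{
 *		(*(uint64_t *)arg)++;
 *		return (0);	// nonzero would abort the traversal
 *	}
 *
 *	uint64_t count = 0;
 *	err = dmu_objset_find("pool", count_cb, &count,
 *	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 */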