1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23226707Spjd * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24226707Spjd * All rights reserved. 25332547Smav * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 26264835Sdelphij * Copyright (c) 2014 Joyent, Inc. All rights reserved. 27286575Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28282126Savg * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 29359722Sfreqlabs * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. 
30168404Spjd */ 31168404Spjd 32168404Spjd#include <sys/dmu.h> 33185029Spjd#include <sys/dmu_objset.h> 34168404Spjd#include <sys/dmu_tx.h> 35168404Spjd#include <sys/dsl_dataset.h> 36168404Spjd#include <sys/dsl_dir.h> 37168404Spjd#include <sys/dsl_prop.h> 38168404Spjd#include <sys/dsl_synctask.h> 39185029Spjd#include <sys/dsl_deleg.h> 40259813Sdelphij#include <sys/dmu_impl.h> 41168404Spjd#include <sys/spa.h> 42219089Spjd#include <sys/metaslab.h> 43168404Spjd#include <sys/zap.h> 44168404Spjd#include <sys/zio.h> 45168404Spjd#include <sys/arc.h> 46185029Spjd#include <sys/sunddi.h> 47219317Spjd#include <sys/zvol.h> 48226678Spjd#ifdef _KERNEL 49226676Spjd#include <sys/zfs_vfsops.h> 50226678Spjd#endif 51264835Sdelphij#include <sys/zfeature.h> 52264835Sdelphij#include <sys/policy.h> 53264835Sdelphij#include <sys/zfs_znode.h> 54168404Spjd#include "zfs_namecheck.h" 55264835Sdelphij#include "zfs_prop.h" 56168404Spjd 57264835Sdelphij/* 58264835Sdelphij * Filesystem and Snapshot Limits 59264835Sdelphij * ------------------------------ 60264835Sdelphij * 61264835Sdelphij * These limits are used to restrict the number of filesystems and/or snapshots 62264835Sdelphij * that can be created at a given level in the tree or below. A typical 63264835Sdelphij * use-case is with a delegated dataset where the administrator wants to ensure 64264835Sdelphij * that a user within the zone is not creating too many additional filesystems 65264835Sdelphij * or snapshots, even though they're not exceeding their space quota. 66264835Sdelphij * 67264835Sdelphij * The filesystem and snapshot counts are stored as extensible properties. This 68264835Sdelphij * capability is controlled by a feature flag and must be enabled to be used. 69264835Sdelphij * Once enabled, the feature is not active until the first limit is set. At 70264835Sdelphij * that point, future operations to create/destroy filesystems or snapshots 71264835Sdelphij * will validate and update the counts. 
72264835Sdelphij * 73264835Sdelphij * Because the count properties will not exist before the feature is active, 74264835Sdelphij * the counts are updated when a limit is first set on an uninitialized 75264835Sdelphij * dsl_dir node in the tree (The filesystem/snapshot count on a node includes 76264835Sdelphij * all of the nested filesystems/snapshots. Thus, a new leaf node has a 77264835Sdelphij * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and 78264835Sdelphij * snapshot count properties on a node indicate uninitialized counts on that 79264835Sdelphij * node.) When first setting a limit on an uninitialized node, the code starts 80264835Sdelphij * at the filesystem with the new limit and descends into all sub-filesystems 81264835Sdelphij * to add the count properties. 82264835Sdelphij * 83264835Sdelphij * In practice this is lightweight since a limit is typically set when the 84264835Sdelphij * filesystem is created and thus has no children. Once valid, changing the 85264835Sdelphij * limit value won't require a re-traversal since the counts are already valid. 86264835Sdelphij * When recursively fixing the counts, if a node with a limit is encountered 87264835Sdelphij * during the descent, the counts are known to be valid and there is no need to 88264835Sdelphij * descend into that filesystem's children. The counts on filesystems above the 89264835Sdelphij * one with the new limit will still be uninitialized, unless a limit is 90264835Sdelphij * eventually set on one of those filesystems. The counts are always recursively 91264835Sdelphij * updated when a limit is set on a dataset, unless there is already a limit. 92264835Sdelphij * When a new limit value is set on a filesystem with an existing limit, it is 93264835Sdelphij * possible for the new limit to be less than the current count at that level 94264835Sdelphij * since a user who can change the limit is also allowed to exceed the limit. 
 *
 * Once the feature is active, then whenever a filesystem or snapshot is
 * created, the code recurses up the tree, validating the new count against the
 * limit at each initialized level. In practice, most levels will not have a
 * limit set. If there is a limit at any initialized level up the tree, the
 * check must pass or the creation will fail. Likewise, when a filesystem or
 * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different
 * point in the tree will first validate, then update the counts on each branch
 * up to the common ancestor. A receive will also validate the counts and then
 * update them.
 *
 * An exception to the above behavior is that the limit is not enforced if the
 * user has permission to modify the limit. This is primarily so that
 * recursive snapshots in the global zone always work. We want to prevent a
 * denial-of-service in which a lower level delegated dataset could max out its
 * limit and thus block recursive snapshots from being taken in the global zone.
 * Because of this, it is possible for the snapshot count to be over the limit
 * and snapshots taken in the global zone could cause a lower level dataset to
 * hit or exceed its limit. The administrator taking the global zone recursive
 * snapshot should be aware of this side-effect and behave accordingly.
 * For consistency, the filesystem limit is also not enforced if the user can
 * modify the limit.
118264835Sdelphij * 119264835Sdelphij * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() 120264835Sdelphij * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in 121264835Sdelphij * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by 122264835Sdelphij * dsl_dir_init_fs_ss_count(). 123264835Sdelphij * 124264835Sdelphij * There is a special case when we receive a filesystem that already exists. In 125264835Sdelphij * this case a temporary clone name of %X is created (see dmu_recv_begin). We 126264835Sdelphij * never update the filesystem counts for temporary clones. 127264835Sdelphij * 128264835Sdelphij * Likewise, we do not update the snapshot counts for temporary snapshots, 129264835Sdelphij * such as those created by zfs diff. 130264835Sdelphij */ 131264835Sdelphij 132275782Sdelphijextern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); 133275782Sdelphij 134185029Spjdstatic uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); 135168404Spjd 136332525Smavtypedef struct ddulrt_arg { 137332525Smav dsl_dir_t *ddulrta_dd; 138332525Smav uint64_t ddlrta_txg; 139332525Smav} ddulrt_arg_t; 140332525Smav 141168404Spjdstatic void 142321527Smavdsl_dir_evict_async(void *dbu) 143168404Spjd{ 144286575Smav dsl_dir_t *dd = dbu; 145168404Spjd dsl_pool_t *dp = dd->dd_pool; 146168404Spjd int t; 147168404Spjd 148286575Smav dd->dd_dbuf = NULL; 149286575Smav 150168404Spjd for (t = 0; t < TXG_SIZE; t++) { 151168404Spjd ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 152168404Spjd ASSERT(dd->dd_tempreserved[t] == 0); 153168404Spjd ASSERT(dd->dd_space_towrite[t] == 0); 154168404Spjd } 155168404Spjd 156168404Spjd if (dd->dd_parent) 157286575Smav dsl_dir_async_rele(dd->dd_parent, dd); 158168404Spjd 159286575Smav spa_async_close(dd->dd_pool->dp_spa, dd); 160168404Spjd 161288204Sdelphij dsl_prop_fini(dd); 162168404Spjd mutex_destroy(&dd->dd_lock); 163168404Spjd kmem_free(dd, sizeof (dsl_dir_t)); 164168404Spjd} 165168404Spjd 
166168404Spjdint 167248571Smmdsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, 168168404Spjd const char *tail, void *tag, dsl_dir_t **ddp) 169168404Spjd{ 170168404Spjd dmu_buf_t *dbuf; 171168404Spjd dsl_dir_t *dd; 172168404Spjd int err; 173168404Spjd 174248571Smm ASSERT(dsl_pool_config_held(dp)); 175168404Spjd 176168404Spjd err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); 177248571Smm if (err != 0) 178168404Spjd return (err); 179168404Spjd dd = dmu_buf_get_user(dbuf); 180168404Spjd#ifdef ZFS_DEBUG 181168404Spjd { 182168404Spjd dmu_object_info_t doi; 183168404Spjd dmu_object_info_from_db(dbuf, &doi); 184259813Sdelphij ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); 185185029Spjd ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); 186168404Spjd } 187168404Spjd#endif 188168404Spjd if (dd == NULL) { 189168404Spjd dsl_dir_t *winner; 190168404Spjd 191168404Spjd dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); 192168404Spjd dd->dd_object = ddobj; 193168404Spjd dd->dd_dbuf = dbuf; 194168404Spjd dd->dd_pool = dp; 195168404Spjd mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); 196288204Sdelphij dsl_prop_init(dd); 197168404Spjd 198219089Spjd dsl_dir_snap_cmtime_update(dd); 199219089Spjd 200275782Sdelphij if (dsl_dir_phys(dd)->dd_parent_obj) { 201275782Sdelphij err = dsl_dir_hold_obj(dp, 202275782Sdelphij dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, 203275782Sdelphij &dd->dd_parent); 204248571Smm if (err != 0) 205185029Spjd goto errout; 206168404Spjd if (tail) { 207168404Spjd#ifdef ZFS_DEBUG 208168404Spjd uint64_t foundobj; 209168404Spjd 210168404Spjd err = zap_lookup(dp->dp_meta_objset, 211275782Sdelphij dsl_dir_phys(dd->dd_parent)-> 212275782Sdelphij dd_child_dir_zapobj, tail, 213275782Sdelphij sizeof (foundobj), 1, &foundobj); 214168404Spjd ASSERT(err || foundobj == ddobj); 215168404Spjd#endif 216168404Spjd (void) strcpy(dd->dd_myname, tail); 217168404Spjd } else { 218168404Spjd err = zap_value_search(dp->dp_meta_objset, 219275782Sdelphij 
dsl_dir_phys(dd->dd_parent)-> 220275782Sdelphij dd_child_dir_zapobj, 221185029Spjd ddobj, 0, dd->dd_myname); 222168404Spjd } 223248571Smm if (err != 0) 224185029Spjd goto errout; 225168404Spjd } else { 226168404Spjd (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); 227168404Spjd } 228168404Spjd 229219089Spjd if (dsl_dir_is_clone(dd)) { 230219089Spjd dmu_buf_t *origin_bonus; 231219089Spjd dsl_dataset_phys_t *origin_phys; 232219089Spjd 233219089Spjd /* 234219089Spjd * We can't open the origin dataset, because 235219089Spjd * that would require opening this dsl_dir. 236219089Spjd * Just look at its phys directly instead. 237219089Spjd */ 238219089Spjd err = dmu_bonus_hold(dp->dp_meta_objset, 239275782Sdelphij dsl_dir_phys(dd)->dd_origin_obj, FTAG, 240275782Sdelphij &origin_bonus); 241248571Smm if (err != 0) 242219089Spjd goto errout; 243219089Spjd origin_phys = origin_bonus->db_data; 244219089Spjd dd->dd_origin_txg = 245219089Spjd origin_phys->ds_creation_txg; 246219089Spjd dmu_buf_rele(origin_bonus, FTAG); 247219089Spjd } 248219089Spjd 249321527Smav dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, 250321527Smav &dd->dd_dbuf); 251286575Smav winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); 252286575Smav if (winner != NULL) { 253168404Spjd if (dd->dd_parent) 254248571Smm dsl_dir_rele(dd->dd_parent, dd); 255288204Sdelphij dsl_prop_fini(dd); 256168404Spjd mutex_destroy(&dd->dd_lock); 257168404Spjd kmem_free(dd, sizeof (dsl_dir_t)); 258168404Spjd dd = winner; 259168404Spjd } else { 260168404Spjd spa_open_ref(dp->dp_spa, dd); 261168404Spjd } 262168404Spjd } 263168404Spjd 264168404Spjd /* 265168404Spjd * The dsl_dir_t has both open-to-close and instantiate-to-evict 266168404Spjd * holds on the spa. We need the open-to-close holds because 267168404Spjd * otherwise the spa_refcnt wouldn't change when we open a 268168404Spjd * dir which the spa also has open, so we could incorrectly 269168404Spjd * think it was OK to unload/export/destroy the pool. 
We need 270168404Spjd * the instantiate-to-evict hold because the dsl_dir_t has a 271168404Spjd * pointer to the dd_pool, which has a pointer to the spa_t. 272168404Spjd */ 273168404Spjd spa_open_ref(dp->dp_spa, tag); 274168404Spjd ASSERT3P(dd->dd_pool, ==, dp); 275168404Spjd ASSERT3U(dd->dd_object, ==, ddobj); 276168404Spjd ASSERT3P(dd->dd_dbuf, ==, dbuf); 277168404Spjd *ddp = dd; 278168404Spjd return (0); 279185029Spjd 280185029Spjderrout: 281185029Spjd if (dd->dd_parent) 282248571Smm dsl_dir_rele(dd->dd_parent, dd); 283288204Sdelphij dsl_prop_fini(dd); 284185029Spjd mutex_destroy(&dd->dd_lock); 285185029Spjd kmem_free(dd, sizeof (dsl_dir_t)); 286185029Spjd dmu_buf_rele(dbuf, tag); 287185029Spjd return (err); 288168404Spjd} 289168404Spjd 290168404Spjdvoid 291248571Smmdsl_dir_rele(dsl_dir_t *dd, void *tag) 292168404Spjd{ 293168404Spjd dprintf_dd(dd, "%s\n", ""); 294168404Spjd spa_close(dd->dd_pool->dp_spa, tag); 295168404Spjd dmu_buf_rele(dd->dd_dbuf, tag); 296168404Spjd} 297168404Spjd 298286575Smav/* 299286575Smav * Remove a reference to the given dsl dir that is being asynchronously 300286575Smav * released. Async releases occur from a taskq performing eviction of 301286575Smav * dsl datasets and dirs. This process is identical to a normal release 302286575Smav * with the exception of using the async API for releasing the reference on 303286575Smav * the spa. 
304286575Smav */ 305286575Smavvoid 306286575Smavdsl_dir_async_rele(dsl_dir_t *dd, void *tag) 307286575Smav{ 308286575Smav dprintf_dd(dd, "%s\n", ""); 309286575Smav spa_async_close(dd->dd_pool->dp_spa, tag); 310286575Smav dmu_buf_rele(dd->dd_dbuf, tag); 311286575Smav} 312286575Smav 313307108Smav/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ 314168404Spjdvoid 315168404Spjddsl_dir_name(dsl_dir_t *dd, char *buf) 316168404Spjd{ 317168404Spjd if (dd->dd_parent) { 318168404Spjd dsl_dir_name(dd->dd_parent, buf); 319307108Smav VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, 320307108Smav ZFS_MAX_DATASET_NAME_LEN); 321168404Spjd } else { 322168404Spjd buf[0] = '\0'; 323168404Spjd } 324168404Spjd if (!MUTEX_HELD(&dd->dd_lock)) { 325168404Spjd /* 326168404Spjd * recursive mutex so that we can use 327168404Spjd * dprintf_dd() with dd_lock held 328168404Spjd */ 329168404Spjd mutex_enter(&dd->dd_lock); 330307108Smav VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), 331307108Smav <, ZFS_MAX_DATASET_NAME_LEN); 332168404Spjd mutex_exit(&dd->dd_lock); 333168404Spjd } else { 334307108Smav VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), 335307108Smav <, ZFS_MAX_DATASET_NAME_LEN); 336168404Spjd } 337168404Spjd} 338168404Spjd 339239620Smm/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ 340168404Spjdint 341168498Spjddsl_dir_namelen(dsl_dir_t *dd) 342168498Spjd{ 343168498Spjd int result = 0; 344168498Spjd 345168498Spjd if (dd->dd_parent) { 346168498Spjd /* parent's name + 1 for the "/" */ 347168498Spjd result = dsl_dir_namelen(dd->dd_parent) + 1; 348168498Spjd } 349168498Spjd 350168498Spjd if (!MUTEX_HELD(&dd->dd_lock)) { 351168498Spjd /* see dsl_dir_name */ 352168498Spjd mutex_enter(&dd->dd_lock); 353168498Spjd result += strlen(dd->dd_myname); 354168498Spjd mutex_exit(&dd->dd_lock); 355168498Spjd } else { 356168498Spjd result += strlen(dd->dd_myname); 357168498Spjd } 358168498Spjd 359168498Spjd return (result); 
360168498Spjd} 361168498Spjd 362168404Spjdstatic int 363168404Spjdgetcomponent(const char *path, char *component, const char **nextp) 364168404Spjd{ 365168404Spjd char *p; 366248571Smm 367209962Smm if ((path == NULL) || (path[0] == '\0')) 368249195Smm return (SET_ERROR(ENOENT)); 369168404Spjd /* This would be a good place to reserve some namespace... */ 370168404Spjd p = strpbrk(path, "/@"); 371168404Spjd if (p && (p[1] == '/' || p[1] == '@')) { 372168404Spjd /* two separators in a row */ 373249195Smm return (SET_ERROR(EINVAL)); 374168404Spjd } 375168404Spjd if (p == NULL || p == path) { 376168404Spjd /* 377168404Spjd * if the first thing is an @ or /, it had better be an 378168404Spjd * @ and it had better not have any more ats or slashes, 379168404Spjd * and it had better have something after the @. 380168404Spjd */ 381168404Spjd if (p != NULL && 382168404Spjd (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) 383249195Smm return (SET_ERROR(EINVAL)); 384307108Smav if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) 385249195Smm return (SET_ERROR(ENAMETOOLONG)); 386168404Spjd (void) strcpy(component, path); 387168404Spjd p = NULL; 388168404Spjd } else if (p[0] == '/') { 389307108Smav if (p - path >= ZFS_MAX_DATASET_NAME_LEN) 390249195Smm return (SET_ERROR(ENAMETOOLONG)); 391168404Spjd (void) strncpy(component, path, p - path); 392248571Smm component[p - path] = '\0'; 393168404Spjd p++; 394168404Spjd } else if (p[0] == '@') { 395168404Spjd /* 396168404Spjd * if the next separator is an @, there better not be 397168404Spjd * any more slashes. 
398168404Spjd */ 399168404Spjd if (strchr(path, '/')) 400249195Smm return (SET_ERROR(EINVAL)); 401307108Smav if (p - path >= ZFS_MAX_DATASET_NAME_LEN) 402249195Smm return (SET_ERROR(ENAMETOOLONG)); 403168404Spjd (void) strncpy(component, path, p - path); 404248571Smm component[p - path] = '\0'; 405168404Spjd } else { 406248571Smm panic("invalid p=%p", (void *)p); 407168404Spjd } 408168404Spjd *nextp = p; 409168404Spjd return (0); 410168404Spjd} 411168404Spjd 412168404Spjd/* 413248571Smm * Return the dsl_dir_t, and possibly the last component which couldn't 414248571Smm * be found in *tail. The name must be in the specified dsl_pool_t. This 415248571Smm * thread must hold the dp_config_rwlock for the pool. Returns NULL if the 416248571Smm * path is bogus, or if tail==NULL and we couldn't parse the whole name. 417248571Smm * (*tail)[0] == '@' means that the last component is a snapshot. 418168404Spjd */ 419168404Spjdint 420248571Smmdsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, 421168404Spjd dsl_dir_t **ddp, const char **tailp) 422168404Spjd{ 423307108Smav char buf[ZFS_MAX_DATASET_NAME_LEN]; 424248571Smm const char *spaname, *next, *nextnext = NULL; 425168404Spjd int err; 426168404Spjd dsl_dir_t *dd; 427168404Spjd uint64_t ddobj; 428168404Spjd 429168404Spjd err = getcomponent(name, buf, &next); 430248571Smm if (err != 0) 431168404Spjd return (err); 432168404Spjd 433248571Smm /* Make sure the name is in the specified pool. 
*/ 434248571Smm spaname = spa_name(dp->dp_spa); 435248571Smm if (strcmp(buf, spaname) != 0) 436282126Savg return (SET_ERROR(EXDEV)); 437168404Spjd 438248571Smm ASSERT(dsl_pool_config_held(dp)); 439168404Spjd 440248571Smm err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); 441248571Smm if (err != 0) { 442168404Spjd return (err); 443168404Spjd } 444168404Spjd 445168404Spjd while (next != NULL) { 446286575Smav dsl_dir_t *child_dd; 447168404Spjd err = getcomponent(next, buf, &nextnext); 448248571Smm if (err != 0) 449168404Spjd break; 450168404Spjd ASSERT(next[0] != '\0'); 451168404Spjd if (next[0] == '@') 452168404Spjd break; 453168404Spjd dprintf("looking up %s in obj%lld\n", 454275782Sdelphij buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); 455168404Spjd 456168404Spjd err = zap_lookup(dp->dp_meta_objset, 457275782Sdelphij dsl_dir_phys(dd)->dd_child_dir_zapobj, 458168404Spjd buf, sizeof (ddobj), 1, &ddobj); 459248571Smm if (err != 0) { 460168404Spjd if (err == ENOENT) 461168404Spjd err = 0; 462168404Spjd break; 463168404Spjd } 464168404Spjd 465286575Smav err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); 466248571Smm if (err != 0) 467168404Spjd break; 468248571Smm dsl_dir_rele(dd, tag); 469286575Smav dd = child_dd; 470168404Spjd next = nextnext; 471168404Spjd } 472168404Spjd 473248571Smm if (err != 0) { 474248571Smm dsl_dir_rele(dd, tag); 475168404Spjd return (err); 476168404Spjd } 477168404Spjd 478168404Spjd /* 479168404Spjd * It's an error if there's more than one component left, or 480168404Spjd * tailp==NULL and there's any component left. 
481168404Spjd */ 482168404Spjd if (next != NULL && 483168404Spjd (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { 484168404Spjd /* bad path name */ 485248571Smm dsl_dir_rele(dd, tag); 486168404Spjd dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); 487249195Smm err = SET_ERROR(ENOENT); 488168404Spjd } 489248571Smm if (tailp != NULL) 490168404Spjd *tailp = next; 491168404Spjd *ddp = dd; 492168404Spjd return (err); 493168404Spjd} 494168404Spjd 495264835Sdelphij/* 496264835Sdelphij * If the counts are already initialized for this filesystem and its 497264835Sdelphij * descendants then do nothing, otherwise initialize the counts. 498264835Sdelphij * 499264835Sdelphij * The counts on this filesystem, and those below, may be uninitialized due to 500264835Sdelphij * either the use of a pre-existing pool which did not support the 501264835Sdelphij * filesystem/snapshot limit feature, or one in which the feature had not yet 502264835Sdelphij * been enabled. 503264835Sdelphij * 504264835Sdelphij * Recursively descend the filesystem tree and update the filesystem/snapshot 505264835Sdelphij * counts on each filesystem below, then update the cumulative count on the 506264835Sdelphij * current filesystem. If the filesystem already has a count set on it, 507264835Sdelphij * then we know that its counts, and the counts on the filesystems below it, 508264835Sdelphij * are already correct, so we don't have to update this filesystem. 
509264835Sdelphij */ 510264835Sdelphijstatic void 511264835Sdelphijdsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) 512264835Sdelphij{ 513264835Sdelphij uint64_t my_fs_cnt = 0; 514264835Sdelphij uint64_t my_ss_cnt = 0; 515264835Sdelphij dsl_pool_t *dp = dd->dd_pool; 516264835Sdelphij objset_t *os = dp->dp_meta_objset; 517264835Sdelphij zap_cursor_t *zc; 518264835Sdelphij zap_attribute_t *za; 519264835Sdelphij dsl_dataset_t *ds; 520264835Sdelphij 521266915Sdelphij ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); 522264835Sdelphij ASSERT(dsl_pool_config_held(dp)); 523264835Sdelphij ASSERT(dmu_tx_is_syncing(tx)); 524264835Sdelphij 525264835Sdelphij dsl_dir_zapify(dd, tx); 526264835Sdelphij 527264835Sdelphij /* 528264835Sdelphij * If the filesystem count has already been initialized then we 529264835Sdelphij * don't need to recurse down any further. 530264835Sdelphij */ 531264835Sdelphij if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) 532264835Sdelphij return; 533264835Sdelphij 534264835Sdelphij zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); 535264835Sdelphij za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 536264835Sdelphij 537264835Sdelphij /* Iterate my child dirs */ 538275782Sdelphij for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); 539264835Sdelphij zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { 540264835Sdelphij dsl_dir_t *chld_dd; 541264835Sdelphij uint64_t count; 542264835Sdelphij 543264835Sdelphij VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, 544264835Sdelphij &chld_dd)); 545264835Sdelphij 546264835Sdelphij /* 547264835Sdelphij * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and 548264835Sdelphij * temporary datasets. 
549264835Sdelphij */ 550264835Sdelphij if (chld_dd->dd_myname[0] == '$' || 551264835Sdelphij chld_dd->dd_myname[0] == '%') { 552264835Sdelphij dsl_dir_rele(chld_dd, FTAG); 553264835Sdelphij continue; 554264835Sdelphij } 555264835Sdelphij 556264835Sdelphij my_fs_cnt++; /* count this child */ 557264835Sdelphij 558264835Sdelphij dsl_dir_init_fs_ss_count(chld_dd, tx); 559264835Sdelphij 560264835Sdelphij VERIFY0(zap_lookup(os, chld_dd->dd_object, 561264835Sdelphij DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); 562264835Sdelphij my_fs_cnt += count; 563264835Sdelphij VERIFY0(zap_lookup(os, chld_dd->dd_object, 564264835Sdelphij DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); 565264835Sdelphij my_ss_cnt += count; 566264835Sdelphij 567264835Sdelphij dsl_dir_rele(chld_dd, FTAG); 568264835Sdelphij } 569264835Sdelphij zap_cursor_fini(zc); 570264835Sdelphij /* Count my snapshots (we counted children's snapshots above) */ 571264835Sdelphij VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, 572275782Sdelphij dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); 573264835Sdelphij 574275782Sdelphij for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); 575264835Sdelphij zap_cursor_retrieve(zc, za) == 0; 576264835Sdelphij zap_cursor_advance(zc)) { 577264835Sdelphij /* Don't count temporary snapshots */ 578264835Sdelphij if (za->za_name[0] != '%') 579264835Sdelphij my_ss_cnt++; 580264835Sdelphij } 581266915Sdelphij zap_cursor_fini(zc); 582264835Sdelphij 583264835Sdelphij dsl_dataset_rele(ds, FTAG); 584264835Sdelphij 585264835Sdelphij kmem_free(zc, sizeof (zap_cursor_t)); 586264835Sdelphij kmem_free(za, sizeof (zap_attribute_t)); 587264835Sdelphij 588264835Sdelphij /* we're in a sync task, update counts */ 589264835Sdelphij dmu_buf_will_dirty(dd->dd_dbuf, tx); 590264835Sdelphij VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, 591264835Sdelphij sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); 592264835Sdelphij VERIFY0(zap_add(os, dd->dd_object, 
DD_FIELD_SNAPSHOT_COUNT, 593264835Sdelphij sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); 594264835Sdelphij} 595264835Sdelphij 596264835Sdelphijstatic int 597264835Sdelphijdsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) 598264835Sdelphij{ 599264835Sdelphij char *ddname = (char *)arg; 600264835Sdelphij dsl_pool_t *dp = dmu_tx_pool(tx); 601264835Sdelphij dsl_dataset_t *ds; 602264835Sdelphij dsl_dir_t *dd; 603264835Sdelphij int error; 604264835Sdelphij 605264835Sdelphij error = dsl_dataset_hold(dp, ddname, FTAG, &ds); 606264835Sdelphij if (error != 0) 607264835Sdelphij return (error); 608264835Sdelphij 609264835Sdelphij if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { 610264835Sdelphij dsl_dataset_rele(ds, FTAG); 611264835Sdelphij return (SET_ERROR(ENOTSUP)); 612264835Sdelphij } 613264835Sdelphij 614264835Sdelphij dd = ds->ds_dir; 615264835Sdelphij if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && 616264835Sdelphij dsl_dir_is_zapified(dd) && 617264835Sdelphij zap_contains(dp->dp_meta_objset, dd->dd_object, 618264835Sdelphij DD_FIELD_FILESYSTEM_COUNT) == 0) { 619264835Sdelphij dsl_dataset_rele(ds, FTAG); 620264835Sdelphij return (SET_ERROR(EALREADY)); 621264835Sdelphij } 622264835Sdelphij 623264835Sdelphij dsl_dataset_rele(ds, FTAG); 624264835Sdelphij return (0); 625264835Sdelphij} 626264835Sdelphij 627264835Sdelphijstatic void 628264835Sdelphijdsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) 629264835Sdelphij{ 630264835Sdelphij char *ddname = (char *)arg; 631264835Sdelphij dsl_pool_t *dp = dmu_tx_pool(tx); 632264835Sdelphij dsl_dataset_t *ds; 633264835Sdelphij spa_t *spa; 634264835Sdelphij 635264835Sdelphij VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); 636264835Sdelphij 637264835Sdelphij spa = dsl_dataset_get_spa(ds); 638264835Sdelphij 639264835Sdelphij if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { 640264835Sdelphij /* 641264835Sdelphij * Since the feature was not active and we're now setting a 
642264835Sdelphij * limit, increment the feature-active counter so that the 643264835Sdelphij * feature becomes active for the first time. 644264835Sdelphij * 645264835Sdelphij * We are already in a sync task so we can update the MOS. 646264835Sdelphij */ 647264835Sdelphij spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); 648264835Sdelphij } 649264835Sdelphij 650264835Sdelphij /* 651264835Sdelphij * Since we are now setting a non-UINT64_MAX limit on the filesystem, 652264835Sdelphij * we need to ensure the counts are correct. Descend down the tree from 653264835Sdelphij * this point and update all of the counts to be accurate. 654264835Sdelphij */ 655264835Sdelphij dsl_dir_init_fs_ss_count(ds->ds_dir, tx); 656264835Sdelphij 657264835Sdelphij dsl_dataset_rele(ds, FTAG); 658264835Sdelphij} 659264835Sdelphij 660264835Sdelphij/* 661264835Sdelphij * Make sure the feature is enabled and activate it if necessary. 662264835Sdelphij * Since we're setting a limit, ensure the on-disk counts are valid. 663264835Sdelphij * This is only called by the ioctl path when setting a limit value. 664264835Sdelphij * 665264835Sdelphij * We do not need to validate the new limit, since users who can change the 666264835Sdelphij * limit are also allowed to exceed the limit. 667264835Sdelphij */ 668264835Sdelphijint 669264835Sdelphijdsl_dir_activate_fs_ss_limit(const char *ddname) 670264835Sdelphij{ 671264835Sdelphij int error; 672264835Sdelphij 673264835Sdelphij error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, 674268473Sdelphij dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, 675268473Sdelphij ZFS_SPACE_CHECK_RESERVED); 676264835Sdelphij 677264835Sdelphij if (error == EALREADY) 678264835Sdelphij error = 0; 679264835Sdelphij 680264835Sdelphij return (error); 681264835Sdelphij} 682264835Sdelphij 683264835Sdelphij/* 684264835Sdelphij * Used to determine if the filesystem_limit or snapshot_limit should be 685264835Sdelphij * enforced. 
We allow the limit to be exceeded if the user has permission to 686264835Sdelphij * write the property value. We pass in the creds that we got in the open 687264835Sdelphij * context since we will always be the GZ root in syncing context. We also have 688264835Sdelphij * to handle the case where we are allowed to change the limit on the current 689264835Sdelphij * dataset, but there may be another limit in the tree above. 690264835Sdelphij * 691264835Sdelphij * We can never modify these two properties within a non-global zone. In 692264835Sdelphij * addition, the other checks are modeled on zfs_secpolicy_write_perms. We 693264835Sdelphij * can't use that function since we are already holding the dp_config_rwlock. 694264835Sdelphij * In addition, we already have the dd and dealing with snapshots is simplified 695264835Sdelphij * in this code. 696264835Sdelphij */ 697264835Sdelphij 698264835Sdelphijtypedef enum { 699264835Sdelphij ENFORCE_ALWAYS, 700264835Sdelphij ENFORCE_NEVER, 701264835Sdelphij ENFORCE_ABOVE 702264835Sdelphij} enforce_res_t; 703264835Sdelphij 704264835Sdelphijstatic enforce_res_t 705264835Sdelphijdsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) 706264835Sdelphij{ 707264835Sdelphij enforce_res_t enforce = ENFORCE_ALWAYS; 708264835Sdelphij uint64_t obj; 709264835Sdelphij dsl_dataset_t *ds; 710264835Sdelphij uint64_t zoned; 711264835Sdelphij 712264835Sdelphij ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || 713264835Sdelphij prop == ZFS_PROP_SNAPSHOT_LIMIT); 714264835Sdelphij 715264835Sdelphij#ifdef _KERNEL 716264835Sdelphij#ifdef __FreeBSD__ 717264835Sdelphij if (jailed(cr)) 718264835Sdelphij#else 719264835Sdelphij if (crgetzoneid(cr) != GLOBAL_ZONEID) 720264835Sdelphij#endif 721264835Sdelphij return (ENFORCE_ALWAYS); 722264835Sdelphij 723264835Sdelphij if (secpolicy_zfs(cr) == 0) 724264835Sdelphij return (ENFORCE_NEVER); 725264835Sdelphij#endif 726264835Sdelphij 727275782Sdelphij if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) 
728264835Sdelphij return (ENFORCE_ALWAYS); 729264835Sdelphij 730264835Sdelphij ASSERT(dsl_pool_config_held(dd->dd_pool)); 731264835Sdelphij 732264835Sdelphij if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) 733264835Sdelphij return (ENFORCE_ALWAYS); 734264835Sdelphij 735264835Sdelphij if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { 736264835Sdelphij /* Only root can access zoned fs's from the GZ */ 737264835Sdelphij enforce = ENFORCE_ALWAYS; 738264835Sdelphij } else { 739264835Sdelphij if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) 740264835Sdelphij enforce = ENFORCE_ABOVE; 741264835Sdelphij } 742264835Sdelphij 743264835Sdelphij dsl_dataset_rele(ds, FTAG); 744264835Sdelphij return (enforce); 745264835Sdelphij} 746264835Sdelphij 747332525Smavstatic void 748332525Smavdsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx) 749332525Smav{ 750332525Smav ddulrt_arg_t *arg = varg; 751332525Smav uint64_t last_remap_txg; 752332525Smav dsl_dir_t *dd = arg->ddulrta_dd; 753332525Smav objset_t *mos = dd->dd_pool->dp_meta_objset; 754332525Smav 755332525Smav dsl_dir_zapify(dd, tx); 756332525Smav if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, 757332525Smav sizeof (last_remap_txg), 1, &last_remap_txg) != 0 || 758332525Smav last_remap_txg < arg->ddlrta_txg) { 759332525Smav VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, 760332525Smav sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx)); 761332525Smav } 762332525Smav} 763332525Smav 764332525Smavint 765332525Smavdsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) 766332525Smav{ 767332525Smav ddulrt_arg_t arg; 768332525Smav arg.ddulrta_dd = dd; 769332525Smav arg.ddlrta_txg = txg; 770332525Smav 771332525Smav return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa), 772332525Smav NULL, dsl_dir_update_last_remap_txg_sync, &arg, 773332525Smav 1, ZFS_SPACE_CHECK_RESERVED)); 774332525Smav} 775332525Smav 776264835Sdelphij/* 777264835Sdelphij * Check if adding 
additional child filesystem(s) would exceed any filesystem 778264835Sdelphij * limits or adding additional snapshot(s) would exceed any snapshot limits. 779264835Sdelphij * The prop argument indicates which limit to check. 780264835Sdelphij * 781264835Sdelphij * Note that all filesystem limits up to the root (or the highest 782264835Sdelphij * initialized) filesystem or the given ancestor must be satisfied. 783264835Sdelphij */ 784264835Sdelphijint 785264835Sdelphijdsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, 786264835Sdelphij dsl_dir_t *ancestor, cred_t *cr) 787264835Sdelphij{ 788264835Sdelphij objset_t *os = dd->dd_pool->dp_meta_objset; 789264835Sdelphij uint64_t limit, count; 790264835Sdelphij char *count_prop; 791264835Sdelphij enforce_res_t enforce; 792264835Sdelphij int err = 0; 793264835Sdelphij 794264835Sdelphij ASSERT(dsl_pool_config_held(dd->dd_pool)); 795264835Sdelphij ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || 796264835Sdelphij prop == ZFS_PROP_SNAPSHOT_LIMIT); 797264835Sdelphij 798264835Sdelphij /* 799264835Sdelphij * If we're allowed to change the limit, don't enforce the limit 800264835Sdelphij * e.g. this can happen if a snapshot is taken by an administrative 801264835Sdelphij * user in the global zone (i.e. a recursive snapshot by root). 802264835Sdelphij * However, we must handle the case of delegated permissions where we 803264835Sdelphij * are allowed to change the limit on the current dataset, but there 804264835Sdelphij * is another limit in the tree above. 805264835Sdelphij */ 806264835Sdelphij enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); 807264835Sdelphij if (enforce == ENFORCE_NEVER) 808264835Sdelphij return (0); 809264835Sdelphij 810264835Sdelphij /* 811264835Sdelphij * e.g. if renaming a dataset with no snapshots, count adjustment 812264835Sdelphij * is 0. 
813264835Sdelphij */ 814264835Sdelphij if (delta == 0) 815264835Sdelphij return (0); 816264835Sdelphij 817264835Sdelphij if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { 818264835Sdelphij /* 819264835Sdelphij * We don't enforce the limit for temporary snapshots. This is 820264835Sdelphij * indicated by a NULL cred_t argument. 821264835Sdelphij */ 822264835Sdelphij if (cr == NULL) 823264835Sdelphij return (0); 824264835Sdelphij 825264835Sdelphij count_prop = DD_FIELD_SNAPSHOT_COUNT; 826264835Sdelphij } else { 827264835Sdelphij count_prop = DD_FIELD_FILESYSTEM_COUNT; 828264835Sdelphij } 829264835Sdelphij 830264835Sdelphij /* 831264835Sdelphij * If an ancestor has been provided, stop checking the limit once we 832264835Sdelphij * hit that dir. We need this during rename so that we don't overcount 833264835Sdelphij * the check once we recurse up to the common ancestor. 834264835Sdelphij */ 835264835Sdelphij if (ancestor == dd) 836264835Sdelphij return (0); 837264835Sdelphij 838264835Sdelphij /* 839264835Sdelphij * If we hit an uninitialized node while recursing up the tree, we can 840264835Sdelphij * stop since we know there is no limit here (or above). The counts are 841264835Sdelphij * not valid on this node and we know we won't touch this node's counts. 842264835Sdelphij */ 843264835Sdelphij if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, 844264835Sdelphij count_prop, sizeof (count), 1, &count) == ENOENT) 845264835Sdelphij return (0); 846264835Sdelphij 847264835Sdelphij err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, 848264835Sdelphij B_FALSE); 849264835Sdelphij if (err != 0) 850264835Sdelphij return (err); 851264835Sdelphij 852264835Sdelphij /* Is there a limit which we've hit? 
*/ 853264835Sdelphij if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) 854264835Sdelphij return (SET_ERROR(EDQUOT)); 855264835Sdelphij 856264835Sdelphij if (dd->dd_parent != NULL) 857264835Sdelphij err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, 858264835Sdelphij ancestor, cr); 859264835Sdelphij 860264835Sdelphij return (err); 861264835Sdelphij} 862264835Sdelphij 863264835Sdelphij/* 864264835Sdelphij * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all 865264835Sdelphij * parents. When a new filesystem/snapshot is created, increment the count on 866264835Sdelphij * all parents, and when a filesystem/snapshot is destroyed, decrement the 867264835Sdelphij * count. 868264835Sdelphij */ 869264835Sdelphijvoid 870264835Sdelphijdsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, 871264835Sdelphij dmu_tx_t *tx) 872264835Sdelphij{ 873264835Sdelphij int err; 874264835Sdelphij objset_t *os = dd->dd_pool->dp_meta_objset; 875264835Sdelphij uint64_t count; 876264835Sdelphij 877264835Sdelphij ASSERT(dsl_pool_config_held(dd->dd_pool)); 878264835Sdelphij ASSERT(dmu_tx_is_syncing(tx)); 879264835Sdelphij ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || 880264835Sdelphij strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); 881264835Sdelphij 882264835Sdelphij /* 883264835Sdelphij * When we receive an incremental stream into a filesystem that already 884264835Sdelphij * exists, a temporary clone is created. We don't count this temporary 885264835Sdelphij * clone, whose name begins with a '%'. We also ignore hidden ($FREE, 886264835Sdelphij * $MOS & $ORIGIN) objsets. 887264835Sdelphij */ 888264835Sdelphij if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && 889264835Sdelphij strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) 890264835Sdelphij return; 891264835Sdelphij 892264835Sdelphij /* 893264835Sdelphij * e.g. 
if renaming a dataset with no snapshots, count adjustment is 0 894264835Sdelphij */ 895264835Sdelphij if (delta == 0) 896264835Sdelphij return; 897264835Sdelphij 898264835Sdelphij /* 899264835Sdelphij * If we hit an uninitialized node while recursing up the tree, we can 900264835Sdelphij * stop since we know the counts are not valid on this node and we 901264835Sdelphij * know we shouldn't touch this node's counts. An uninitialized count 902264835Sdelphij * on the node indicates that either the feature has not yet been 903264835Sdelphij * activated or there are no limits on this part of the tree. 904264835Sdelphij */ 905264835Sdelphij if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, 906264835Sdelphij prop, sizeof (count), 1, &count)) == ENOENT) 907264835Sdelphij return; 908264835Sdelphij VERIFY0(err); 909264835Sdelphij 910264835Sdelphij count += delta; 911264835Sdelphij /* Use a signed verify to make sure we're not neg. */ 912264835Sdelphij VERIFY3S(count, >=, 0); 913264835Sdelphij 914264835Sdelphij VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, 915264835Sdelphij tx)); 916264835Sdelphij 917264835Sdelphij /* Roll up this additional count into our ancestors */ 918264835Sdelphij if (dd->dd_parent != NULL) 919264835Sdelphij dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); 920264835Sdelphij} 921264835Sdelphij 922168404Spjduint64_t 923185029Spjddsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, 924185029Spjd dmu_tx_t *tx) 925168404Spjd{ 926185029Spjd objset_t *mos = dp->dp_meta_objset; 927168404Spjd uint64_t ddobj; 928219089Spjd dsl_dir_phys_t *ddphys; 929168404Spjd dmu_buf_t *dbuf; 930168404Spjd 931168404Spjd ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, 932168404Spjd DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); 933185029Spjd if (pds) { 934332547Smav VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, 935185029Spjd name, sizeof (uint64_t), 1, &ddobj, tx)); 936185029Spjd } else { 
937185029Spjd /* it's the root dir */ 938332547Smav VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, 939185029Spjd DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); 940185029Spjd } 941332547Smav VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); 942168404Spjd dmu_buf_will_dirty(dbuf, tx); 943219089Spjd ddphys = dbuf->db_data; 944168404Spjd 945219089Spjd ddphys->dd_creation_time = gethrestime_sec(); 946264835Sdelphij if (pds) { 947219089Spjd ddphys->dd_parent_obj = pds->dd_object; 948264835Sdelphij 949264835Sdelphij /* update the filesystem counts */ 950264835Sdelphij dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); 951264835Sdelphij } 952219089Spjd ddphys->dd_props_zapobj = zap_create(mos, 953168404Spjd DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 954219089Spjd ddphys->dd_child_dir_zapobj = zap_create(mos, 955168404Spjd DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 956185029Spjd if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) 957219089Spjd ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; 958168404Spjd dmu_buf_rele(dbuf, FTAG); 959168404Spjd 960168404Spjd return (ddobj); 961168404Spjd} 962168404Spjd 963185029Spjdboolean_t 964185029Spjddsl_dir_is_clone(dsl_dir_t *dd) 965168404Spjd{ 966275782Sdelphij return (dsl_dir_phys(dd)->dd_origin_obj && 967185029Spjd (dd->dd_pool->dp_origin_snap == NULL || 968275782Sdelphij dsl_dir_phys(dd)->dd_origin_obj != 969185029Spjd dd->dd_pool->dp_origin_snap->ds_object)); 970168404Spjd} 971168404Spjd 972325534Savg 973325534Savguint64_t 974325534Savgdsl_dir_get_used(dsl_dir_t *dd) 975325534Savg{ 976325534Savg return (dsl_dir_phys(dd)->dd_used_bytes); 977325534Savg} 978325534Savg 979325534Savguint64_t 980332547Smavdsl_dir_get_compressed(dsl_dir_t *dd) 981332547Smav{ 982332547Smav return (dsl_dir_phys(dd)->dd_compressed_bytes); 983332547Smav} 984332547Smav 985332547Smavuint64_t 986325534Savgdsl_dir_get_quota(dsl_dir_t *dd) 987325534Savg{ 988325534Savg return (dsl_dir_phys(dd)->dd_quota); 989325534Savg} 990325534Savg 
991325534Savguint64_t 992325534Savgdsl_dir_get_reservation(dsl_dir_t *dd) 993325534Savg{ 994325534Savg return (dsl_dir_phys(dd)->dd_reserved); 995325534Savg} 996325534Savg 997325534Savguint64_t 998325534Savgdsl_dir_get_compressratio(dsl_dir_t *dd) 999325534Savg{ 1000325534Savg /* a fixed point number, 100x the ratio */ 1001325534Savg return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : 1002325534Savg (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / 1003325534Savg dsl_dir_phys(dd)->dd_compressed_bytes)); 1004325534Savg} 1005325534Savg 1006325534Savguint64_t 1007325534Savgdsl_dir_get_logicalused(dsl_dir_t *dd) 1008325534Savg{ 1009325534Savg return (dsl_dir_phys(dd)->dd_uncompressed_bytes); 1010325534Savg} 1011325534Savg 1012325534Savguint64_t 1013325534Savgdsl_dir_get_usedsnap(dsl_dir_t *dd) 1014325534Savg{ 1015325534Savg return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); 1016325534Savg} 1017325534Savg 1018325534Savguint64_t 1019325534Savgdsl_dir_get_usedds(dsl_dir_t *dd) 1020325534Savg{ 1021325534Savg return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); 1022325534Savg} 1023325534Savg 1024325534Savguint64_t 1025325534Savgdsl_dir_get_usedrefreserv(dsl_dir_t *dd) 1026325534Savg{ 1027325534Savg return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); 1028325534Savg} 1029325534Savg 1030325534Savguint64_t 1031325534Savgdsl_dir_get_usedchild(dsl_dir_t *dd) 1032325534Savg{ 1033325534Savg return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + 1034325534Savg dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); 1035325534Savg} 1036325534Savg 1037168404Spjdvoid 1038325534Savgdsl_dir_get_origin(dsl_dir_t *dd, char *buf) 1039325534Savg{ 1040325534Savg dsl_dataset_t *ds; 1041325534Savg VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, 1042325534Savg dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); 1043325534Savg 1044325534Savg dsl_dataset_name(ds, buf); 1045325534Savg 1046325534Savg dsl_dataset_rele(ds, FTAG); 1047325534Savg} 1048325534Savg 
1049325534Savgint 1050325534Savgdsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) 1051325534Savg{ 1052325534Savg if (dsl_dir_is_zapified(dd)) { 1053325534Savg objset_t *os = dd->dd_pool->dp_meta_objset; 1054325534Savg return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, 1055325534Savg sizeof (*count), 1, count)); 1056325534Savg } else { 1057325534Savg return (ENOENT); 1058325534Savg } 1059325534Savg} 1060325534Savg 1061325534Savgint 1062325534Savgdsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) 1063325534Savg{ 1064325534Savg if (dsl_dir_is_zapified(dd)) { 1065325534Savg objset_t *os = dd->dd_pool->dp_meta_objset; 1066325534Savg return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, 1067325534Savg sizeof (*count), 1, count)); 1068325534Savg } else { 1069325534Savg return (ENOENT); 1070325534Savg } 1071325534Savg} 1072325534Savg 1073332525Smavint 1074332525Smavdsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count) 1075332525Smav{ 1076332525Smav if (dsl_dir_is_zapified(dd)) { 1077332525Smav objset_t *os = dd->dd_pool->dp_meta_objset; 1078332525Smav return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, 1079332525Smav sizeof (*count), 1, count)); 1080332525Smav } else { 1081332525Smav return (ENOENT); 1082332525Smav } 1083332525Smav} 1084332525Smav 1085325534Savgvoid 1086168404Spjddsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) 1087168404Spjd{ 1088168404Spjd mutex_enter(&dd->dd_lock); 1089275782Sdelphij dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, 1090325534Savg dsl_dir_get_quota(dd)); 1091168404Spjd dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, 1092325534Savg dsl_dir_get_reservation(dd)); 1093247585Smm dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, 1094325534Savg dsl_dir_get_logicalused(dd)); 1095275782Sdelphij if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { 1096185029Spjd dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, 1097325534Savg dsl_dir_get_usedsnap(dd)); 1098185029Spjd 
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, 1099325534Savg dsl_dir_get_usedds(dd)); 1100185029Spjd dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, 1101325534Savg dsl_dir_get_usedrefreserv(dd)); 1102185029Spjd dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, 1103325534Savg dsl_dir_get_usedchild(dd)); 1104185029Spjd } 1105168404Spjd mutex_exit(&dd->dd_lock); 1106168404Spjd 1107325534Savg uint64_t count; 1108325534Savg if (dsl_dir_get_filesystem_count(dd, &count) == 0) { 1109325534Savg dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, 1110325534Savg count); 1111264835Sdelphij } 1112325534Savg if (dsl_dir_get_snapshot_count(dd, &count) == 0) { 1113325534Savg dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, 1114325534Savg count); 1115325534Savg } 1116332525Smav if (dsl_dir_get_remaptxg(dd, &count) == 0) { 1117332525Smav dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG, 1118332525Smav count); 1119332525Smav } 1120264835Sdelphij 1121185029Spjd if (dsl_dir_is_clone(dd)) { 1122307108Smav char buf[ZFS_MAX_DATASET_NAME_LEN]; 1123325534Savg dsl_dir_get_origin(dd, buf); 1124168404Spjd dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); 1125168404Spjd } 1126325534Savg 1127168404Spjd} 1128168404Spjd 1129168404Spjdvoid 1130168404Spjddsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) 1131168404Spjd{ 1132168404Spjd dsl_pool_t *dp = dd->dd_pool; 1133168404Spjd 1134275782Sdelphij ASSERT(dsl_dir_phys(dd)); 1135168404Spjd 1136248571Smm if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { 1137168404Spjd /* up the hold count until we can be written out */ 1138168404Spjd dmu_buf_add_ref(dd->dd_dbuf, dd); 1139168404Spjd } 1140168404Spjd} 1141168404Spjd 1142168404Spjdstatic int64_t 1143168404Spjdparent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) 1144168404Spjd{ 1145275782Sdelphij uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); 1146275782Sdelphij uint64_t new_accounted = 1147275782Sdelphij MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); 
1148168404Spjd return (new_accounted - old_accounted); 1149168404Spjd} 1150168404Spjd 1151168404Spjdvoid 1152168404Spjddsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) 1153168404Spjd{ 1154168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 1155168404Spjd 1156168404Spjd mutex_enter(&dd->dd_lock); 1157240415Smm ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); 1158168404Spjd dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, 1159168404Spjd dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); 1160168404Spjd dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; 1161168404Spjd mutex_exit(&dd->dd_lock); 1162168404Spjd 1163168404Spjd /* release the hold from dsl_dir_dirty */ 1164168404Spjd dmu_buf_rele(dd->dd_dbuf, dd); 1165168404Spjd} 1166168404Spjd 1167168404Spjdstatic uint64_t 1168185029Spjddsl_dir_space_towrite(dsl_dir_t *dd) 1169168404Spjd{ 1170185029Spjd uint64_t space = 0; 1171168404Spjd 1172168404Spjd ASSERT(MUTEX_HELD(&dd->dd_lock)); 1173168404Spjd 1174321547Smav for (int i = 0; i < TXG_SIZE; i++) { 1175321547Smav space += dd->dd_space_towrite[i & TXG_MASK]; 1176321547Smav ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); 1177168404Spjd } 1178168404Spjd return (space); 1179168404Spjd} 1180168404Spjd 1181168404Spjd/* 1182168404Spjd * How much space would dd have available if ancestor had delta applied 1183168404Spjd * to it? If ondiskonly is set, we're only interested in what's 1184168404Spjd * on-disk, not estimated pending changes. 1185168404Spjd */ 1186168404Spjduint64_t 1187168404Spjddsl_dir_space_available(dsl_dir_t *dd, 1188168404Spjd dsl_dir_t *ancestor, int64_t delta, int ondiskonly) 1189168404Spjd{ 1190168404Spjd uint64_t parentspace, myspace, quota, used; 1191168404Spjd 1192168404Spjd /* 1193168404Spjd * If there are no restrictions otherwise, assume we have 1194168404Spjd * unlimited space available. 
1195168404Spjd */ 1196168404Spjd quota = UINT64_MAX; 1197168404Spjd parentspace = UINT64_MAX; 1198168404Spjd 1199168404Spjd if (dd->dd_parent != NULL) { 1200168404Spjd parentspace = dsl_dir_space_available(dd->dd_parent, 1201168404Spjd ancestor, delta, ondiskonly); 1202168404Spjd } 1203168404Spjd 1204168404Spjd mutex_enter(&dd->dd_lock); 1205275782Sdelphij if (dsl_dir_phys(dd)->dd_quota != 0) 1206275782Sdelphij quota = dsl_dir_phys(dd)->dd_quota; 1207275782Sdelphij used = dsl_dir_phys(dd)->dd_used_bytes; 1208185029Spjd if (!ondiskonly) 1209185029Spjd used += dsl_dir_space_towrite(dd); 1210168404Spjd 1211168404Spjd if (dd->dd_parent == NULL) { 1212332547Smav uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, 1213332547Smav ZFS_SPACE_CHECK_NORMAL); 1214168404Spjd quota = MIN(quota, poolsize); 1215168404Spjd } 1216168404Spjd 1217275782Sdelphij if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { 1218168404Spjd /* 1219168404Spjd * We have some space reserved, in addition to what our 1220168404Spjd * parent gave us. 
1221168404Spjd */ 1222275782Sdelphij parentspace += dsl_dir_phys(dd)->dd_reserved - used; 1223168404Spjd } 1224168404Spjd 1225185029Spjd if (dd == ancestor) { 1226185029Spjd ASSERT(delta <= 0); 1227185029Spjd ASSERT(used >= -delta); 1228185029Spjd used += delta; 1229185029Spjd if (parentspace != UINT64_MAX) 1230185029Spjd parentspace -= delta; 1231185029Spjd } 1232185029Spjd 1233168404Spjd if (used > quota) { 1234168404Spjd /* over quota */ 1235168404Spjd myspace = 0; 1236168404Spjd } else { 1237168404Spjd /* 1238168404Spjd * the lesser of the space provided by our parent and 1239168404Spjd * the space left in our quota 1240168404Spjd */ 1241168404Spjd myspace = MIN(parentspace, quota - used); 1242168404Spjd } 1243168404Spjd 1244168404Spjd mutex_exit(&dd->dd_lock); 1245168404Spjd 1246168404Spjd return (myspace); 1247168404Spjd} 1248168404Spjd 1249168404Spjdstruct tempreserve { 1250168404Spjd list_node_t tr_node; 1251168404Spjd dsl_dir_t *tr_ds; 1252168404Spjd uint64_t tr_size; 1253168404Spjd}; 1254168404Spjd 1255168404Spjdstatic int 1256185029Spjddsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, 1257321547Smav boolean_t ignorequota, list_t *tr_list, 1258185029Spjd dmu_tx_t *tx, boolean_t first) 1259168404Spjd{ 1260168404Spjd uint64_t txg = tx->tx_txg; 1261321547Smav uint64_t quota; 1262185029Spjd struct tempreserve *tr; 1263219089Spjd int retval = EDQUOT; 1264185029Spjd uint64_t ref_rsrv = 0; 1265168404Spjd 1266168404Spjd ASSERT3U(txg, !=, 0); 1267185029Spjd ASSERT3S(asize, >, 0); 1268168404Spjd 1269168404Spjd mutex_enter(&dd->dd_lock); 1270185029Spjd 1271168404Spjd /* 1272168404Spjd * Check against the dsl_dir's quota. We don't add in the delta 1273168404Spjd * when checking for over-quota because they get one free hit. 
1274168404Spjd */ 1275321547Smav uint64_t est_inflight = dsl_dir_space_towrite(dd); 1276321547Smav for (int i = 0; i < TXG_SIZE; i++) 1277185029Spjd est_inflight += dd->dd_tempreserved[i]; 1278321547Smav uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; 1279168404Spjd 1280185029Spjd /* 1281185029Spjd * On the first iteration, fetch the dataset's used-on-disk and 1282185029Spjd * refreservation values. Also, if checkrefquota is set, test if 1283185029Spjd * allocating this space would exceed the dataset's refquota. 1284185029Spjd */ 1285185029Spjd if (first && tx->tx_objset) { 1286185029Spjd int error; 1287219089Spjd dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; 1288168404Spjd 1289321547Smav error = dsl_dataset_check_quota(ds, !netfree, 1290185029Spjd asize, est_inflight, &used_on_disk, &ref_rsrv); 1291321547Smav if (error != 0) { 1292185029Spjd mutex_exit(&dd->dd_lock); 1293185029Spjd return (error); 1294185029Spjd } 1295185029Spjd } 1296185029Spjd 1297185029Spjd /* 1298185029Spjd * If this transaction will result in a net free of space, 1299185029Spjd * we want to let it through. 1300185029Spjd */ 1301275782Sdelphij if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) 1302185029Spjd quota = UINT64_MAX; 1303185029Spjd else 1304275782Sdelphij quota = dsl_dir_phys(dd)->dd_quota; 1305168404Spjd 1306168404Spjd /* 1307219089Spjd * Adjust the quota against the actual pool size at the root 1308219089Spjd * minus any outstanding deferred frees. 1309185029Spjd * To ensure that it's possible to remove files from a full 1310185029Spjd * pool without inducing transient overcommits, we throttle 1311168404Spjd * netfree transactions against a quota that is slightly larger, 1312168404Spjd * but still within the pool's allocation slop. In cases where 1313168404Spjd * we're very close to full, this will allow a steady trickle of 1314168404Spjd * removes to get through. 
1315168404Spjd */ 1316321547Smav uint64_t deferred = 0; 1317168404Spjd if (dd->dd_parent == NULL) { 1318332547Smav uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, 1319332547Smav (netfree) ? 1320332547Smav ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); 1321332547Smav 1322332547Smav if (avail < quota) { 1323332547Smav quota = avail; 1324219089Spjd retval = ENOSPC; 1325168404Spjd } 1326168404Spjd } 1327168404Spjd 1328168404Spjd /* 1329168404Spjd * If they are requesting more space, and our current estimate 1330185029Spjd * is over quota, they get to try again unless the actual 1331168404Spjd * on-disk is over quota and there are no pending changes (which 1332168404Spjd * may free up space for us). 1333168404Spjd */ 1334219089Spjd if (used_on_disk + est_inflight >= quota) { 1335219089Spjd if (est_inflight > 0 || used_on_disk < quota || 1336219089Spjd (retval == ENOSPC && used_on_disk < quota + deferred)) 1337219089Spjd retval = ERESTART; 1338185029Spjd dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " 1339168404Spjd "quota=%lluK tr=%lluK err=%d\n", 1340185029Spjd used_on_disk>>10, est_inflight>>10, 1341219089Spjd quota>>10, asize>>10, retval); 1342168404Spjd mutex_exit(&dd->dd_lock); 1343249195Smm return (SET_ERROR(retval)); 1344168404Spjd } 1345168404Spjd 1346168404Spjd /* We need to up our estimated delta before dropping dd_lock */ 1347321547Smav dd->dd_tempreserved[txg & TXG_MASK] += asize; 1348168404Spjd 1349321547Smav uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, 1350185029Spjd asize - ref_rsrv); 1351168404Spjd mutex_exit(&dd->dd_lock); 1352168404Spjd 1353185029Spjd tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); 1354168404Spjd tr->tr_ds = dd; 1355168404Spjd tr->tr_size = asize; 1356168404Spjd list_insert_tail(tr_list, tr); 1357168404Spjd 1358168404Spjd /* see if it's OK with our parent */ 1359321547Smav if (dd->dd_parent != NULL && parent_rsrv != 0) { 1360275782Sdelphij boolean_t ismos = 
(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); 1361185029Spjd 1362168404Spjd return (dsl_dir_tempreserve_impl(dd->dd_parent, 1363321547Smav parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE)); 1364168404Spjd } else { 1365168404Spjd return (0); 1366168404Spjd } 1367168404Spjd} 1368168404Spjd 1369168404Spjd/* 1370168404Spjd * Reserve space in this dsl_dir, to be used in this tx's txg. 1371185029Spjd * After the space has been dirtied (and dsl_dir_willuse_space() 1372185029Spjd * has been called), the reservation should be canceled, using 1373185029Spjd * dsl_dir_tempreserve_clear(). 1374168404Spjd */ 1375168404Spjdint 1376185029Spjddsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, 1377321547Smav boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) 1378168404Spjd{ 1379185029Spjd int err; 1380168404Spjd list_t *tr_list; 1381168404Spjd 1382185029Spjd if (asize == 0) { 1383185029Spjd *tr_cookiep = NULL; 1384185029Spjd return (0); 1385185029Spjd } 1386185029Spjd 1387168404Spjd tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 1388168404Spjd list_create(tr_list, sizeof (struct tempreserve), 1389168404Spjd offsetof(struct tempreserve, tr_node)); 1390185029Spjd ASSERT3S(asize, >, 0); 1391168404Spjd 1392339141Smav err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); 1393168404Spjd if (err == 0) { 1394168404Spjd struct tempreserve *tr; 1395168404Spjd 1396185029Spjd tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); 1397185029Spjd tr->tr_size = lsize; 1398185029Spjd list_insert_tail(tr_list, tr); 1399185029Spjd } else { 1400185029Spjd if (err == EAGAIN) { 1401258632Savg /* 1402258632Savg * If arc_memory_throttle() detected that pageout 1403258632Savg * is running and we are low on memory, we delay new 1404258632Savg * non-pageout transactions to give pageout an 1405258632Savg * advantage. 1406258632Savg * 1407258632Savg * It is unfortunate to be delaying while the caller's 1408258632Savg * locks are held. 
1409258632Savg */ 1410255437Sdelphij txg_delay(dd->dd_pool, tx->tx_txg, 1411255437Sdelphij MSEC2NSEC(10), MSEC2NSEC(10)); 1412249195Smm err = SET_ERROR(ERESTART); 1413168404Spjd } 1414168404Spjd } 1415168404Spjd 1416185029Spjd if (err == 0) { 1417321547Smav err = dsl_dir_tempreserve_impl(dd, asize, netfree, 1418321547Smav B_FALSE, tr_list, tx, B_TRUE); 1419185029Spjd } 1420185029Spjd 1421248571Smm if (err != 0) 1422168404Spjd dsl_dir_tempreserve_clear(tr_list, tx); 1423168404Spjd else 1424168404Spjd *tr_cookiep = tr_list; 1425185029Spjd 1426168404Spjd return (err); 1427168404Spjd} 1428168404Spjd 1429168404Spjd/* 1430168404Spjd * Clear a temporary reservation that we previously made with 1431168404Spjd * dsl_dir_tempreserve_space(). 1432168404Spjd */ 1433168404Spjdvoid 1434168404Spjddsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) 1435168404Spjd{ 1436168404Spjd int txgidx = tx->tx_txg & TXG_MASK; 1437168404Spjd list_t *tr_list = tr_cookie; 1438168404Spjd struct tempreserve *tr; 1439168404Spjd 1440168404Spjd ASSERT3U(tx->tx_txg, !=, 0); 1441168404Spjd 1442185029Spjd if (tr_cookie == NULL) 1443185029Spjd return; 1444185029Spjd 1445258632Savg while ((tr = list_head(tr_list)) != NULL) { 1446258632Savg if (tr->tr_ds) { 1447168404Spjd mutex_enter(&tr->tr_ds->dd_lock); 1448168404Spjd ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, 1449168404Spjd tr->tr_size); 1450168404Spjd tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; 1451168404Spjd mutex_exit(&tr->tr_ds->dd_lock); 1452185029Spjd } else { 1453185029Spjd arc_tempreserve_clear(tr->tr_size); 1454168404Spjd } 1455168404Spjd list_remove(tr_list, tr); 1456168404Spjd kmem_free(tr, sizeof (struct tempreserve)); 1457168404Spjd } 1458168404Spjd 1459168404Spjd kmem_free(tr_list, sizeof (list_t)); 1460168404Spjd} 1461168404Spjd 1462258632Savg/* 1463258632Savg * This should be called from open context when we think we're going to write 1464258632Savg * or free space, for example when dirtying data. 
Be conservative; it's okay 1465258632Savg * to write less space or free more, but we don't want to write more or free 1466258632Savg * less than the amount specified. 1467258632Savg */ 1468258632Savgvoid 1469258632Savgdsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) 1470168404Spjd{ 1471168404Spjd int64_t parent_space; 1472168404Spjd uint64_t est_used; 1473168404Spjd 1474168404Spjd mutex_enter(&dd->dd_lock); 1475168404Spjd if (space > 0) 1476168404Spjd dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; 1477168404Spjd 1478275782Sdelphij est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; 1479168404Spjd parent_space = parent_delta(dd, est_used, space); 1480168404Spjd mutex_exit(&dd->dd_lock); 1481168404Spjd 1482168404Spjd /* Make sure that we clean up dd_space_to* */ 1483168404Spjd dsl_dir_dirty(dd, tx); 1484168404Spjd 1485168404Spjd /* XXX this is potentially expensive and unnecessary... */ 1486168404Spjd if (parent_space && dd->dd_parent) 1487258632Savg dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); 1488168404Spjd} 1489168404Spjd 1490168404Spjd/* call from syncing context when we actually write/free space for this dd */ 1491168404Spjdvoid 1492185029Spjddsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, 1493168404Spjd int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) 1494168404Spjd{ 1495168404Spjd int64_t accounted_delta; 1496254757Sdelphij 1497254757Sdelphij /* 1498254757Sdelphij * dsl_dataset_set_refreservation_sync_impl() calls this with 1499254757Sdelphij * dd_lock held, so that it can atomically update 1500254757Sdelphij * ds->ds_reserved and the dsl_dir accounting, so that 1501254757Sdelphij * dsl_dataset_check_quota() can see dataset and dir accounting 1502254757Sdelphij * consistently. 
1503254757Sdelphij */ 1504185029Spjd boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); 1505168404Spjd 1506168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 1507185029Spjd ASSERT(type < DD_USED_NUM); 1508168404Spjd 1509254757Sdelphij dmu_buf_will_dirty(dd->dd_dbuf, tx); 1510254757Sdelphij 1511185029Spjd if (needlock) 1512185029Spjd mutex_enter(&dd->dd_lock); 1513275782Sdelphij accounted_delta = 1514275782Sdelphij parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); 1515275782Sdelphij ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); 1516168404Spjd ASSERT(compressed >= 0 || 1517275782Sdelphij dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); 1518168404Spjd ASSERT(uncompressed >= 0 || 1519275782Sdelphij dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); 1520275782Sdelphij dsl_dir_phys(dd)->dd_used_bytes += used; 1521275782Sdelphij dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; 1522275782Sdelphij dsl_dir_phys(dd)->dd_compressed_bytes += compressed; 1523168404Spjd 1524275782Sdelphij if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { 1525185029Spjd ASSERT(used > 0 || 1526275782Sdelphij dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); 1527275782Sdelphij dsl_dir_phys(dd)->dd_used_breakdown[type] += used; 1528185029Spjd#ifdef DEBUG 1529185029Spjd dd_used_t t; 1530185029Spjd uint64_t u = 0; 1531185029Spjd for (t = 0; t < DD_USED_NUM; t++) 1532275782Sdelphij u += dsl_dir_phys(dd)->dd_used_breakdown[t]; 1533275782Sdelphij ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); 1534185029Spjd#endif 1535185029Spjd } 1536185029Spjd if (needlock) 1537185029Spjd mutex_exit(&dd->dd_lock); 1538185029Spjd 1539168404Spjd if (dd->dd_parent != NULL) { 1540185029Spjd dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, 1541168404Spjd accounted_delta, compressed, uncompressed, tx); 1542185029Spjd dsl_dir_transfer_space(dd->dd_parent, 1543185029Spjd used - accounted_delta, 1544277419Smav DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); 1545168404Spjd } 
1546168404Spjd} 1547168404Spjd 1548185029Spjdvoid 1549185029Spjddsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, 1550185029Spjd dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) 1551185029Spjd{ 1552277419Smav ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); 1553185029Spjd ASSERT(oldtype < DD_USED_NUM); 1554185029Spjd ASSERT(newtype < DD_USED_NUM); 1555185029Spjd 1556275782Sdelphij if (delta == 0 || 1557275782Sdelphij !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) 1558185029Spjd return; 1559185029Spjd 1560277419Smav if (tx != NULL) 1561277419Smav dmu_buf_will_dirty(dd->dd_dbuf, tx); 1562254757Sdelphij mutex_enter(&dd->dd_lock); 1563185029Spjd ASSERT(delta > 0 ? 1564275782Sdelphij dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : 1565275782Sdelphij dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); 1566275782Sdelphij ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); 1567275782Sdelphij dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; 1568275782Sdelphij dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; 1569254757Sdelphij mutex_exit(&dd->dd_lock); 1570185029Spjd} 1571185029Spjd 1572248571Smmtypedef struct dsl_dir_set_qr_arg { 1573248571Smm const char *ddsqra_name; 1574248571Smm zprop_source_t ddsqra_source; 1575248571Smm uint64_t ddsqra_value; 1576248571Smm} dsl_dir_set_qr_arg_t; 1577248571Smm 1578168404Spjdstatic int 1579248571Smmdsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) 1580168404Spjd{ 1581248571Smm dsl_dir_set_qr_arg_t *ddsqra = arg; 1582248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 1583248571Smm dsl_dataset_t *ds; 1584248571Smm int error; 1585248571Smm uint64_t towrite, newval; 1586168404Spjd 1587248571Smm error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); 1588248571Smm if (error != 0) 1589248571Smm return (error); 1590219089Spjd 1591248571Smm error = dsl_prop_predict(ds->ds_dir, "quota", 1592248571Smm ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); 1593248571Smm if (error != 0) { 1594248571Smm 
dsl_dataset_rele(ds, FTAG); 1595248571Smm return (error); 1596248571Smm } 1597248571Smm 1598248571Smm if (newval == 0) { 1599248571Smm dsl_dataset_rele(ds, FTAG); 1600168404Spjd return (0); 1601248571Smm } 1602168404Spjd 1603248571Smm mutex_enter(&ds->ds_dir->dd_lock); 1604168404Spjd /* 1605168404Spjd * If we are doing the preliminary check in open context, and 1606168404Spjd * there are pending changes, then don't fail it, since the 1607185029Spjd * pending changes could under-estimate the amount of space to be 1608168404Spjd * freed up. 1609168404Spjd */ 1610248571Smm towrite = dsl_dir_space_towrite(ds->ds_dir); 1611168404Spjd if ((dmu_tx_is_syncing(tx) || towrite == 0) && 1612275782Sdelphij (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || 1613275782Sdelphij newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { 1614249195Smm error = SET_ERROR(ENOSPC); 1615168404Spjd } 1616248571Smm mutex_exit(&ds->ds_dir->dd_lock); 1617248571Smm dsl_dataset_rele(ds, FTAG); 1618248571Smm return (error); 1619168404Spjd} 1620168404Spjd 1621168404Spjdstatic void 1622248571Smmdsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) 1623168404Spjd{ 1624248571Smm dsl_dir_set_qr_arg_t *ddsqra = arg; 1625248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 1626248571Smm dsl_dataset_t *ds; 1627248571Smm uint64_t newval; 1628168404Spjd 1629248571Smm VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); 1630219089Spjd 1631249787Smm if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { 1632249787Smm dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), 1633249787Smm ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, 1634249787Smm &ddsqra->ddsqra_value, tx); 1635168404Spjd 1636249787Smm VERIFY0(dsl_prop_get_int_ds(ds, 1637249787Smm zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); 1638249787Smm } else { 1639249787Smm newval = ddsqra->ddsqra_value; 1640249787Smm spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", 1641249787Smm zfs_prop_to_name(ZFS_PROP_QUOTA), 
(longlong_t)newval); 1642249787Smm } 1643248571Smm 1644248571Smm dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1645248571Smm mutex_enter(&ds->ds_dir->dd_lock); 1646275782Sdelphij dsl_dir_phys(ds->ds_dir)->dd_quota = newval; 1647248571Smm mutex_exit(&ds->ds_dir->dd_lock); 1648248571Smm dsl_dataset_rele(ds, FTAG); 1649168404Spjd} 1650168404Spjd 1651168404Spjdint 1652219089Spjddsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) 1653168404Spjd{ 1654248571Smm dsl_dir_set_qr_arg_t ddsqra; 1655168404Spjd 1656248571Smm ddsqra.ddsqra_name = ddname; 1657248571Smm ddsqra.ddsqra_source = source; 1658248571Smm ddsqra.ddsqra_value = quota; 1659219089Spjd 1660248571Smm return (dsl_sync_task(ddname, dsl_dir_set_quota_check, 1661332547Smav dsl_dir_set_quota_sync, &ddsqra, 0, 1662332547Smav ZFS_SPACE_CHECK_EXTRA_RESERVED)); 1663168404Spjd} 1664168404Spjd 1665185029Spjdint 1666248571Smmdsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) 1667168404Spjd{ 1668248571Smm dsl_dir_set_qr_arg_t *ddsqra = arg; 1669248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 1670248571Smm dsl_dataset_t *ds; 1671248571Smm dsl_dir_t *dd; 1672248571Smm uint64_t newval, used, avail; 1673248571Smm int error; 1674168404Spjd 1675248571Smm error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); 1676248571Smm if (error != 0) 1677248571Smm return (error); 1678248571Smm dd = ds->ds_dir; 1679219089Spjd 1680168404Spjd /* 1681168404Spjd * If we are doing the preliminary check in open context, the 1682168404Spjd * space estimates may be inaccurate. 
1683168404Spjd */ 1684248571Smm if (!dmu_tx_is_syncing(tx)) { 1685248571Smm dsl_dataset_rele(ds, FTAG); 1686168404Spjd return (0); 1687248571Smm } 1688168404Spjd 1689248571Smm error = dsl_prop_predict(ds->ds_dir, 1690248571Smm zfs_prop_to_name(ZFS_PROP_RESERVATION), 1691248571Smm ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); 1692248571Smm if (error != 0) { 1693248571Smm dsl_dataset_rele(ds, FTAG); 1694248571Smm return (error); 1695248571Smm } 1696248571Smm 1697168404Spjd mutex_enter(&dd->dd_lock); 1698275782Sdelphij used = dsl_dir_phys(dd)->dd_used_bytes; 1699168404Spjd mutex_exit(&dd->dd_lock); 1700168404Spjd 1701168404Spjd if (dd->dd_parent) { 1702168404Spjd avail = dsl_dir_space_available(dd->dd_parent, 1703168404Spjd NULL, 0, FALSE); 1704168404Spjd } else { 1705332547Smav avail = dsl_pool_adjustedsize(dd->dd_pool, 1706332547Smav ZFS_SPACE_CHECK_NORMAL) - used; 1707168404Spjd } 1708168404Spjd 1709275782Sdelphij if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { 1710248571Smm uint64_t delta = MAX(used, newval) - 1711275782Sdelphij MAX(used, dsl_dir_phys(dd)->dd_reserved); 1712209962Smm 1713248571Smm if (delta > avail || 1714275782Sdelphij (dsl_dir_phys(dd)->dd_quota > 0 && 1715275782Sdelphij newval > dsl_dir_phys(dd)->dd_quota)) 1716249195Smm error = SET_ERROR(ENOSPC); 1717209962Smm } 1718209962Smm 1719248571Smm dsl_dataset_rele(ds, FTAG); 1720248571Smm return (error); 1721168404Spjd} 1722168404Spjd 1723248571Smmvoid 1724248571Smmdsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) 1725168404Spjd{ 1726168404Spjd uint64_t used; 1727168404Spjd int64_t delta; 1728168404Spjd 1729185029Spjd dmu_buf_will_dirty(dd->dd_dbuf, tx); 1730185029Spjd 1731168404Spjd mutex_enter(&dd->dd_lock); 1732275782Sdelphij used = dsl_dir_phys(dd)->dd_used_bytes; 1733275782Sdelphij delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); 1734275782Sdelphij dsl_dir_phys(dd)->dd_reserved = value; 1735168404Spjd 1736168404Spjd if 
(dd->dd_parent != NULL) { 1737168404Spjd /* Roll up this additional usage into our ancestors */ 1738185029Spjd dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, 1739185029Spjd delta, 0, 0, tx); 1740168404Spjd } 1741185029Spjd mutex_exit(&dd->dd_lock); 1742168404Spjd} 1743168404Spjd 1744248571Smmstatic void 1745248571Smmdsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) 1746168404Spjd{ 1747248571Smm dsl_dir_set_qr_arg_t *ddsqra = arg; 1748248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 1749219089Spjd dsl_dataset_t *ds; 1750248571Smm uint64_t newval; 1751168404Spjd 1752248571Smm VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); 1753219089Spjd 1754249787Smm if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { 1755249787Smm dsl_prop_set_sync_impl(ds, 1756249787Smm zfs_prop_to_name(ZFS_PROP_RESERVATION), 1757249787Smm ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, 1758249787Smm &ddsqra->ddsqra_value, tx); 1759219089Spjd 1760249787Smm VERIFY0(dsl_prop_get_int_ds(ds, 1761249787Smm zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); 1762249787Smm } else { 1763249787Smm newval = ddsqra->ddsqra_value; 1764249787Smm spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", 1765249787Smm zfs_prop_to_name(ZFS_PROP_RESERVATION), 1766249787Smm (longlong_t)newval); 1767249787Smm } 1768219089Spjd 1769248571Smm dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); 1770248571Smm dsl_dataset_rele(ds, FTAG); 1771248571Smm} 1772219089Spjd 1773248571Smmint 1774248571Smmdsl_dir_set_reservation(const char *ddname, zprop_source_t source, 1775248571Smm uint64_t reservation) 1776248571Smm{ 1777248571Smm dsl_dir_set_qr_arg_t ddsqra; 1778219089Spjd 1779248571Smm ddsqra.ddsqra_name = ddname; 1780248571Smm ddsqra.ddsqra_source = source; 1781248571Smm ddsqra.ddsqra_value = reservation; 1782248571Smm 1783248571Smm return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, 1784332547Smav dsl_dir_set_reservation_sync, &ddsqra, 0, 1785332547Smav 
ZFS_SPACE_CHECK_EXTRA_RESERVED)); 1786168404Spjd} 1787168404Spjd 1788168404Spjdstatic dsl_dir_t * 1789168404Spjdclosest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) 1790168404Spjd{ 1791168404Spjd for (; ds1; ds1 = ds1->dd_parent) { 1792168404Spjd dsl_dir_t *dd; 1793168404Spjd for (dd = ds2; dd; dd = dd->dd_parent) { 1794168404Spjd if (ds1 == dd) 1795168404Spjd return (dd); 1796168404Spjd } 1797168404Spjd } 1798168404Spjd return (NULL); 1799168404Spjd} 1800168404Spjd 1801168404Spjd/* 1802168404Spjd * If delta is applied to dd, how much of that delta would be applied to 1803168404Spjd * ancestor? Syncing context only. 1804168404Spjd */ 1805168404Spjdstatic int64_t 1806168404Spjdwould_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) 1807168404Spjd{ 1808168404Spjd if (dd == ancestor) 1809168404Spjd return (delta); 1810168404Spjd 1811168404Spjd mutex_enter(&dd->dd_lock); 1812275782Sdelphij delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); 1813168404Spjd mutex_exit(&dd->dd_lock); 1814168404Spjd return (would_change(dd->dd_parent, delta, ancestor)); 1815168404Spjd} 1816168404Spjd 1817248571Smmtypedef struct dsl_dir_rename_arg { 1818248571Smm const char *ddra_oldname; 1819248571Smm const char *ddra_newname; 1820264835Sdelphij cred_t *ddra_cred; 1821248571Smm} dsl_dir_rename_arg_t; 1822168404Spjd 1823339129Smavtypedef struct dsl_valid_rename_arg { 1824339129Smav int char_delta; 1825339129Smav int nest_delta; 1826339129Smav} dsl_valid_rename_arg_t; 1827339129Smav 1828248571Smm/* ARGSUSED */ 1829168404Spjdstatic int 1830248571Smmdsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 1831168404Spjd{ 1832339129Smav dsl_valid_rename_arg_t *dvra = arg; 1833307108Smav char namebuf[ZFS_MAX_DATASET_NAME_LEN]; 1834168404Spjd 1835248571Smm dsl_dataset_name(ds, namebuf); 1836248571Smm 1837339129Smav ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), 1838339129Smav <, ZFS_MAX_DATASET_NAME_LEN); 1839339129Smav int namelen = strlen(namebuf) + 
dvra->char_delta; 1840339129Smav int depth = get_dataset_depth(namebuf) + dvra->nest_delta; 1841339129Smav 1842339129Smav if (namelen >= ZFS_MAX_DATASET_NAME_LEN) 1843249195Smm return (SET_ERROR(ENAMETOOLONG)); 1844339129Smav if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) 1845339129Smav return (SET_ERROR(ENAMETOOLONG)); 1846248571Smm return (0); 1847248571Smm} 1848248571Smm 1849248571Smmstatic int 1850248571Smmdsl_dir_rename_check(void *arg, dmu_tx_t *tx) 1851248571Smm{ 1852248571Smm dsl_dir_rename_arg_t *ddra = arg; 1853248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 1854248571Smm dsl_dir_t *dd, *newparent; 1855339129Smav dsl_valid_rename_arg_t dvra; 1856359722Sfreqlabs dsl_dataset_t *parentds; 1857359722Sfreqlabs objset_t *parentos; 1858248571Smm const char *mynewname; 1859248571Smm int error; 1860248571Smm 1861248571Smm /* target dir should exist */ 1862248571Smm error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); 1863248571Smm if (error != 0) 1864248571Smm return (error); 1865248571Smm 1866248571Smm /* new parent should exist */ 1867248571Smm error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, 1868248571Smm &newparent, &mynewname); 1869248571Smm if (error != 0) { 1870248571Smm dsl_dir_rele(dd, FTAG); 1871248571Smm return (error); 1872226676Spjd } 1873168404Spjd 1874248571Smm /* can't rename to different pool */ 1875248571Smm if (dd->dd_pool != newparent->dd_pool) { 1876248571Smm dsl_dir_rele(newparent, FTAG); 1877248571Smm dsl_dir_rele(dd, FTAG); 1878282127Savg return (SET_ERROR(EXDEV)); 1879248571Smm } 1880248571Smm 1881248571Smm /* new name should not already exist */ 1882248571Smm if (mynewname == NULL) { 1883248571Smm dsl_dir_rele(newparent, FTAG); 1884248571Smm dsl_dir_rele(dd, FTAG); 1885249195Smm return (SET_ERROR(EEXIST)); 1886248571Smm } 1887168404Spjd 1888359722Sfreqlabs /* can't rename below anything but filesystems (eg. 
no ZVOLs) */ 1889359722Sfreqlabs error = dsl_dataset_hold_obj(newparent->dd_pool, 1890359722Sfreqlabs dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); 1891359722Sfreqlabs if (error != 0) { 1892359722Sfreqlabs dsl_dir_rele(newparent, FTAG); 1893359722Sfreqlabs dsl_dir_rele(dd, FTAG); 1894359722Sfreqlabs return (error); 1895359722Sfreqlabs } 1896359722Sfreqlabs error = dmu_objset_from_ds(parentds, &parentos); 1897359722Sfreqlabs if (error != 0) { 1898359722Sfreqlabs dsl_dataset_rele(parentds, FTAG); 1899359722Sfreqlabs dsl_dir_rele(newparent, FTAG); 1900359722Sfreqlabs dsl_dir_rele(dd, FTAG); 1901359722Sfreqlabs return (error); 1902359722Sfreqlabs } 1903359722Sfreqlabs if (dmu_objset_type(parentos) != DMU_OST_ZFS) { 1904359722Sfreqlabs dsl_dataset_rele(parentds, FTAG); 1905359722Sfreqlabs dsl_dir_rele(newparent, FTAG); 1906359722Sfreqlabs dsl_dir_rele(dd, FTAG); 1907359722Sfreqlabs return (error); 1908359722Sfreqlabs } 1909359722Sfreqlabs dsl_dataset_rele(parentds, FTAG); 1910359722Sfreqlabs 1911339129Smav ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), 1912339129Smav <, ZFS_MAX_DATASET_NAME_LEN); 1913339129Smav ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), 1914339129Smav <, ZFS_MAX_DATASET_NAME_LEN); 1915339129Smav dvra.char_delta = strlen(ddra->ddra_newname) 1916339129Smav - strlen(ddra->ddra_oldname); 1917339129Smav dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) 1918339129Smav - get_dataset_depth(ddra->ddra_oldname); 1919339129Smav 1920248571Smm /* if the name length is growing, validate child name lengths */ 1921339129Smav if (dvra.char_delta > 0 || dvra.nest_delta > 0) { 1922248571Smm error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, 1923339129Smav &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 1924248571Smm if (error != 0) { 1925248571Smm dsl_dir_rele(newparent, FTAG); 1926248571Smm dsl_dir_rele(dd, FTAG); 1927248571Smm return (error); 1928248571Smm } 1929248571Smm } 1930248571Smm 
1931264835Sdelphij if (dmu_tx_is_syncing(tx)) { 1932266915Sdelphij if (spa_feature_is_active(dp->dp_spa, 1933264835Sdelphij SPA_FEATURE_FS_SS_LIMIT)) { 1934264835Sdelphij /* 1935264835Sdelphij * Although this is the check function and we don't 1936264835Sdelphij * normally make on-disk changes in check functions, 1937264835Sdelphij * we need to do that here. 1938264835Sdelphij * 1939264835Sdelphij * Ensure this portion of the tree's counts have been 1940264835Sdelphij * initialized in case the new parent has limits set. 1941264835Sdelphij */ 1942264835Sdelphij dsl_dir_init_fs_ss_count(dd, tx); 1943264835Sdelphij } 1944264835Sdelphij } 1945264835Sdelphij 1946248571Smm if (newparent != dd->dd_parent) { 1947168404Spjd /* is there enough space? */ 1948168404Spjd uint64_t myspace = 1949275782Sdelphij MAX(dsl_dir_phys(dd)->dd_used_bytes, 1950275782Sdelphij dsl_dir_phys(dd)->dd_reserved); 1951264835Sdelphij objset_t *os = dd->dd_pool->dp_meta_objset; 1952264835Sdelphij uint64_t fs_cnt = 0; 1953264835Sdelphij uint64_t ss_cnt = 0; 1954168404Spjd 1955264835Sdelphij if (dsl_dir_is_zapified(dd)) { 1956264835Sdelphij int err; 1957264835Sdelphij 1958264835Sdelphij err = zap_lookup(os, dd->dd_object, 1959264835Sdelphij DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, 1960264835Sdelphij &fs_cnt); 1961266915Sdelphij if (err != ENOENT && err != 0) { 1962266915Sdelphij dsl_dir_rele(newparent, FTAG); 1963266915Sdelphij dsl_dir_rele(dd, FTAG); 1964264835Sdelphij return (err); 1965266915Sdelphij } 1966264835Sdelphij 1967264835Sdelphij /* 1968264835Sdelphij * have to add 1 for the filesystem itself that we're 1969264835Sdelphij * moving 1970264835Sdelphij */ 1971264835Sdelphij fs_cnt++; 1972264835Sdelphij 1973264835Sdelphij err = zap_lookup(os, dd->dd_object, 1974264835Sdelphij DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, 1975264835Sdelphij &ss_cnt); 1976266915Sdelphij if (err != ENOENT && err != 0) { 1977266915Sdelphij dsl_dir_rele(newparent, FTAG); 1978266915Sdelphij dsl_dir_rele(dd, 
FTAG); 1979264835Sdelphij return (err); 1980266915Sdelphij } 1981264835Sdelphij } 1982264835Sdelphij 1983168404Spjd /* no rename into our descendant */ 1984248571Smm if (closest_common_ancestor(dd, newparent) == dd) { 1985248571Smm dsl_dir_rele(newparent, FTAG); 1986248571Smm dsl_dir_rele(dd, FTAG); 1987249195Smm return (SET_ERROR(EINVAL)); 1988248571Smm } 1989168404Spjd 1990248571Smm error = dsl_dir_transfer_possible(dd->dd_parent, 1991264835Sdelphij newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); 1992248571Smm if (error != 0) { 1993248571Smm dsl_dir_rele(newparent, FTAG); 1994248571Smm dsl_dir_rele(dd, FTAG); 1995248571Smm return (error); 1996248571Smm } 1997168404Spjd } 1998168404Spjd 1999248571Smm dsl_dir_rele(newparent, FTAG); 2000248571Smm dsl_dir_rele(dd, FTAG); 2001168404Spjd return (0); 2002168404Spjd} 2003168404Spjd 2004168404Spjdstatic void 2005248571Smmdsl_dir_rename_sync(void *arg, dmu_tx_t *tx) 2006168404Spjd{ 2007248571Smm dsl_dir_rename_arg_t *ddra = arg; 2008248571Smm dsl_pool_t *dp = dmu_tx_pool(tx); 2009248571Smm dsl_dir_t *dd, *newparent; 2010248571Smm const char *mynewname; 2011248571Smm int error; 2012168404Spjd objset_t *mos = dp->dp_meta_objset; 2013168404Spjd 2014248571Smm VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); 2015248571Smm VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, 2016248571Smm &mynewname)); 2017248571Smm 2018248571Smm /* Log this before we change the name. */ 2019248571Smm spa_history_log_internal_dd(dd, "rename", tx, 2020248571Smm "-> %s", ddra->ddra_newname); 2021248571Smm 2022248571Smm if (newparent != dd->dd_parent) { 2023264835Sdelphij objset_t *os = dd->dd_pool->dp_meta_objset; 2024264835Sdelphij uint64_t fs_cnt = 0; 2025264835Sdelphij uint64_t ss_cnt = 0; 2026264835Sdelphij 2027264835Sdelphij /* 2028264835Sdelphij * We already made sure the dd counts were initialized in the 2029264835Sdelphij * check function. 
2030264835Sdelphij */ 2031266915Sdelphij if (spa_feature_is_active(dp->dp_spa, 2032264835Sdelphij SPA_FEATURE_FS_SS_LIMIT)) { 2033264835Sdelphij VERIFY0(zap_lookup(os, dd->dd_object, 2034264835Sdelphij DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, 2035264835Sdelphij &fs_cnt)); 2036264835Sdelphij /* add 1 for the filesystem itself that we're moving */ 2037264835Sdelphij fs_cnt++; 2038264835Sdelphij 2039264835Sdelphij VERIFY0(zap_lookup(os, dd->dd_object, 2040264835Sdelphij DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, 2041264835Sdelphij &ss_cnt)); 2042264835Sdelphij } 2043264835Sdelphij 2044264835Sdelphij dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, 2045264835Sdelphij DD_FIELD_FILESYSTEM_COUNT, tx); 2046264835Sdelphij dsl_fs_ss_count_adjust(newparent, fs_cnt, 2047264835Sdelphij DD_FIELD_FILESYSTEM_COUNT, tx); 2048264835Sdelphij 2049264835Sdelphij dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, 2050264835Sdelphij DD_FIELD_SNAPSHOT_COUNT, tx); 2051264835Sdelphij dsl_fs_ss_count_adjust(newparent, ss_cnt, 2052264835Sdelphij DD_FIELD_SNAPSHOT_COUNT, tx); 2053264835Sdelphij 2054185029Spjd dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, 2055275782Sdelphij -dsl_dir_phys(dd)->dd_used_bytes, 2056275782Sdelphij -dsl_dir_phys(dd)->dd_compressed_bytes, 2057275782Sdelphij -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); 2058248571Smm dsl_dir_diduse_space(newparent, DD_USED_CHILD, 2059275782Sdelphij dsl_dir_phys(dd)->dd_used_bytes, 2060275782Sdelphij dsl_dir_phys(dd)->dd_compressed_bytes, 2061275782Sdelphij dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); 2062185029Spjd 2063275782Sdelphij if (dsl_dir_phys(dd)->dd_reserved > 2064275782Sdelphij dsl_dir_phys(dd)->dd_used_bytes) { 2065275782Sdelphij uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - 2066275782Sdelphij dsl_dir_phys(dd)->dd_used_bytes; 2067185029Spjd 2068185029Spjd dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, 2069185029Spjd -unused_rsrv, 0, 0, tx); 2070248571Smm dsl_dir_diduse_space(newparent, 
DD_USED_CHILD_RSRV, 2071185029Spjd unused_rsrv, 0, 0, tx); 2072185029Spjd } 2073168404Spjd } 2074168404Spjd 2075168404Spjd dmu_buf_will_dirty(dd->dd_dbuf, tx); 2076168404Spjd 2077168404Spjd /* remove from old parent zapobj */ 2078275782Sdelphij error = zap_remove(mos, 2079275782Sdelphij dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, 2080168404Spjd dd->dd_myname, tx); 2081248571Smm ASSERT0(error); 2082168404Spjd 2083248571Smm (void) strcpy(dd->dd_myname, mynewname); 2084248571Smm dsl_dir_rele(dd->dd_parent, dd); 2085275782Sdelphij dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; 2086248571Smm VERIFY0(dsl_dir_hold_obj(dp, 2087248571Smm newparent->dd_object, NULL, dd, &dd->dd_parent)); 2088168404Spjd 2089168404Spjd /* add to new parent zapobj */ 2090275782Sdelphij VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, 2091248571Smm dd->dd_myname, 8, 1, &dd->dd_object, tx)); 2092248571Smm 2093248571Smm#ifdef __FreeBSD__ 2094219320Spjd#ifdef _KERNEL 2095248571Smm zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); 2096248571Smm zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); 2097219320Spjd#endif 2098248571Smm#endif 2099185029Spjd 2100248571Smm dsl_prop_notify_all(dd); 2101248571Smm 2102248571Smm dsl_dir_rele(newparent, FTAG); 2103248571Smm dsl_dir_rele(dd, FTAG); 2104168404Spjd} 2105168404Spjd 2106168404Spjdint 2107248571Smmdsl_dir_rename(const char *oldname, const char *newname) 2108168404Spjd{ 2109248571Smm dsl_dir_rename_arg_t ddra; 2110168404Spjd 2111248571Smm ddra.ddra_oldname = oldname; 2112248571Smm ddra.ddra_newname = newname; 2113264835Sdelphij ddra.ddra_cred = CRED(); 2114168404Spjd 2115248571Smm return (dsl_sync_task(oldname, 2116268473Sdelphij dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 2117268473Sdelphij 3, ZFS_SPACE_CHECK_RESERVED)); 2118168404Spjd} 2119168404Spjd 2120168404Spjdint 2121264835Sdelphijdsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, 2122264835Sdelphij uint64_t fs_cnt, uint64_t 
ss_cnt, uint64_t space, cred_t *cr) 2123168404Spjd{ 2124168404Spjd dsl_dir_t *ancestor; 2125168404Spjd int64_t adelta; 2126168404Spjd uint64_t avail; 2127264835Sdelphij int err; 2128168404Spjd 2129168404Spjd ancestor = closest_common_ancestor(sdd, tdd); 2130168404Spjd adelta = would_change(sdd, -space, ancestor); 2131168404Spjd avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); 2132168404Spjd if (avail < space) 2133249195Smm return (SET_ERROR(ENOSPC)); 2134168404Spjd 2135264835Sdelphij err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, 2136264835Sdelphij ancestor, cr); 2137264835Sdelphij if (err != 0) 2138264835Sdelphij return (err); 2139264835Sdelphij err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, 2140264835Sdelphij ancestor, cr); 2141264835Sdelphij if (err != 0) 2142264835Sdelphij return (err); 2143264835Sdelphij 2144168404Spjd return (0); 2145168404Spjd} 2146219089Spjd 2147219089Spjdtimestruc_t 2148219089Spjddsl_dir_snap_cmtime(dsl_dir_t *dd) 2149219089Spjd{ 2150219089Spjd timestruc_t t; 2151219089Spjd 2152219089Spjd mutex_enter(&dd->dd_lock); 2153219089Spjd t = dd->dd_snap_cmtime; 2154219089Spjd mutex_exit(&dd->dd_lock); 2155219089Spjd 2156219089Spjd return (t); 2157219089Spjd} 2158219089Spjd 2159219089Spjdvoid 2160219089Spjddsl_dir_snap_cmtime_update(dsl_dir_t *dd) 2161219089Spjd{ 2162219089Spjd timestruc_t t; 2163219089Spjd 2164219089Spjd gethrestime(&t); 2165219089Spjd mutex_enter(&dd->dd_lock); 2166219089Spjd dd->dd_snap_cmtime = t; 2167219089Spjd mutex_exit(&dd->dd_lock); 2168219089Spjd} 2169259813Sdelphij 2170259813Sdelphijvoid 2171259813Sdelphijdsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) 2172259813Sdelphij{ 2173259813Sdelphij objset_t *mos = dd->dd_pool->dp_meta_objset; 2174259813Sdelphij dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); 2175259813Sdelphij} 2176264835Sdelphij 2177264835Sdelphijboolean_t 2178264835Sdelphijdsl_dir_is_zapified(dsl_dir_t *dd) 2179264835Sdelphij{ 2180264835Sdelphij 
dmu_object_info_t doi; 2181264835Sdelphij 2182264835Sdelphij dmu_object_info_from_db(dd->dd_dbuf, &doi); 2183264835Sdelphij return (doi.doi_type == DMU_OTN_ZAP_METADATA); 2184264835Sdelphij} 2185