/* dmu_tx.c — FreeBSD SVN revision 274337 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25164410Ssyrinx */ 26164410Ssyrinx 27164410Ssyrinx#include <sys/dmu.h> 28164410Ssyrinx#include <sys/dmu_impl.h> 29164410Ssyrinx#include <sys/dbuf.h> 30164410Ssyrinx#include <sys/dmu_tx.h> 31164410Ssyrinx#include <sys/dmu_objset.h> 32164410Ssyrinx#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ 33164410Ssyrinx#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ 34164410Ssyrinx#include <sys/dsl_pool.h> 35164410Ssyrinx#include <sys/zap_impl.h> /* for fzap_default_block_shift */ 36164410Ssyrinx#include <sys/spa.h> 37164410Ssyrinx#include <sys/sa.h> 38164410Ssyrinx#include <sys/sa_impl.h> 39164410Ssyrinx#include <sys/zfs_context.h> 40164410Ssyrinx#include <sys/varargs.h> 41164410Ssyrinx 42164410Ssyrinxtypedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, 43164410Ssyrinx uint64_t arg1, uint64_t arg2); 44164410Ssyrinx 45164410Ssyrinx 46164410Ssyrinxdmu_tx_t * 47164410Ssyrinxdmu_tx_create_dd(dsl_dir_t *dd) 48164410Ssyrinx{ 49164410Ssyrinx dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); 50164410Ssyrinx tx->tx_dir = dd; 51164410Ssyrinx if (dd != NULL) 52164410Ssyrinx tx->tx_pool = dd->dd_pool; 53164410Ssyrinx list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), 54164410Ssyrinx offsetof(dmu_tx_hold_t, txh_node)); 55164410Ssyrinx list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), 56164410Ssyrinx offsetof(dmu_tx_callback_t, dcb_node)); 57164410Ssyrinx tx->tx_start = gethrtime(); 58164410Ssyrinx#ifdef ZFS_DEBUG 59164410Ssyrinx refcount_create(&tx->tx_space_written); 60164410Ssyrinx refcount_create(&tx->tx_space_freed); 61164410Ssyrinx#endif 62164410Ssyrinx return (tx); 63164410Ssyrinx} 64164410Ssyrinx 65164410Ssyrinxdmu_tx_t * 66164410Ssyrinxdmu_tx_create(objset_t *os) 67164410Ssyrinx{ 68164410Ssyrinx dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); 69164410Ssyrinx tx->tx_objset = os; 70164410Ssyrinx tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); 71164410Ssyrinx return (tx); 72164410Ssyrinx} 
73164410Ssyrinx 74164410Ssyrinxdmu_tx_t * 75164410Ssyrinxdmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) 76164410Ssyrinx{ 77164410Ssyrinx dmu_tx_t *tx = dmu_tx_create_dd(NULL); 78164410Ssyrinx 79164410Ssyrinx ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 80164410Ssyrinx tx->tx_pool = dp; 81164410Ssyrinx tx->tx_txg = txg; 82164410Ssyrinx tx->tx_anyobj = TRUE; 83164410Ssyrinx 84164410Ssyrinx return (tx); 85164410Ssyrinx} 86164410Ssyrinx 87164410Ssyrinxint 88164410Ssyrinxdmu_tx_is_syncing(dmu_tx_t *tx) 89164410Ssyrinx{ 90164410Ssyrinx return (tx->tx_anyobj); 91164410Ssyrinx} 92164410Ssyrinx 93164410Ssyrinxint 94164410Ssyrinxdmu_tx_private_ok(dmu_tx_t *tx) 95164410Ssyrinx{ 96164410Ssyrinx return (tx->tx_anyobj); 97164410Ssyrinx} 98164410Ssyrinx 99164410Ssyrinxstatic dmu_tx_hold_t * 100164410Ssyrinxdmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, 101164410Ssyrinx enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) 102164410Ssyrinx{ 103164410Ssyrinx dmu_tx_hold_t *txh; 104164410Ssyrinx dnode_t *dn = NULL; 105164410Ssyrinx int err; 106164410Ssyrinx 107164410Ssyrinx if (object != DMU_NEW_OBJECT) { 108164410Ssyrinx err = dnode_hold(os, object, tx, &dn); 109164410Ssyrinx if (err) { 110164410Ssyrinx tx->tx_err = err; 111164410Ssyrinx return (NULL); 112164410Ssyrinx } 113164410Ssyrinx 114164410Ssyrinx if (err == 0 && tx->tx_txg != 0) { 115164410Ssyrinx mutex_enter(&dn->dn_mtx); 116164410Ssyrinx /* 117164410Ssyrinx * dn->dn_assigned_txg == tx->tx_txg doesn't pose a 118164410Ssyrinx * problem, but there's no way for it to happen (for 119164410Ssyrinx * now, at least). 
120164410Ssyrinx */ 121164410Ssyrinx ASSERT(dn->dn_assigned_txg == 0); 122164410Ssyrinx dn->dn_assigned_txg = tx->tx_txg; 123164410Ssyrinx (void) refcount_add(&dn->dn_tx_holds, tx); 124164410Ssyrinx mutex_exit(&dn->dn_mtx); 125164410Ssyrinx } 126164410Ssyrinx } 127164410Ssyrinx 128164410Ssyrinx txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); 129164410Ssyrinx txh->txh_tx = tx; 130164410Ssyrinx txh->txh_dnode = dn; 131164410Ssyrinx#ifdef ZFS_DEBUG 132164410Ssyrinx txh->txh_type = type; 133164410Ssyrinx txh->txh_arg1 = arg1; 134164410Ssyrinx txh->txh_arg2 = arg2; 135164410Ssyrinx#endif 136164410Ssyrinx list_insert_tail(&tx->tx_holds, txh); 137164410Ssyrinx 138164410Ssyrinx return (txh); 139164410Ssyrinx} 140164410Ssyrinx 141164410Ssyrinxvoid 142164410Ssyrinxdmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) 143164410Ssyrinx{ 144164410Ssyrinx /* 145164410Ssyrinx * If we're syncing, they can manipulate any object anyhow, and 146164410Ssyrinx * the hold on the dnode_t can cause problems. 
147164410Ssyrinx */ 148164410Ssyrinx if (!dmu_tx_is_syncing(tx)) { 149164410Ssyrinx (void) dmu_tx_hold_object_impl(tx, os, 150164410Ssyrinx object, THT_NEWOBJECT, 0, 0); 151164410Ssyrinx } 152164410Ssyrinx} 153164410Ssyrinx 154164410Ssyrinxstatic int 155164410Ssyrinxdmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) 156164410Ssyrinx{ 157164410Ssyrinx int err; 158164410Ssyrinx dmu_buf_impl_t *db; 159164410Ssyrinx 160164410Ssyrinx rw_enter(&dn->dn_struct_rwlock, RW_READER); 161164410Ssyrinx db = dbuf_hold_level(dn, level, blkid, FTAG); 162164410Ssyrinx rw_exit(&dn->dn_struct_rwlock); 163164410Ssyrinx if (db == NULL) 164164410Ssyrinx return (SET_ERROR(EIO)); 165164410Ssyrinx err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); 166164410Ssyrinx dbuf_rele(db, FTAG); 167164410Ssyrinx return (err); 168164410Ssyrinx} 169164410Ssyrinx 170164410Ssyrinxstatic void 171164410Ssyrinxdmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, 172164410Ssyrinx int level, uint64_t blkid, boolean_t freeable, uint64_t *history) 173164410Ssyrinx{ 174164410Ssyrinx objset_t *os = dn->dn_objset; 175164410Ssyrinx dsl_dataset_t *ds = os->os_dsl_dataset; 176164410Ssyrinx int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 177164410Ssyrinx dmu_buf_impl_t *parent = NULL; 178164410Ssyrinx blkptr_t *bp = NULL; 179164410Ssyrinx uint64_t space; 180164410Ssyrinx 181164410Ssyrinx if (level >= dn->dn_nlevels || history[level] == blkid) 182164410Ssyrinx return; 183164410Ssyrinx 184164410Ssyrinx history[level] = blkid; 185164410Ssyrinx 186164410Ssyrinx space = (level == 0) ? 
dn->dn_datablksz : (1ULL << dn->dn_indblkshift); 187164410Ssyrinx 188164410Ssyrinx if (db == NULL || db == dn->dn_dbuf) { 189164410Ssyrinx ASSERT(level != 0); 190164410Ssyrinx db = NULL; 191164410Ssyrinx } else { 192164410Ssyrinx ASSERT(DB_DNODE(db) == dn); 193164410Ssyrinx ASSERT(db->db_level == level); 194164410Ssyrinx ASSERT(db->db.db_size == space); 195164410Ssyrinx ASSERT(db->db_blkid == blkid); 196164410Ssyrinx bp = db->db_blkptr; 197164410Ssyrinx parent = db->db_parent; 198164410Ssyrinx } 199164410Ssyrinx 200164410Ssyrinx freeable = (bp && (freeable || 201164410Ssyrinx dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); 202164410Ssyrinx 203164410Ssyrinx if (freeable) 204164410Ssyrinx txh->txh_space_tooverwrite += space; 205164410Ssyrinx else 206164410Ssyrinx txh->txh_space_towrite += space; 207164410Ssyrinx if (bp) 208164410Ssyrinx txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); 209164410Ssyrinx 210164410Ssyrinx dmu_tx_count_twig(txh, dn, parent, level + 1, 211164410Ssyrinx blkid >> epbs, freeable, history); 212164410Ssyrinx} 213164410Ssyrinx 214164410Ssyrinx/* ARGSUSED */ 215164410Ssyrinxstatic void 216164410Ssyrinxdmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 217164410Ssyrinx{ 218164410Ssyrinx dnode_t *dn = txh->txh_dnode; 219164410Ssyrinx uint64_t start, end, i; 220164410Ssyrinx int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; 221164410Ssyrinx int err = 0; 222164410Ssyrinx 223164410Ssyrinx if (len == 0) 224164410Ssyrinx return; 225164410Ssyrinx 226164410Ssyrinx min_bs = SPA_MINBLOCKSHIFT; 227164410Ssyrinx max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; 228164410Ssyrinx min_ibs = DN_MIN_INDBLKSHIFT; 229164410Ssyrinx max_ibs = DN_MAX_INDBLKSHIFT; 230164410Ssyrinx 231164410Ssyrinx if (dn) { 232164410Ssyrinx uint64_t history[DN_MAX_LEVELS]; 233164410Ssyrinx int nlvls = dn->dn_nlevels; 234164410Ssyrinx int delta; 235164410Ssyrinx 236164410Ssyrinx /* 237164410Ssyrinx * For i/o error checking, read the first and last 
level-0 238164410Ssyrinx * blocks (if they are not aligned), and all the level-1 blocks. 239164410Ssyrinx */ 240164410Ssyrinx if (dn->dn_maxblkid == 0) { 241164410Ssyrinx delta = dn->dn_datablksz; 242164410Ssyrinx start = (off < dn->dn_datablksz) ? 0 : 1; 243164410Ssyrinx end = (off+len <= dn->dn_datablksz) ? 0 : 1; 244164410Ssyrinx if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { 245164410Ssyrinx err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 246164410Ssyrinx if (err) 247164410Ssyrinx goto out; 248164410Ssyrinx delta -= off; 249164410Ssyrinx } 250164410Ssyrinx } else { 251164410Ssyrinx zio_t *zio = zio_root(dn->dn_objset->os_spa, 252164410Ssyrinx NULL, NULL, ZIO_FLAG_CANFAIL); 253164410Ssyrinx 254164410Ssyrinx /* first level-0 block */ 255164410Ssyrinx start = off >> dn->dn_datablkshift; 256164410Ssyrinx if (P2PHASE(off, dn->dn_datablksz) || 257164410Ssyrinx len < dn->dn_datablksz) { 258164410Ssyrinx err = dmu_tx_check_ioerr(zio, dn, 0, start); 259164410Ssyrinx if (err) 260164410Ssyrinx goto out; 261164410Ssyrinx } 262164410Ssyrinx 263164410Ssyrinx /* last level-0 block */ 264164410Ssyrinx end = (off+len-1) >> dn->dn_datablkshift; 265164410Ssyrinx if (end != start && end <= dn->dn_maxblkid && 266164410Ssyrinx P2PHASE(off+len, dn->dn_datablksz)) { 267164410Ssyrinx err = dmu_tx_check_ioerr(zio, dn, 0, end); 268164410Ssyrinx if (err) 269164410Ssyrinx goto out; 270164410Ssyrinx } 271164410Ssyrinx 272164410Ssyrinx /* level-1 blocks */ 273164410Ssyrinx if (nlvls > 1) { 274164410Ssyrinx int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 275164410Ssyrinx for (i = (start>>shft)+1; i < end>>shft; i++) { 276164410Ssyrinx err = dmu_tx_check_ioerr(zio, dn, 1, i); 277164410Ssyrinx if (err) 278164410Ssyrinx goto out; 279164410Ssyrinx } 280164410Ssyrinx } 281164410Ssyrinx 282164410Ssyrinx err = zio_wait(zio); 283164410Ssyrinx if (err) 284164410Ssyrinx goto out; 285164410Ssyrinx delta = P2NPHASE(off, dn->dn_datablksz); 286164410Ssyrinx } 287164410Ssyrinx 288164410Ssyrinx min_ibs 
= max_ibs = dn->dn_indblkshift; 289164410Ssyrinx if (dn->dn_maxblkid > 0) { 290164410Ssyrinx /* 291164410Ssyrinx * The blocksize can't change, 292164410Ssyrinx * so we can make a more precise estimate. 293164410Ssyrinx */ 294164410Ssyrinx ASSERT(dn->dn_datablkshift != 0); 295164410Ssyrinx min_bs = max_bs = dn->dn_datablkshift; 296164410Ssyrinx } else { 297164410Ssyrinx /* 298164410Ssyrinx * The blocksize can increase up to the recordsize, 299164410Ssyrinx * or if it is already more than the recordsize, 300164410Ssyrinx * up to the next power of 2. 301164410Ssyrinx */ 302164410Ssyrinx min_bs = highbit64(dn->dn_datablksz - 1); 303164410Ssyrinx max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1)); 304164410Ssyrinx } 305164410Ssyrinx 306164410Ssyrinx /* 307164410Ssyrinx * If this write is not off the end of the file 308164410Ssyrinx * we need to account for overwrites/unref. 309164410Ssyrinx */ 310164410Ssyrinx if (start <= dn->dn_maxblkid) { 311164410Ssyrinx for (int l = 0; l < DN_MAX_LEVELS; l++) 312164410Ssyrinx history[l] = -1ULL; 313164410Ssyrinx } 314164410Ssyrinx while (start <= dn->dn_maxblkid) { 315164410Ssyrinx dmu_buf_impl_t *db; 316164410Ssyrinx 317164410Ssyrinx rw_enter(&dn->dn_struct_rwlock, RW_READER); 318164410Ssyrinx err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); 319164410Ssyrinx rw_exit(&dn->dn_struct_rwlock); 320164410Ssyrinx 321164410Ssyrinx if (err) { 322164410Ssyrinx txh->txh_tx->tx_err = err; 323164410Ssyrinx return; 324164410Ssyrinx } 325164410Ssyrinx 326164410Ssyrinx dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, 327164410Ssyrinx history); 328164410Ssyrinx dbuf_rele(db, FTAG); 329164410Ssyrinx if (++start > end) { 330164410Ssyrinx /* 331164410Ssyrinx * Account for new indirects appearing 332164410Ssyrinx * before this IO gets assigned into a txg. 
333164410Ssyrinx */ 334164410Ssyrinx bits = 64 - min_bs; 335164410Ssyrinx epbs = min_ibs - SPA_BLKPTRSHIFT; 336164410Ssyrinx for (bits -= epbs * (nlvls - 1); 337164410Ssyrinx bits >= 0; bits -= epbs) 338164410Ssyrinx txh->txh_fudge += 1ULL << max_ibs; 339164410Ssyrinx goto out; 340164410Ssyrinx } 341164410Ssyrinx off += delta; 342164410Ssyrinx if (len >= delta) 343164410Ssyrinx len -= delta; 344164410Ssyrinx delta = dn->dn_datablksz; 345164410Ssyrinx } 346164410Ssyrinx } 347164410Ssyrinx 348164410Ssyrinx /* 349164410Ssyrinx * 'end' is the last thing we will access, not one past. 350164410Ssyrinx * This way we won't overflow when accessing the last byte. 351164410Ssyrinx */ 352164410Ssyrinx start = P2ALIGN(off, 1ULL << max_bs); 353164410Ssyrinx end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; 354164410Ssyrinx txh->txh_space_towrite += end - start + 1; 355164410Ssyrinx 356164410Ssyrinx start >>= min_bs; 357164410Ssyrinx end >>= min_bs; 358164410Ssyrinx 359164410Ssyrinx epbs = min_ibs - SPA_BLKPTRSHIFT; 360164410Ssyrinx 361164410Ssyrinx /* 362164410Ssyrinx * The object contains at most 2^(64 - min_bs) blocks, 363164410Ssyrinx * and each indirect level maps 2^epbs. 364164410Ssyrinx */ 365164410Ssyrinx for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { 366164410Ssyrinx start >>= epbs; 367164410Ssyrinx end >>= epbs; 368164410Ssyrinx ASSERT3U(end, >=, start); 369164410Ssyrinx txh->txh_space_towrite += (end - start + 1) << max_ibs; 370164410Ssyrinx if (start != 0) { 371164410Ssyrinx /* 372164410Ssyrinx * We also need a new blkid=0 indirect block 373164410Ssyrinx * to reference any existing file data. 
374164410Ssyrinx */ 375164410Ssyrinx txh->txh_space_towrite += 1ULL << max_ibs; 376164410Ssyrinx } 377164410Ssyrinx } 378164410Ssyrinx 379164410Ssyrinxout: 380164410Ssyrinx if (txh->txh_space_towrite + txh->txh_space_tooverwrite > 381164410Ssyrinx 2 * DMU_MAX_ACCESS) 382164410Ssyrinx err = SET_ERROR(EFBIG); 383164410Ssyrinx 384164410Ssyrinx if (err) 385164410Ssyrinx txh->txh_tx->tx_err = err; 386164410Ssyrinx} 387164410Ssyrinx 388164410Ssyrinxstatic void 389164410Ssyrinxdmu_tx_count_dnode(dmu_tx_hold_t *txh) 390164410Ssyrinx{ 391164410Ssyrinx dnode_t *dn = txh->txh_dnode; 392164410Ssyrinx dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); 393164410Ssyrinx uint64_t space = mdn->dn_datablksz + 394164410Ssyrinx ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); 395164410Ssyrinx 396164410Ssyrinx if (dn && dn->dn_dbuf->db_blkptr && 397164410Ssyrinx dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 398164410Ssyrinx dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { 399164410Ssyrinx txh->txh_space_tooverwrite += space; 400164410Ssyrinx txh->txh_space_tounref += space; 401164410Ssyrinx } else { 402164410Ssyrinx txh->txh_space_towrite += space; 403164410Ssyrinx if (dn && dn->dn_dbuf->db_blkptr) 404164410Ssyrinx txh->txh_space_tounref += space; 405164410Ssyrinx } 406164410Ssyrinx} 407164410Ssyrinx 408164410Ssyrinxvoid 409164410Ssyrinxdmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) 410164410Ssyrinx{ 411164410Ssyrinx dmu_tx_hold_t *txh; 412164410Ssyrinx 413164410Ssyrinx ASSERT(tx->tx_txg == 0); 414164410Ssyrinx ASSERT(len < DMU_MAX_ACCESS); 415164410Ssyrinx ASSERT(len == 0 || UINT64_MAX - off >= len - 1); 416164410Ssyrinx 417164410Ssyrinx txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 418164410Ssyrinx object, THT_WRITE, off, len); 419164410Ssyrinx if (txh == NULL) 420164410Ssyrinx return; 421164410Ssyrinx 422164410Ssyrinx dmu_tx_count_write(txh, off, len); 423164410Ssyrinx dmu_tx_count_dnode(txh); 424164410Ssyrinx} 425164410Ssyrinx 
426164410Ssyrinxstatic void 427164410Ssyrinxdmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 428164410Ssyrinx{ 429164410Ssyrinx uint64_t blkid, nblks, lastblk; 430164410Ssyrinx uint64_t space = 0, unref = 0, skipped = 0; 431164410Ssyrinx dnode_t *dn = txh->txh_dnode; 432164410Ssyrinx dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 433164410Ssyrinx spa_t *spa = txh->txh_tx->tx_pool->dp_spa; 434164410Ssyrinx int epbs; 435164410Ssyrinx uint64_t l0span = 0, nl1blks = 0; 436164410Ssyrinx 437164410Ssyrinx if (dn->dn_nlevels == 0) 438164410Ssyrinx return; 439164410Ssyrinx 440164410Ssyrinx /* 441164410Ssyrinx * The struct_rwlock protects us against dn_nlevels 442164410Ssyrinx * changing, in case (against all odds) we manage to dirty & 443164410Ssyrinx * sync out the changes after we check for being dirty. 444164410Ssyrinx * Also, dbuf_hold_impl() wants us to have the struct_rwlock. 445164410Ssyrinx */ 446164410Ssyrinx rw_enter(&dn->dn_struct_rwlock, RW_READER); 447164410Ssyrinx epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 448164410Ssyrinx if (dn->dn_maxblkid == 0) { 449164410Ssyrinx if (off == 0 && len >= dn->dn_datablksz) { 450164410Ssyrinx blkid = 0; 451164410Ssyrinx nblks = 1; 452164410Ssyrinx } else { 453164410Ssyrinx rw_exit(&dn->dn_struct_rwlock); 454164410Ssyrinx return; 455164410Ssyrinx } 456164410Ssyrinx } else { 457164410Ssyrinx blkid = off >> dn->dn_datablkshift; 458164410Ssyrinx nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; 459164410Ssyrinx 460164410Ssyrinx if (blkid > dn->dn_maxblkid) { 461164410Ssyrinx rw_exit(&dn->dn_struct_rwlock); 462164410Ssyrinx return; 463164410Ssyrinx } 464164410Ssyrinx if (blkid + nblks > dn->dn_maxblkid) 465164410Ssyrinx nblks = dn->dn_maxblkid - blkid + 1; 466164410Ssyrinx 467164410Ssyrinx } 468164410Ssyrinx l0span = nblks; /* save for later use to calc level > 1 overhead */ 469164410Ssyrinx if (dn->dn_nlevels == 1) { 470164410Ssyrinx int i; 471164410Ssyrinx for (i = 0; i < nblks; i++) { 
472164410Ssyrinx blkptr_t *bp = dn->dn_phys->dn_blkptr; 473164410Ssyrinx ASSERT3U(blkid + i, <, dn->dn_nblkptr); 474164410Ssyrinx bp += blkid + i; 475164410Ssyrinx if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { 476164410Ssyrinx dprintf_bp(bp, "can free old%s", ""); 477164410Ssyrinx space += bp_get_dsize(spa, bp); 478164410Ssyrinx } 479164410Ssyrinx unref += BP_GET_ASIZE(bp); 480164410Ssyrinx } 481164410Ssyrinx nl1blks = 1; 482164410Ssyrinx nblks = 0; 483164410Ssyrinx } 484164410Ssyrinx 485164410Ssyrinx lastblk = blkid + nblks - 1; 486164410Ssyrinx while (nblks) { 487164410Ssyrinx dmu_buf_impl_t *dbuf; 488164410Ssyrinx uint64_t ibyte, new_blkid; 489164410Ssyrinx int epb = 1 << epbs; 490164410Ssyrinx int err, i, blkoff, tochk; 491164410Ssyrinx blkptr_t *bp; 492164410Ssyrinx 493164410Ssyrinx ibyte = blkid << dn->dn_datablkshift; 494164410Ssyrinx err = dnode_next_offset(dn, 495164410Ssyrinx DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); 496164410Ssyrinx new_blkid = ibyte >> dn->dn_datablkshift; 497164410Ssyrinx if (err == ESRCH) { 498164410Ssyrinx skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; 499164410Ssyrinx break; 500164410Ssyrinx } 501164410Ssyrinx if (err) { 502164410Ssyrinx txh->txh_tx->tx_err = err; 503164410Ssyrinx break; 504164410Ssyrinx } 505164410Ssyrinx if (new_blkid > lastblk) { 506164410Ssyrinx skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; 507164410Ssyrinx break; 508164410Ssyrinx } 509164410Ssyrinx 510164410Ssyrinx if (new_blkid > blkid) { 511164410Ssyrinx ASSERT((new_blkid >> epbs) > (blkid >> epbs)); 512164410Ssyrinx skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; 513164410Ssyrinx nblks -= new_blkid - blkid; 514164410Ssyrinx blkid = new_blkid; 515164410Ssyrinx } 516164410Ssyrinx blkoff = P2PHASE(blkid, epb); 517164410Ssyrinx tochk = MIN(epb - blkoff, nblks); 518164410Ssyrinx 519164410Ssyrinx err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); 520164410Ssyrinx if (err) { 521164410Ssyrinx txh->txh_tx->tx_err = err; 
522164410Ssyrinx break; 523164410Ssyrinx } 524164410Ssyrinx 525164410Ssyrinx txh->txh_memory_tohold += dbuf->db.db_size; 526164410Ssyrinx 527164410Ssyrinx /* 528164410Ssyrinx * We don't check memory_tohold against DMU_MAX_ACCESS because 529164410Ssyrinx * memory_tohold is an over-estimation (especially the >L1 530164410Ssyrinx * indirect blocks), so it could fail. Callers should have 531164410Ssyrinx * already verified that they will not be holding too much 532164410Ssyrinx * memory. 533164410Ssyrinx */ 534164410Ssyrinx 535164410Ssyrinx err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); 536164410Ssyrinx if (err != 0) { 537164410Ssyrinx txh->txh_tx->tx_err = err; 538164410Ssyrinx dbuf_rele(dbuf, FTAG); 539164410Ssyrinx break; 540164410Ssyrinx } 541164410Ssyrinx 542164410Ssyrinx bp = dbuf->db.db_data; 543164410Ssyrinx bp += blkoff; 544164410Ssyrinx 545164410Ssyrinx for (i = 0; i < tochk; i++) { 546164410Ssyrinx if (dsl_dataset_block_freeable(ds, &bp[i], 547164410Ssyrinx bp[i].blk_birth)) { 548164410Ssyrinx dprintf_bp(&bp[i], "can free old%s", ""); 549164410Ssyrinx space += bp_get_dsize(spa, &bp[i]); 550164410Ssyrinx } 551164410Ssyrinx unref += BP_GET_ASIZE(bp); 552164410Ssyrinx } 553164410Ssyrinx dbuf_rele(dbuf, FTAG); 554164410Ssyrinx 555164410Ssyrinx ++nl1blks; 556164410Ssyrinx blkid += tochk; 557164410Ssyrinx nblks -= tochk; 558164410Ssyrinx } 559164410Ssyrinx rw_exit(&dn->dn_struct_rwlock); 560164410Ssyrinx 561164410Ssyrinx /* 562164410Ssyrinx * Add in memory requirements of higher-level indirects. 563164410Ssyrinx * This assumes a worst-possible scenario for dn_nlevels and a 564164410Ssyrinx * worst-possible distribution of l1-blocks over the region to free. 565164410Ssyrinx */ 566164410Ssyrinx { 567164410Ssyrinx uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); 568164410Ssyrinx int level = 2; 569164410Ssyrinx /* 570164410Ssyrinx * Here we don't use DN_MAX_LEVEL, but calculate it with the 571164410Ssyrinx * given datablkshift and indblkshift. 
This makes the 572164410Ssyrinx * difference between 19 and 8 on large files. 573164410Ssyrinx */ 574164410Ssyrinx int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / 575164410Ssyrinx (dn->dn_indblkshift - SPA_BLKPTRSHIFT); 576164410Ssyrinx 577164410Ssyrinx while (level++ < maxlevel) { 578164410Ssyrinx txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1) 579164410Ssyrinx << dn->dn_indblkshift; 580164410Ssyrinx blkcnt = 1 + (blkcnt >> epbs); 581164410Ssyrinx } 582164410Ssyrinx } 583164410Ssyrinx 584164410Ssyrinx /* account for new level 1 indirect blocks that might show up */ 585164410Ssyrinx if (skipped > 0) { 586164410Ssyrinx txh->txh_fudge += skipped << dn->dn_indblkshift; 587164410Ssyrinx skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); 588164410Ssyrinx txh->txh_memory_tohold += skipped << dn->dn_indblkshift; 589164410Ssyrinx } 590164410Ssyrinx txh->txh_space_tofree += space; 591164410Ssyrinx txh->txh_space_tounref += unref; 592164410Ssyrinx} 593164410Ssyrinx 594164410Ssyrinx/* 595164410Ssyrinx * This function marks the transaction as being a "net free". The end 596164410Ssyrinx * result is that refquotas will be disabled for this transaction, and 597164410Ssyrinx * this transaction will be able to use half of the pool space overhead 598164410Ssyrinx * (see dsl_pool_adjustedsize()). Therefore this function should only 599164410Ssyrinx * be called for transactions that we expect will not cause a net increase 600164410Ssyrinx * in the amount of space used (but it's OK if that is occasionally not true). 601164410Ssyrinx */ 602164410Ssyrinxvoid 603164410Ssyrinxdmu_tx_mark_netfree(dmu_tx_t *tx) 604164410Ssyrinx{ 605164410Ssyrinx dmu_tx_hold_t *txh; 606164410Ssyrinx 607164410Ssyrinx txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 608164410Ssyrinx DMU_NEW_OBJECT, THT_FREE, 0, 0); 609164410Ssyrinx 610164410Ssyrinx /* 611164410Ssyrinx * Pretend that this operation will free 1GB of space. 
This 612164410Ssyrinx * should be large enough to cancel out the largest write. 613164410Ssyrinx * We don't want to use something like UINT64_MAX, because that would 614164410Ssyrinx * cause overflows when doing math with these values (e.g. in 615164410Ssyrinx * dmu_tx_try_assign()). 616164410Ssyrinx */ 617164410Ssyrinx txh->txh_space_tofree = txh->txh_space_tounref = 1024 * 1024 * 1024; 618164410Ssyrinx} 619164410Ssyrinx 620164410Ssyrinxvoid 621164410Ssyrinxdmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) 622164410Ssyrinx{ 623164410Ssyrinx dmu_tx_hold_t *txh; 624164410Ssyrinx dnode_t *dn; 625164410Ssyrinx int err; 626164410Ssyrinx zio_t *zio; 627164410Ssyrinx 628164410Ssyrinx ASSERT(tx->tx_txg == 0); 629164410Ssyrinx 630164410Ssyrinx txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 631164410Ssyrinx object, THT_FREE, off, len); 632164410Ssyrinx if (txh == NULL) 633164410Ssyrinx return; 634164410Ssyrinx dn = txh->txh_dnode; 635164410Ssyrinx dmu_tx_count_dnode(txh); 636164410Ssyrinx 637164410Ssyrinx if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) 638164410Ssyrinx return; 639164410Ssyrinx if (len == DMU_OBJECT_END) 640164410Ssyrinx len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; 641164410Ssyrinx 642164410Ssyrinx 643164410Ssyrinx /* 644164410Ssyrinx * For i/o error checking, we read the first and last level-0 645164410Ssyrinx * blocks if they are not aligned, and all the level-1 blocks. 646164410Ssyrinx * 647164410Ssyrinx * Note: dbuf_free_range() assumes that we have not instantiated 648164410Ssyrinx * any level-0 dbufs that will be completely freed. Therefore we must 649164410Ssyrinx * exercise care to not read or count the first and last blocks 650164410Ssyrinx * if they are blocksize-aligned. 
651164410Ssyrinx */ 652164410Ssyrinx if (dn->dn_datablkshift == 0) { 653164410Ssyrinx if (off != 0 || len < dn->dn_datablksz) 654164410Ssyrinx dmu_tx_count_write(txh, 0, dn->dn_datablksz); 655164410Ssyrinx } else { 656164410Ssyrinx /* first block will be modified if it is not aligned */ 657164410Ssyrinx if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) 658164410Ssyrinx dmu_tx_count_write(txh, off, 1); 659164410Ssyrinx /* last block will be modified if it is not aligned */ 660164410Ssyrinx if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) 661164410Ssyrinx dmu_tx_count_write(txh, off+len, 1); 662164410Ssyrinx } 663164410Ssyrinx 664164410Ssyrinx /* 665164410Ssyrinx * Check level-1 blocks. 666164410Ssyrinx */ 667164410Ssyrinx if (dn->dn_nlevels > 1) { 668164410Ssyrinx int shift = dn->dn_datablkshift + dn->dn_indblkshift - 669164410Ssyrinx SPA_BLKPTRSHIFT; 670164410Ssyrinx uint64_t start = off >> shift; 671164410Ssyrinx uint64_t end = (off + len) >> shift; 672164410Ssyrinx 673164410Ssyrinx ASSERT(dn->dn_indblkshift != 0); 674164410Ssyrinx 675164410Ssyrinx /* 676164410Ssyrinx * dnode_reallocate() can result in an object with indirect 677164410Ssyrinx * blocks having an odd data block size. In this case, 678164410Ssyrinx * just check the single block. 
679164410Ssyrinx */ 680164410Ssyrinx if (dn->dn_datablkshift == 0) 681164410Ssyrinx start = end = 0; 682164410Ssyrinx 683164410Ssyrinx zio = zio_root(tx->tx_pool->dp_spa, 684164410Ssyrinx NULL, NULL, ZIO_FLAG_CANFAIL); 685164410Ssyrinx for (uint64_t i = start; i <= end; i++) { 686164410Ssyrinx uint64_t ibyte = i << shift; 687164410Ssyrinx err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); 688164410Ssyrinx i = ibyte >> shift; 689164410Ssyrinx if (err == ESRCH) 690164410Ssyrinx break; 691164410Ssyrinx if (err) { 692164410Ssyrinx tx->tx_err = err; 693164410Ssyrinx return; 694164410Ssyrinx } 695164410Ssyrinx 696164410Ssyrinx err = dmu_tx_check_ioerr(zio, dn, 1, i); 697164410Ssyrinx if (err) { 698164410Ssyrinx tx->tx_err = err; 699164410Ssyrinx return; 700164410Ssyrinx } 701164410Ssyrinx } 702164410Ssyrinx err = zio_wait(zio); 703164410Ssyrinx if (err) { 704164410Ssyrinx tx->tx_err = err; 705164410Ssyrinx return; 706164410Ssyrinx } 707164410Ssyrinx } 708164410Ssyrinx 709164410Ssyrinx dmu_tx_count_free(txh, off, len); 710164410Ssyrinx} 711164410Ssyrinx 712164410Ssyrinxvoid 713164410Ssyrinxdmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) 714164410Ssyrinx{ 715164410Ssyrinx dmu_tx_hold_t *txh; 716164410Ssyrinx dnode_t *dn; 717164410Ssyrinx uint64_t nblocks; 718164410Ssyrinx int epbs, err; 719164410Ssyrinx 720164410Ssyrinx ASSERT(tx->tx_txg == 0); 721164410Ssyrinx 722164410Ssyrinx txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 723164410Ssyrinx object, THT_ZAP, add, (uintptr_t)name); 724164410Ssyrinx if (txh == NULL) 725164410Ssyrinx return; 726164410Ssyrinx dn = txh->txh_dnode; 727164410Ssyrinx 728164410Ssyrinx dmu_tx_count_dnode(txh); 729164410Ssyrinx 730164410Ssyrinx if (dn == NULL) { 731164410Ssyrinx /* 732164410Ssyrinx * We will be able to fit a new object's entries into one leaf 733164410Ssyrinx * block. So there will be at most 2 blocks total, 734164410Ssyrinx * including the header block. 
735164410Ssyrinx */ 736164410Ssyrinx dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); 737164410Ssyrinx return; 738164410Ssyrinx } 739164410Ssyrinx 740164410Ssyrinx ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); 741164410Ssyrinx 742164410Ssyrinx if (dn->dn_maxblkid == 0 && !add) { 743164410Ssyrinx blkptr_t *bp; 744164410Ssyrinx 745164410Ssyrinx /* 746164410Ssyrinx * If there is only one block (i.e. this is a micro-zap) 747164410Ssyrinx * and we are not adding anything, the accounting is simple. 748164410Ssyrinx */ 749164410Ssyrinx err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 750164410Ssyrinx if (err) { 751164410Ssyrinx tx->tx_err = err; 752164410Ssyrinx return; 753164410Ssyrinx } 754164410Ssyrinx 755164410Ssyrinx /* 756164410Ssyrinx * Use max block size here, since we don't know how much 757164410Ssyrinx * the size will change between now and the dbuf dirty call. 758164410Ssyrinx */ 759164410Ssyrinx bp = &dn->dn_phys->dn_blkptr[0]; 760164410Ssyrinx if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 761164410Ssyrinx bp, bp->blk_birth)) 762164410Ssyrinx txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ; 763164410Ssyrinx else 764164410Ssyrinx txh->txh_space_towrite += MZAP_MAX_BLKSZ; 765164410Ssyrinx if (!BP_IS_HOLE(bp)) 766164410Ssyrinx txh->txh_space_tounref += MZAP_MAX_BLKSZ; 767164410Ssyrinx return; 768164410Ssyrinx } 769164410Ssyrinx 770164410Ssyrinx if (dn->dn_maxblkid > 0 && name) { 771164410Ssyrinx /* 772164410Ssyrinx * access the name in this fat-zap so that we'll check 773164410Ssyrinx * for i/o errors to the leaf blocks, etc. 
774164410Ssyrinx */ 775164410Ssyrinx err = zap_lookup(dn->dn_objset, dn->dn_object, name, 776164410Ssyrinx 8, 0, NULL); 777164410Ssyrinx if (err == EIO) { 778164410Ssyrinx tx->tx_err = err; 779164410Ssyrinx return; 780164410Ssyrinx } 781164410Ssyrinx } 782164410Ssyrinx 783164410Ssyrinx err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, 784164410Ssyrinx &txh->txh_space_towrite, &txh->txh_space_tooverwrite); 785164410Ssyrinx 786164410Ssyrinx /* 787164410Ssyrinx * If the modified blocks are scattered to the four winds, 788164410Ssyrinx * we'll have to modify an indirect twig for each. 789164410Ssyrinx */ 790164410Ssyrinx epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 791164410Ssyrinx for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) 792164410Ssyrinx if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) 793164410Ssyrinx txh->txh_space_towrite += 3 << dn->dn_indblkshift; 794164410Ssyrinx else 795164410Ssyrinx txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; 796164410Ssyrinx} 797164410Ssyrinx 798164410Ssyrinxvoid 799164410Ssyrinxdmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 800164410Ssyrinx{ 801164410Ssyrinx dmu_tx_hold_t *txh; 802164410Ssyrinx 803164410Ssyrinx ASSERT(tx->tx_txg == 0); 804164410Ssyrinx 805164410Ssyrinx txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 806164410Ssyrinx object, THT_BONUS, 0, 0); 807164410Ssyrinx if (txh) 808164410Ssyrinx dmu_tx_count_dnode(txh); 809164410Ssyrinx} 810164410Ssyrinx 811164410Ssyrinxvoid 812164410Ssyrinxdmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 813164410Ssyrinx{ 814164410Ssyrinx dmu_tx_hold_t *txh; 815164410Ssyrinx ASSERT(tx->tx_txg == 0); 816164410Ssyrinx 817164410Ssyrinx txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 818164410Ssyrinx DMU_NEW_OBJECT, THT_SPACE, space, 0); 819164410Ssyrinx 820164410Ssyrinx txh->txh_space_towrite += space; 821164410Ssyrinx} 822164410Ssyrinx 823164410Ssyrinxint 824164410Ssyrinxdmu_tx_holds(dmu_tx_t *tx, uint64_t object) 825164410Ssyrinx{ 
826164410Ssyrinx dmu_tx_hold_t *txh; 827164410Ssyrinx int holds = 0; 828164410Ssyrinx 829164410Ssyrinx /* 830164410Ssyrinx * By asserting that the tx is assigned, we're counting the 831164410Ssyrinx * number of dn_tx_holds, which is the same as the number of 832164410Ssyrinx * dn_holds. Otherwise, we'd be counting dn_holds, but 833164410Ssyrinx * dn_tx_holds could be 0. 834164410Ssyrinx */ 835164410Ssyrinx ASSERT(tx->tx_txg != 0); 836164410Ssyrinx 837164410Ssyrinx /* if (tx->tx_anyobj == TRUE) */ 838164410Ssyrinx /* return (0); */ 839164410Ssyrinx 840164410Ssyrinx for (txh = list_head(&tx->tx_holds); txh; 841164410Ssyrinx txh = list_next(&tx->tx_holds, txh)) { 842164410Ssyrinx if (txh->txh_dnode && txh->txh_dnode->dn_object == object) 843164410Ssyrinx holds++; 844164410Ssyrinx } 845164410Ssyrinx 846164410Ssyrinx return (holds); 847164410Ssyrinx} 848164410Ssyrinx 849164410Ssyrinx#ifdef ZFS_DEBUG 850164410Ssyrinxvoid 851164410Ssyrinxdmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) 852164410Ssyrinx{ 853164410Ssyrinx dmu_tx_hold_t *txh; 854164410Ssyrinx int match_object = FALSE, match_offset = FALSE; 855164410Ssyrinx dnode_t *dn; 856164410Ssyrinx 857164410Ssyrinx DB_DNODE_ENTER(db); 858164410Ssyrinx dn = DB_DNODE(db); 859164410Ssyrinx ASSERT(tx->tx_txg != 0); 860164410Ssyrinx ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); 861164410Ssyrinx ASSERT3U(dn->dn_object, ==, db->db.db_object); 862164410Ssyrinx 863164410Ssyrinx if (tx->tx_anyobj) { 864164410Ssyrinx DB_DNODE_EXIT(db); 865164410Ssyrinx return; 866164410Ssyrinx } 867164410Ssyrinx 868164410Ssyrinx /* XXX No checking on the meta dnode for now */ 869164410Ssyrinx if (db->db.db_object == DMU_META_DNODE_OBJECT) { 870164410Ssyrinx DB_DNODE_EXIT(db); 871164410Ssyrinx return; 872164410Ssyrinx } 873164410Ssyrinx 874164410Ssyrinx for (txh = list_head(&tx->tx_holds); txh; 875164410Ssyrinx txh = list_next(&tx->tx_holds, txh)) { 876164410Ssyrinx ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); 
877164410Ssyrinx if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) 878164410Ssyrinx match_object = TRUE; 879164410Ssyrinx if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { 880164410Ssyrinx int datablkshift = dn->dn_datablkshift ? 881164410Ssyrinx dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; 882164410Ssyrinx int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 883164410Ssyrinx int shift = datablkshift + epbs * db->db_level; 884164410Ssyrinx uint64_t beginblk = shift >= 64 ? 0 : 885164410Ssyrinx (txh->txh_arg1 >> shift); 886164410Ssyrinx uint64_t endblk = shift >= 64 ? 0 : 887164410Ssyrinx ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); 888164410Ssyrinx uint64_t blkid = db->db_blkid; 889164410Ssyrinx 890164410Ssyrinx /* XXX txh_arg2 better not be zero... */ 891164410Ssyrinx 892164410Ssyrinx dprintf("found txh type %x beginblk=%llx endblk=%llx\n", 893164410Ssyrinx txh->txh_type, beginblk, endblk); 894164410Ssyrinx 895164410Ssyrinx switch (txh->txh_type) { 896164410Ssyrinx case THT_WRITE: 897164410Ssyrinx if (blkid >= beginblk && blkid <= endblk) 898164410Ssyrinx match_offset = TRUE; 899164410Ssyrinx /* 900164410Ssyrinx * We will let this hold work for the bonus 901164410Ssyrinx * or spill buffer so that we don't need to 902164410Ssyrinx * hold it when creating a new object. 903164410Ssyrinx */ 904164410Ssyrinx if (blkid == DMU_BONUS_BLKID || 905164410Ssyrinx blkid == DMU_SPILL_BLKID) 906164410Ssyrinx match_offset = TRUE; 907164410Ssyrinx /* 908164410Ssyrinx * They might have to increase nlevels, 909164410Ssyrinx * thus dirtying the new TLIBs. Or the 910164410Ssyrinx * might have to change the block size, 911164410Ssyrinx * thus dirying the new lvl=0 blk=0. 
912164410Ssyrinx */ 913164410Ssyrinx if (blkid == 0) 914164410Ssyrinx match_offset = TRUE; 915164410Ssyrinx break; 916164410Ssyrinx case THT_FREE: 917164410Ssyrinx /* 918164410Ssyrinx * We will dirty all the level 1 blocks in 919164410Ssyrinx * the free range and perhaps the first and 920164410Ssyrinx * last level 0 block. 921164410Ssyrinx */ 922164410Ssyrinx if (blkid >= beginblk && (blkid <= endblk || 923164410Ssyrinx txh->txh_arg2 == DMU_OBJECT_END)) 924164410Ssyrinx match_offset = TRUE; 925164410Ssyrinx break; 926164410Ssyrinx case THT_SPILL: 927164410Ssyrinx if (blkid == DMU_SPILL_BLKID) 928164410Ssyrinx match_offset = TRUE; 929164410Ssyrinx break; 930164410Ssyrinx case THT_BONUS: 931164410Ssyrinx if (blkid == DMU_BONUS_BLKID) 932164410Ssyrinx match_offset = TRUE; 933164410Ssyrinx break; 934164410Ssyrinx case THT_ZAP: 935164410Ssyrinx match_offset = TRUE; 936164410Ssyrinx break; 937164410Ssyrinx case THT_NEWOBJECT: 938164410Ssyrinx match_object = TRUE; 939164410Ssyrinx break; 940164410Ssyrinx default: 941164410Ssyrinx ASSERT(!"bad txh_type"); 942164410Ssyrinx } 943164410Ssyrinx } 944164410Ssyrinx if (match_object && match_offset) { 945164410Ssyrinx DB_DNODE_EXIT(db); 946164410Ssyrinx return; 947164410Ssyrinx } 948164410Ssyrinx } 949164410Ssyrinx DB_DNODE_EXIT(db); 950164410Ssyrinx panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", 951164410Ssyrinx (u_longlong_t)db->db.db_object, db->db_level, 952164410Ssyrinx (u_longlong_t)db->db_blkid); 953164410Ssyrinx} 954164410Ssyrinx#endif 955164410Ssyrinx 956164410Ssyrinx/* 957164410Ssyrinx * If we can't do 10 iops, something is wrong. Let us go ahead 958164410Ssyrinx * and hit zfs_dirty_data_max. 
959164410Ssyrinx */ 960164410Ssyrinxhrtime_t zfs_delay_max_ns = MSEC2NSEC(100); 961164410Ssyrinxint zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ 962164410Ssyrinx 963164410Ssyrinx/* 964164410Ssyrinx * We delay transactions when we've determined that the backend storage 965164410Ssyrinx * isn't able to accommodate the rate of incoming writes. 966164410Ssyrinx * 967164410Ssyrinx * If there is already a transaction waiting, we delay relative to when 968164410Ssyrinx * that transaction finishes waiting. This way the calculated min_time 969164410Ssyrinx * is independent of the number of threads concurrently executing 970164410Ssyrinx * transactions. 971164410Ssyrinx * 972164410Ssyrinx * If we are the only waiter, wait relative to when the transaction 973164410Ssyrinx * started, rather than the current time. This credits the transaction for 974164410Ssyrinx * "time already served", e.g. reading indirect blocks. 975164410Ssyrinx * 976164410Ssyrinx * The minimum time for a transaction to take is calculated as: 977164410Ssyrinx * min_time = scale * (dirty - min) / (max - dirty) 978164410Ssyrinx * min_time is then capped at zfs_delay_max_ns. 979164410Ssyrinx * 980164410Ssyrinx * The delay has two degrees of freedom that can be adjusted via tunables. 981164410Ssyrinx * The percentage of dirty data at which we start to delay is defined by 982164410Ssyrinx * zfs_delay_min_dirty_percent. This should typically be at or above 983164410Ssyrinx * zfs_vdev_async_write_active_max_dirty_percent so that we only start to 984164410Ssyrinx * delay after writing at full speed has failed to keep up with the incoming 985164410Ssyrinx * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly 986164410Ssyrinx * speaking, this variable determines the amount of delay at the midpoint of 987164410Ssyrinx * the curve. 
988164410Ssyrinx * 989164410Ssyrinx * delay 990164410Ssyrinx * 10ms +-------------------------------------------------------------*+ 991164410Ssyrinx * | *| 992164410Ssyrinx * 9ms + *+ 993164410Ssyrinx * | *| 994164410Ssyrinx * 8ms + *+ 995164410Ssyrinx * | * | 996164410Ssyrinx * 7ms + * + 997164410Ssyrinx * | * | 998164410Ssyrinx * 6ms + * + 999164410Ssyrinx * | * | 1000164410Ssyrinx * 5ms + * + 1001164410Ssyrinx * | * | 1002164410Ssyrinx * 4ms + * + 1003164410Ssyrinx * | * | 1004164410Ssyrinx * 3ms + * + 1005164410Ssyrinx * | * | 1006164410Ssyrinx * 2ms + (midpoint) * + 1007164410Ssyrinx * | | ** | 1008164410Ssyrinx * 1ms + v *** + 1009164410Ssyrinx * | zfs_delay_scale ----------> ******** | 1010164410Ssyrinx * 0 +-------------------------------------*********----------------+ 1011164410Ssyrinx * 0% <- zfs_dirty_data_max -> 100% 1012164410Ssyrinx * 1013164410Ssyrinx * Note that since the delay is added to the outstanding time remaining on the 1014164410Ssyrinx * most recent transaction, the delay is effectively the inverse of IOPS. 1015164410Ssyrinx * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve 1016164410Ssyrinx * was chosen such that small changes in the amount of accumulated dirty data 1017164410Ssyrinx * in the first 3/4 of the curve yield relatively small differences in the 1018164410Ssyrinx * amount of delay. 
1019164410Ssyrinx * 1020164410Ssyrinx * The effects can be easier to understand when the amount of delay is 1021164410Ssyrinx * represented on a log scale: 1022164410Ssyrinx * 1023164410Ssyrinx * delay 1024164410Ssyrinx * 100ms +-------------------------------------------------------------++ 1025164410Ssyrinx * + + 1026164410Ssyrinx * | | 1027164410Ssyrinx * + *+ 1028164410Ssyrinx * 10ms + *+ 1029164410Ssyrinx * + ** + 1030164410Ssyrinx * | (midpoint) ** | 1031164410Ssyrinx * + | ** + 1032164410Ssyrinx * 1ms + v **** + 1033164410Ssyrinx * + zfs_delay_scale ----------> ***** + 1034164410Ssyrinx * | **** | 1035164410Ssyrinx * + **** + 1036164410Ssyrinx * 100us + ** + 1037164410Ssyrinx * + * + 1038164410Ssyrinx * | * | 1039164410Ssyrinx * + * + 1040164410Ssyrinx * 10us + * + 1041164410Ssyrinx * + + 1042164410Ssyrinx * | | 1043164410Ssyrinx * + + 1044164410Ssyrinx * +--------------------------------------------------------------+ 1045164410Ssyrinx * 0% <- zfs_dirty_data_max -> 100% 1046164410Ssyrinx * 1047164410Ssyrinx * Note here that only as the amount of dirty data approaches its limit does 1048164410Ssyrinx * the delay start to increase rapidly. The goal of a properly tuned system 1049164410Ssyrinx * should be to keep the amount of dirty data out of that range by first 1050164410Ssyrinx * ensuring that the appropriate limits are set for the I/O scheduler to reach 1051164410Ssyrinx * optimal throughput on the backend storage, and then by changing the value 1052164410Ssyrinx * of zfs_delay_scale to increase the steepness of the curve. 
1053164410Ssyrinx */ 1054164410Ssyrinxstatic void 1055164410Ssyrinxdmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) 1056164410Ssyrinx{ 1057164410Ssyrinx dsl_pool_t *dp = tx->tx_pool; 1058164410Ssyrinx uint64_t delay_min_bytes = 1059164410Ssyrinx zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; 1060164410Ssyrinx hrtime_t wakeup, min_tx_time, now; 1061164410Ssyrinx 1062164410Ssyrinx if (dirty <= delay_min_bytes) 1063164410Ssyrinx return; 1064164410Ssyrinx 1065164410Ssyrinx /* 1066164410Ssyrinx * The caller has already waited until we are under the max. 1067164410Ssyrinx * We make them pass us the amount of dirty data so we don't 1068164410Ssyrinx * have to handle the case of it being >= the max, which could 1069164410Ssyrinx * cause a divide-by-zero if it's == the max. 1070164410Ssyrinx */ 1071164410Ssyrinx ASSERT3U(dirty, <, zfs_dirty_data_max); 1072164410Ssyrinx 1073164410Ssyrinx now = gethrtime(); 1074164410Ssyrinx min_tx_time = zfs_delay_scale * 1075164410Ssyrinx (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); 1076164410Ssyrinx if (now > tx->tx_start + min_tx_time) 1077164410Ssyrinx return; 1078164410Ssyrinx 1079164410Ssyrinx min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); 1080164410Ssyrinx 1081164410Ssyrinx DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, 1082164410Ssyrinx uint64_t, min_tx_time); 1083164410Ssyrinx 1084164410Ssyrinx mutex_enter(&dp->dp_lock); 1085164410Ssyrinx wakeup = MAX(tx->tx_start + min_tx_time, 1086164410Ssyrinx dp->dp_last_wakeup + min_tx_time); 1087164410Ssyrinx dp->dp_last_wakeup = wakeup; 1088164410Ssyrinx mutex_exit(&dp->dp_lock); 1089164410Ssyrinx 1090164410Ssyrinx#ifdef _KERNEL 1091164410Ssyrinx#ifdef illumos 1092164410Ssyrinx mutex_enter(&curthread->t_delay_lock); 1093164410Ssyrinx while (cv_timedwait_hires(&curthread->t_delay_cv, 1094164410Ssyrinx &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, 1095164410Ssyrinx CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) 1096164410Ssyrinx continue; 
1097164410Ssyrinx mutex_exit(&curthread->t_delay_lock); 1098164410Ssyrinx#else 1099164410Ssyrinx pause_sbt("dmu_tx_delay", wakeup * SBT_1NS, 1100164410Ssyrinx zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE); 1101164410Ssyrinx#endif 1102164410Ssyrinx#else 1103164410Ssyrinx hrtime_t delta = wakeup - gethrtime(); 1104164410Ssyrinx struct timespec ts; 1105164410Ssyrinx ts.tv_sec = delta / NANOSEC; 1106164410Ssyrinx ts.tv_nsec = delta % NANOSEC; 1107164410Ssyrinx (void) nanosleep(&ts, NULL); 1108164410Ssyrinx#endif 1109164410Ssyrinx} 1110164410Ssyrinx 1111164410Ssyrinxstatic int 1112164410Ssyrinxdmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) 1113164410Ssyrinx{ 1114164410Ssyrinx dmu_tx_hold_t *txh; 1115164410Ssyrinx spa_t *spa = tx->tx_pool->dp_spa; 1116164410Ssyrinx uint64_t memory, asize, fsize, usize; 1117164410Ssyrinx uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; 1118164410Ssyrinx 1119164410Ssyrinx ASSERT0(tx->tx_txg); 1120164410Ssyrinx 1121164410Ssyrinx if (tx->tx_err) 1122164410Ssyrinx return (tx->tx_err); 1123164410Ssyrinx 1124164410Ssyrinx if (spa_suspended(spa)) { 1125164410Ssyrinx /* 1126164410Ssyrinx * If the user has indicated a blocking failure mode 1127164410Ssyrinx * then return ERESTART which will block in dmu_tx_wait(). 1128164410Ssyrinx * Otherwise, return EIO so that an error can get 1129164410Ssyrinx * propagated back to the VOP calls. 1130164410Ssyrinx * 1131164410Ssyrinx * Note that we always honor the txg_how flag regardless 1132164410Ssyrinx * of the failuremode setting. 
1133164410Ssyrinx */ 1134164410Ssyrinx if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && 1135164410Ssyrinx txg_how != TXG_WAIT) 1136164410Ssyrinx return (SET_ERROR(EIO)); 1137164410Ssyrinx 1138164410Ssyrinx return (SET_ERROR(ERESTART)); 1139164410Ssyrinx } 1140164410Ssyrinx 1141164410Ssyrinx if (!tx->tx_waited && 1142164410Ssyrinx dsl_pool_need_dirty_delay(tx->tx_pool)) { 1143164410Ssyrinx tx->tx_wait_dirty = B_TRUE; 1144164410Ssyrinx return (SET_ERROR(ERESTART)); 1145164410Ssyrinx } 1146164410Ssyrinx 1147164410Ssyrinx tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); 1148164410Ssyrinx tx->tx_needassign_txh = NULL; 1149164410Ssyrinx 1150164410Ssyrinx /* 1151164410Ssyrinx * NB: No error returns are allowed after txg_hold_open, but 1152164410Ssyrinx * before processing the dnode holds, due to the 1153164410Ssyrinx * dmu_tx_unassign() logic. 1154164410Ssyrinx */ 1155164410Ssyrinx 1156164410Ssyrinx towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; 1157164410Ssyrinx for (txh = list_head(&tx->tx_holds); txh; 1158164410Ssyrinx txh = list_next(&tx->tx_holds, txh)) { 1159164410Ssyrinx dnode_t *dn = txh->txh_dnode; 1160164410Ssyrinx if (dn != NULL) { 1161164410Ssyrinx mutex_enter(&dn->dn_mtx); 1162164410Ssyrinx if (dn->dn_assigned_txg == tx->tx_txg - 1) { 1163164410Ssyrinx mutex_exit(&dn->dn_mtx); 1164164410Ssyrinx tx->tx_needassign_txh = txh; 1165164410Ssyrinx return (SET_ERROR(ERESTART)); 1166164410Ssyrinx } 1167164410Ssyrinx if (dn->dn_assigned_txg == 0) 1168164410Ssyrinx dn->dn_assigned_txg = tx->tx_txg; 1169164410Ssyrinx ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1170164410Ssyrinx (void) refcount_add(&dn->dn_tx_holds, tx); 1171164410Ssyrinx mutex_exit(&dn->dn_mtx); 1172164410Ssyrinx } 1173164410Ssyrinx towrite += txh->txh_space_towrite; 1174164410Ssyrinx tofree += txh->txh_space_tofree; 1175164410Ssyrinx tooverwrite += txh->txh_space_tooverwrite; 1176164410Ssyrinx tounref += txh->txh_space_tounref; 1177164410Ssyrinx tohold += 
txh->txh_memory_tohold; 1178164410Ssyrinx fudge += txh->txh_fudge; 1179164410Ssyrinx } 1180164410Ssyrinx 1181164410Ssyrinx /* 1182164410Ssyrinx * If a snapshot has been taken since we made our estimates, 1183164410Ssyrinx * assume that we won't be able to free or overwrite anything. 1184164410Ssyrinx */ 1185164410Ssyrinx if (tx->tx_objset && 1186164410Ssyrinx dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > 1187164410Ssyrinx tx->tx_lastsnap_txg) { 1188164410Ssyrinx towrite += tooverwrite; 1189164410Ssyrinx tooverwrite = tofree = 0; 1190164410Ssyrinx } 1191164410Ssyrinx 1192164410Ssyrinx /* needed allocation: worst-case estimate of write space */ 1193164410Ssyrinx asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); 1194164410Ssyrinx /* freed space estimate: worst-case overwrite + free estimate */ 1195164410Ssyrinx fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; 1196164410Ssyrinx /* convert unrefd space to worst-case estimate */ 1197164410Ssyrinx usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); 1198164410Ssyrinx /* calculate memory footprint estimate */ 1199164410Ssyrinx memory = towrite + tooverwrite + tohold; 1200164410Ssyrinx 1201164410Ssyrinx#ifdef ZFS_DEBUG 1202164410Ssyrinx /* 1203164410Ssyrinx * Add in 'tohold' to account for our dirty holds on this memory 1204164410Ssyrinx * XXX - the "fudge" factor is to account for skipped blocks that 1205164410Ssyrinx * we missed because dnode_next_offset() misses in-core-only blocks. 
1206164410Ssyrinx */ 1207164410Ssyrinx tx->tx_space_towrite = asize + 1208164410Ssyrinx spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); 1209164410Ssyrinx tx->tx_space_tofree = tofree; 1210164410Ssyrinx tx->tx_space_tooverwrite = tooverwrite; 1211164410Ssyrinx tx->tx_space_tounref = tounref; 1212164410Ssyrinx#endif 1213164410Ssyrinx 1214164410Ssyrinx if (tx->tx_dir && asize != 0) { 1215164410Ssyrinx int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, 1216164410Ssyrinx asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); 1217164410Ssyrinx if (err) 1218164410Ssyrinx return (err); 1219164410Ssyrinx } 1220164410Ssyrinx 1221164410Ssyrinx return (0); 1222164410Ssyrinx} 1223164410Ssyrinx 1224164410Ssyrinxstatic void 1225164410Ssyrinxdmu_tx_unassign(dmu_tx_t *tx) 1226164410Ssyrinx{ 1227164410Ssyrinx dmu_tx_hold_t *txh; 1228164410Ssyrinx 1229164410Ssyrinx if (tx->tx_txg == 0) 1230164410Ssyrinx return; 1231164410Ssyrinx 1232164410Ssyrinx txg_rele_to_quiesce(&tx->tx_txgh); 1233164410Ssyrinx 1234164410Ssyrinx /* 1235164410Ssyrinx * Walk the transaction's hold list, removing the hold on the 1236164410Ssyrinx * associated dnode, and notifying waiters if the refcount drops to 0. 
1237164410Ssyrinx */ 1238164410Ssyrinx for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; 1239164410Ssyrinx txh = list_next(&tx->tx_holds, txh)) { 1240164410Ssyrinx dnode_t *dn = txh->txh_dnode; 1241164410Ssyrinx 1242164410Ssyrinx if (dn == NULL) 1243164410Ssyrinx continue; 1244164410Ssyrinx mutex_enter(&dn->dn_mtx); 1245164410Ssyrinx ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1246164410Ssyrinx 1247164410Ssyrinx if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 1248164410Ssyrinx dn->dn_assigned_txg = 0; 1249164410Ssyrinx cv_broadcast(&dn->dn_notxholds); 1250164410Ssyrinx } 1251164410Ssyrinx mutex_exit(&dn->dn_mtx); 1252164410Ssyrinx } 1253164410Ssyrinx 1254164410Ssyrinx txg_rele_to_sync(&tx->tx_txgh); 1255164410Ssyrinx 1256164410Ssyrinx tx->tx_lasttried_txg = tx->tx_txg; 1257164410Ssyrinx tx->tx_txg = 0; 1258164410Ssyrinx} 1259164410Ssyrinx 1260164410Ssyrinx/* 1261164410Ssyrinx * Assign tx to a transaction group. txg_how can be one of: 1262164410Ssyrinx * 1263164410Ssyrinx * (1) TXG_WAIT. If the current open txg is full, waits until there's 1264164410Ssyrinx * a new one. This should be used when you're not holding locks. 1265164410Ssyrinx * It will only fail if we're truly out of space (or over quota). 1266164410Ssyrinx * 1267164410Ssyrinx * (2) TXG_NOWAIT. If we can't assign into the current open txg without 1268164410Ssyrinx * blocking, returns immediately with ERESTART. This should be used 1269164410Ssyrinx * whenever you're holding locks. On an ERESTART error, the caller 1270164410Ssyrinx * should drop locks, do a dmu_tx_wait(tx), and try again. 1271164410Ssyrinx * 1272164410Ssyrinx * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() 1273164410Ssyrinx * has already been called on behalf of this operation (though 1274164410Ssyrinx * most likely on a different tx). 
1275164410Ssyrinx */ 1276164410Ssyrinxint 1277164410Ssyrinxdmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) 1278164410Ssyrinx{ 1279164410Ssyrinx int err; 1280164410Ssyrinx 1281164410Ssyrinx ASSERT(tx->tx_txg == 0); 1282164410Ssyrinx ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || 1283164410Ssyrinx txg_how == TXG_WAITED); 1284164410Ssyrinx ASSERT(!dsl_pool_sync_context(tx->tx_pool)); 1285164410Ssyrinx 1286164410Ssyrinx /* If we might wait, we must not hold the config lock. */ 1287164410Ssyrinx ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); 1288164410Ssyrinx 1289164410Ssyrinx if (txg_how == TXG_WAITED) 1290164410Ssyrinx tx->tx_waited = B_TRUE; 1291164410Ssyrinx 1292164410Ssyrinx while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { 1293164410Ssyrinx dmu_tx_unassign(tx); 1294 1295 if (err != ERESTART || txg_how != TXG_WAIT) 1296 return (err); 1297 1298 dmu_tx_wait(tx); 1299 } 1300 1301 txg_rele_to_quiesce(&tx->tx_txgh); 1302 1303 return (0); 1304} 1305 1306void 1307dmu_tx_wait(dmu_tx_t *tx) 1308{ 1309 spa_t *spa = tx->tx_pool->dp_spa; 1310 dsl_pool_t *dp = tx->tx_pool; 1311 1312 ASSERT(tx->tx_txg == 0); 1313 ASSERT(!dsl_pool_config_held(tx->tx_pool)); 1314 1315 if (tx->tx_wait_dirty) { 1316 /* 1317 * dmu_tx_try_assign() has determined that we need to wait 1318 * because we've consumed much or all of the dirty buffer 1319 * space. 1320 */ 1321 mutex_enter(&dp->dp_lock); 1322 while (dp->dp_dirty_total >= zfs_dirty_data_max) 1323 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); 1324 uint64_t dirty = dp->dp_dirty_total; 1325 mutex_exit(&dp->dp_lock); 1326 1327 dmu_tx_delay(tx, dirty); 1328 1329 tx->tx_wait_dirty = B_FALSE; 1330 1331 /* 1332 * Note: setting tx_waited only has effect if the caller 1333 * used TX_WAIT. Otherwise they are going to destroy 1334 * this tx and try again. The common case, zfs_write(), 1335 * uses TX_WAIT. 
1336 */ 1337 tx->tx_waited = B_TRUE; 1338 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { 1339 /* 1340 * If the pool is suspended we need to wait until it 1341 * is resumed. Note that it's possible that the pool 1342 * has become active after this thread has tried to 1343 * obtain a tx. If that's the case then tx_lasttried_txg 1344 * would not have been set. 1345 */ 1346 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); 1347 } else if (tx->tx_needassign_txh) { 1348 /* 1349 * A dnode is assigned to the quiescing txg. Wait for its 1350 * transaction to complete. 1351 */ 1352 dnode_t *dn = tx->tx_needassign_txh->txh_dnode; 1353 1354 mutex_enter(&dn->dn_mtx); 1355 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) 1356 cv_wait(&dn->dn_notxholds, &dn->dn_mtx); 1357 mutex_exit(&dn->dn_mtx); 1358 tx->tx_needassign_txh = NULL; 1359 } else { 1360 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); 1361 } 1362} 1363 1364void 1365dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) 1366{ 1367#ifdef ZFS_DEBUG 1368 if (tx->tx_dir == NULL || delta == 0) 1369 return; 1370 1371 if (delta > 0) { 1372 ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, 1373 tx->tx_space_towrite); 1374 (void) refcount_add_many(&tx->tx_space_written, delta, NULL); 1375 } else { 1376 (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); 1377 } 1378#endif 1379} 1380 1381void 1382dmu_tx_commit(dmu_tx_t *tx) 1383{ 1384 dmu_tx_hold_t *txh; 1385 1386 ASSERT(tx->tx_txg != 0); 1387 1388 /* 1389 * Go through the transaction's hold list and remove holds on 1390 * associated dnodes, notifying waiters if no holds remain. 
1391 */ 1392 while (txh = list_head(&tx->tx_holds)) { 1393 dnode_t *dn = txh->txh_dnode; 1394 1395 list_remove(&tx->tx_holds, txh); 1396 kmem_free(txh, sizeof (dmu_tx_hold_t)); 1397 if (dn == NULL) 1398 continue; 1399 mutex_enter(&dn->dn_mtx); 1400 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1401 1402 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 1403 dn->dn_assigned_txg = 0; 1404 cv_broadcast(&dn->dn_notxholds); 1405 } 1406 mutex_exit(&dn->dn_mtx); 1407 dnode_rele(dn, tx); 1408 } 1409 1410 if (tx->tx_tempreserve_cookie) 1411 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); 1412 1413 if (!list_is_empty(&tx->tx_callbacks)) 1414 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); 1415 1416 if (tx->tx_anyobj == FALSE) 1417 txg_rele_to_sync(&tx->tx_txgh); 1418 1419 list_destroy(&tx->tx_callbacks); 1420 list_destroy(&tx->tx_holds); 1421#ifdef ZFS_DEBUG 1422 dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", 1423 tx->tx_space_towrite, refcount_count(&tx->tx_space_written), 1424 tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); 1425 refcount_destroy_many(&tx->tx_space_written, 1426 refcount_count(&tx->tx_space_written)); 1427 refcount_destroy_many(&tx->tx_space_freed, 1428 refcount_count(&tx->tx_space_freed)); 1429#endif 1430 kmem_free(tx, sizeof (dmu_tx_t)); 1431} 1432 1433void 1434dmu_tx_abort(dmu_tx_t *tx) 1435{ 1436 dmu_tx_hold_t *txh; 1437 1438 ASSERT(tx->tx_txg == 0); 1439 1440 while (txh = list_head(&tx->tx_holds)) { 1441 dnode_t *dn = txh->txh_dnode; 1442 1443 list_remove(&tx->tx_holds, txh); 1444 kmem_free(txh, sizeof (dmu_tx_hold_t)); 1445 if (dn != NULL) 1446 dnode_rele(dn, tx); 1447 } 1448 1449 /* 1450 * Call any registered callbacks with an error code. 
1451 */ 1452 if (!list_is_empty(&tx->tx_callbacks)) 1453 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); 1454 1455 list_destroy(&tx->tx_callbacks); 1456 list_destroy(&tx->tx_holds); 1457#ifdef ZFS_DEBUG 1458 refcount_destroy_many(&tx->tx_space_written, 1459 refcount_count(&tx->tx_space_written)); 1460 refcount_destroy_many(&tx->tx_space_freed, 1461 refcount_count(&tx->tx_space_freed)); 1462#endif 1463 kmem_free(tx, sizeof (dmu_tx_t)); 1464} 1465 1466uint64_t 1467dmu_tx_get_txg(dmu_tx_t *tx) 1468{ 1469 ASSERT(tx->tx_txg != 0); 1470 return (tx->tx_txg); 1471} 1472 1473dsl_pool_t * 1474dmu_tx_pool(dmu_tx_t *tx) 1475{ 1476 ASSERT(tx->tx_pool != NULL); 1477 return (tx->tx_pool); 1478} 1479 1480 1481void 1482dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) 1483{ 1484 dmu_tx_callback_t *dcb; 1485 1486 dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); 1487 1488 dcb->dcb_func = func; 1489 dcb->dcb_data = data; 1490 1491 list_insert_tail(&tx->tx_callbacks, dcb); 1492} 1493 1494/* 1495 * Call all the commit callbacks on a list, with a given error code. 1496 */ 1497void 1498dmu_tx_do_callbacks(list_t *cb_list, int error) 1499{ 1500 dmu_tx_callback_t *dcb; 1501 1502 while (dcb = list_head(cb_list)) { 1503 list_remove(cb_list, dcb); 1504 dcb->dcb_func(dcb->dcb_data, error); 1505 kmem_free(dcb, sizeof (dmu_tx_callback_t)); 1506 } 1507} 1508 1509/* 1510 * Interface to hold a bunch of attributes. 1511 * used for creating new files. 1512 * attrsize is the total size of all attributes 1513 * to be added during object creation 1514 * 1515 * For updating/adding a single attribute dmu_tx_hold_sa() should be used. 1516 */ 1517 1518/* 1519 * hold necessary attribute name for attribute registration. 1520 * should be a very rare case where this is needed. If it does 1521 * happen it would only happen on the first write to the file system. 
1522 */ 1523static void 1524dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) 1525{ 1526 int i; 1527 1528 if (!sa->sa_need_attr_registration) 1529 return; 1530 1531 for (i = 0; i != sa->sa_num_attrs; i++) { 1532 if (!sa->sa_attr_table[i].sa_registered) { 1533 if (sa->sa_reg_attr_obj) 1534 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, 1535 B_TRUE, sa->sa_attr_table[i].sa_name); 1536 else 1537 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1538 B_TRUE, sa->sa_attr_table[i].sa_name); 1539 } 1540 } 1541} 1542 1543 1544void 1545dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) 1546{ 1547 dnode_t *dn; 1548 dmu_tx_hold_t *txh; 1549 1550 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, 1551 THT_SPILL, 0, 0); 1552 1553 dn = txh->txh_dnode; 1554 1555 if (dn == NULL) 1556 return; 1557 1558 /* If blkptr doesn't exist then add space to towrite */ 1559 if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { 1560 txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE; 1561 } else { 1562 blkptr_t *bp; 1563 1564 bp = &dn->dn_phys->dn_spill; 1565 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 1566 bp, bp->blk_birth)) 1567 txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE; 1568 else 1569 txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE; 1570 if (!BP_IS_HOLE(bp)) 1571 txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE; 1572 } 1573} 1574 1575void 1576dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) 1577{ 1578 sa_os_t *sa = tx->tx_objset->os_sa; 1579 1580 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1581 1582 if (tx->tx_objset->os_sa->sa_master_obj == 0) 1583 return; 1584 1585 if (tx->tx_objset->os_sa->sa_layout_attr_obj) 1586 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); 1587 else { 1588 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); 1589 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); 1590 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1591 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1592 } 1593 1594 dmu_tx_sa_registration_hold(sa, tx); 
1595 1596 if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) 1597 return; 1598 1599 (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, 1600 THT_SPILL, 0, 0); 1601} 1602 1603/* 1604 * Hold SA attribute 1605 * 1606 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) 1607 * 1608 * variable_size is the total size of all variable sized attributes 1609 * passed to this function. It is not the total size of all 1610 * variable size attributes that *may* exist on this object. 1611 */ 1612void 1613dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) 1614{ 1615 uint64_t object; 1616 sa_os_t *sa = tx->tx_objset->os_sa; 1617 1618 ASSERT(hdl != NULL); 1619 1620 object = sa_handle_object(hdl); 1621 1622 dmu_tx_hold_bonus(tx, object); 1623 1624 if (tx->tx_objset->os_sa->sa_master_obj == 0) 1625 return; 1626 1627 if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || 1628 tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { 1629 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); 1630 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); 1631 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1632 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1633 } 1634 1635 dmu_tx_sa_registration_hold(sa, tx); 1636 1637 if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) 1638 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); 1639 1640 if (sa->sa_force_spill || may_grow || hdl->sa_spill) { 1641 ASSERT(tx->tx_txg == 0); 1642 dmu_tx_hold_spill(tx, object); 1643 } else { 1644 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; 1645 dnode_t *dn; 1646 1647 DB_DNODE_ENTER(db); 1648 dn = DB_DNODE(db); 1649 if (dn->dn_have_spill) { 1650 ASSERT(tx->tx_txg == 0); 1651 dmu_tx_hold_spill(tx, object); 1652 } 1653 DB_DNODE_EXIT(db); 1654 } 1655} 1656