dmu_tx.c revision 321549
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h>
#include <sys/spa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd != NULL)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
	    offsetof(dmu_tx_callback_t, dcb_node));
	tx->tx_start = gethrtime();
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	return (tx);
}

dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}
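/*
 * A minimal sketch of the transaction life cycle, as a DMU consumer would
 * use it (os, object, off, len and buf are hypothetical; error handling
 * abbreviated):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	int error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);
 */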
static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
    uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;

	if (dn != NULL) {
		(void) refcount_add(&dn->dn_holds, tx);
		if (tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
	refcount_create(&txh->txh_space_towrite);
	refcount_create(&txh->txh_memory_tohold);
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dnode_t *dn = NULL;
	dmu_tx_hold_t *txh;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os, object, FTAG, &dn);
		if (err != 0) {
			tx->tx_err = err;
			return (NULL);
		}
	}
	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
	if (dn != NULL)
		dnode_rele(dn, FTAG);
	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx))
		(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
}

/*
 * This function reads specified data from disk.  The specified data will
 * be needed to perform the transaction -- i.e., it will be read after
 * we do dmu_tx_assign().  There are two reasons that we read the data now
 * (before dmu_tx_assign()):
 *
 * 1. Reading it now has potentially better performance.  The transaction
 * has not yet been assigned, so the TXG is not held open, and also the
 * caller typically has fewer locks held when calling dmu_tx_hold_*() than
 * after the transaction has been assigned.  This reduces the lock (and txg)
 * hold times, thus reducing lock contention.
 *
 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
 * that are detected before they start making changes to the DMU state
 * (i.e. now).  Once the transaction has been assigned, and some DMU
 * state has been changed, it can be difficult to recover from an i/o
 * error (e.g. to undo the changes already made in memory at the DMU
 * layer).  Typically code to do so does not exist in the caller -- it
 * assumes that the data has already been cached and thus i/o errors are
 * not possible.
 *
 * It has been observed that the i/o initiated here can be a performance
 * problem, and it appears to be optional, because we don't look at the
 * data which is read.  However, removing this read would only serve to
 * move the work elsewhere (after the dmu_tx_assign()), where it may
 * have a greater impact on performance (in addition to the impact on
 * fault tolerance noted above).
 */
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (SET_ERROR(EIO));
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
		txh->txh_tx->tx_err = SET_ERROR(EFBIG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}
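/*
 * Worked example of the checks above, assuming a hypothetical file with
 * dn_datablksz = 128K (dn_datablkshift = 17) and dn_maxblkid >= 2: a write
 * with off = 1000 and len = 300000 spans level-0 blocks 0..2.  Block 0 is
 * read because the write starts mid-block (P2PHASE(off, 128K) != 0), and
 * block 2 is read because the write ends mid-block; block 1 is fully
 * overwritten and need not be read.
 */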
static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

void
dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}
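/*
 * Sketch: the _by_dnode variant skips the object-number lookup that
 * dmu_tx_hold_object_impl() performs via dnode_hold(), so a caller that
 * already has the dnode in hand (dn here is hypothetical) can declare the
 * write directly:
 *
 *	dmu_tx_hold_write_by_dnode(tx, dn, off, len);
 */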
/*
 * This function marks the transaction as being a "net free".  The end
 * result is that refquotas will be disabled for this transaction, and
 * this transaction will be able to use half of the pool space overhead
 * (see dsl_pool_adjustedsize()).  Therefore this function should only
 * be called for transactions that we expect will not cause a net increase
 * in the amount of space used (but it's OK if that is occasionally not true).
 */
void
dmu_tx_mark_netfree(dmu_tx_t *tx)
{
	tx->tx_netfree = B_TRUE;
}
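/*
 * Sketch: a delete-only operation (hypothetical object) would typically
 * pair a free hold with the netfree mark before assigning the tx:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 *	dmu_tx_mark_netfree(tx);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 */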
static void
dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_count_dnode(txh);

	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note:  dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed.  Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift == 0) {
		if (off != 0 || len < dn->dn_datablksz)
			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	} else {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	}

	/*
	 * Check level-1 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		uint64_t start = off >> shift;
		uint64_t end = (off + len) >> shift;

		ASSERT(dn->dn_indblkshift != 0);

		/*
		 * dnode_reallocate() can result in an object with indirect
		 * blocks having an odd data block size.  In this case,
		 * just check the single block.
		 */
		if (dn->dn_datablkshift == 0)
			start = end = 0;

		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (uint64_t i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH || i > end)
				break;
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}

			(void) refcount_add_many(&txh->txh_memory_tohold,
			    1 << dn->dn_indblkshift, FTAG);

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}
		}
		err = zio_wait(zio);
		if (err != 0) {
			tx->tx_err = err;
			return;
		}
	}
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh != NULL)
		dmu_tx_hold_free_impl(txh, off, len);
}

void
dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
	if (txh != NULL)
		dmu_tx_hold_free_impl(txh, off, len);
}
static void
dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, int add, const char *name)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_count_dnode(txh);

	/*
	 * Modifying an almost-full microzap is around the worst case (128KB).
	 *
	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * - 4 new blocks written if adding:
	 *    - 2 blocks for possibly split leaves,
	 *    - 2 grown ptrtbl blocks
	 */
	(void) refcount_add_many(&txh->txh_space_towrite,
	    MZAP_MAX_BLKSZ, FTAG);

	if (dn == NULL)
		return;

	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);

	if (dn->dn_maxblkid == 0 || name == NULL) {
		/*
		 * This is a microzap (only one block), or we don't know
		 * the name.  Check the first block for i/o errors.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err != 0) {
			tx->tx_err = err;
		}
	} else {
		/*
		 * Access the name so that we'll check for i/o errors to
		 * the leaf blocks, etc.  We ignore ENOENT, as this name
		 * may not yet exist.
		 */
		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
		if (err == EIO || err == ECKSUM || err == ENXIO) {
			tx->tx_err = err;
		}
	}
}

void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, add, name);
}

void
dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT(dn != NULL);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, add, name);
}

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);

	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
}
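/*
 * Sketch: a directory-entry creation would combine these holds roughly as
 * follows (dir_object and the name are hypothetical):
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_zap(tx, dir_object, B_TRUE, "newfile");
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 */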
#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif
/*
 * If we can't do 10 iops, something is wrong.  Let us go ahead
 * and hit zfs_dirty_data_max.
 */
hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */

/*
 * We delay transactions when we've determined that the backend storage
 * isn't able to accommodate the rate of incoming writes.
 *
 * If there is already a transaction waiting, we delay relative to when
 * that transaction finishes waiting.  This way the calculated min_time
 * is independent of the number of threads concurrently executing
 * transactions.
 *
 * If we are the only waiter, wait relative to when the transaction
 * started, rather than the current time.  This credits the transaction for
 * "time already served", e.g. reading indirect blocks.
 *
 * The minimum time for a transaction to take is calculated as:
 *     min_time = scale * (dirty - min) / (max - dirty)
 *     min_time is then capped at zfs_delay_max_ns.
 *
 * The delay has two degrees of freedom that can be adjusted via tunables.
 * The percentage of dirty data at which we start to delay is defined by
 * zfs_delay_min_dirty_percent.  This should typically be at or above
 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 * delay after writing at full speed has failed to keep up with the incoming
 * write rate.  The scale of the curve is defined by zfs_delay_scale.  Roughly
 * speaking, this variable determines the amount of delay at the midpoint of
 * the curve.
 *
 * delay
 *  10ms +-------------------------------------------------------------*+
 *       |                                                             *|
 *   9ms +                                                             *+
 *       |                                                             *|
 *   8ms +                                                             *+
 *       |                                                            * |
 *   7ms +                                                            * +
 *       |                                                            * |
 *   6ms +                                                            * +
 *       |                                                            * |
 *   5ms +                                                            * +
 *       |                                                            * |
 *   4ms +                                                            * +
 *       |                                                            * |
 *   3ms +                                                            * +
 *       |                                                            * |
 *   2ms +                                              (midpoint)   *  +
 *       |                                                  |       **  |
 *   1ms +                                                  v     ***   +
 *       |             zfs_delay_scale ---------->     ********         |
 *     0 +-------------------------------------*********----------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note that since the delay is added to the outstanding time remaining on the
 * most recent transaction, the delay is effectively the inverse of IOPS.
 * Here the midpoint of 500us translates to 2000 IOPS.  The shape of the curve
 * was chosen such that small changes in the amount of accumulated dirty data
 * in the first 3/4 of the curve yield relatively small differences in the
 * amount of delay.
 *
 * The effects can be easier to understand when the amount of delay is
 * represented on a log scale:
 *
 * delay
 * 100ms +-------------------------------------------------------------++
 *       +                                                              +
 *       |                                                              |
 *       +                                                             *+
 *  10ms +                                                             *+
 *       +                                                           ** +
 *       |                                              (midpoint)  **  |
 *       +                                                  |     **    +
 *   1ms +                                                  v ****      +
 *       +             zfs_delay_scale ---------->     *****            +
 *       |                                          ****                |
 *       +                                       ****                   +
 * 100us +                                     **                       +
 *       +                                    *                         +
 *       |                                   *                          |
 *       +                                  *                           +
 *  10us +                                 *                            +
 *       +                                                              +
 *       |                                                              |
 *       +                                                              +
 *       +--------------------------------------------------------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note here that only as the amount of dirty data approaches its limit does
 * the delay start to increase rapidly.  The goal of a properly tuned system
 * should be to keep the amount of dirty data out of that range by first
 * ensuring that the appropriate limits are set for the I/O scheduler to reach
 * optimal throughput on the backend storage, and then by changing the value
 * of zfs_delay_scale to increase the steepness of the curve.
 */
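/*
 * Worked example of the min_time formula above, assuming hypothetical
 * tunings of zfs_dirty_data_max = 4GB, zfs_delay_min_dirty_percent = 60
 * (so delay_min_bytes = 2.4GB) and zfs_delay_scale = 500000ns: a tx
 * arriving with 3GB dirty is delayed at least
 *
 *	min_time = 500000 * (3G - 2.4G) / (4G - 3G) = 300000ns = 300us
 *
 * before the cap at zfs_delay_max_ns is applied.
 */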
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
	dsl_pool_t *dp = tx->tx_pool;
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	hrtime_t wakeup, min_tx_time, now;

	if (dirty <= delay_min_bytes)
		return;

	/*
	 * The caller has already waited until we are under the max.
	 * We make them pass us the amount of dirty data so we don't
	 * have to handle the case of it being >= the max, which could
	 * cause a divide-by-zero if it's == the max.
	 */
	ASSERT3U(dirty, <, zfs_dirty_data_max);

	now = gethrtime();
	min_tx_time = zfs_delay_scale *
	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
	if (now > tx->tx_start + min_tx_time)
		return;

	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);

	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
	    uint64_t, min_tx_time);

	mutex_enter(&dp->dp_lock);
	wakeup = MAX(tx->tx_start + min_tx_time,
	    dp->dp_last_wakeup + min_tx_time);
	dp->dp_last_wakeup = wakeup;
	mutex_exit(&dp->dp_lock);

#ifdef _KERNEL
#ifdef illumos
	mutex_enter(&curthread->t_delay_lock);
	while (cv_timedwait_hires(&curthread->t_delay_cv,
	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
		continue;
	mutex_exit(&curthread->t_delay_lock);
#else
	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
#endif
#else
	hrtime_t delta = wakeup - gethrtime();
	struct timespec ts;
	ts.tv_sec = delta / NANOSEC;
	ts.tv_nsec = delta % NANOSEC;
	(void) nanosleep(&ts, NULL);
#endif
}
/*
 * This routine attempts to assign the transaction to a transaction group.
 * To do so, we must determine if there is sufficient free space on disk.
 *
 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
 * on it), then it is assumed that there is sufficient free space,
 * unless there's insufficient slop space in the pool (see the comment
 * above spa_slop_shift in spa_misc.c).
 *
 * If it is not a "netfree" transaction, then if the data already on disk
 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
 * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
 * plus the rough estimate of this transaction's changes, may exceed the
 * allowed usage, then this will fail with ERESTART, which will cause the
 * caller to wait for the pending changes to be written to disk (by waiting
 * for the next TXG to open), and then check the space usage again.
 *
 * The rough estimate of pending changes is comprised of the sum of:
 *
 *  - this transaction's holds' txh_space_towrite
 *
 *  - dd_tempreserved[], which is the sum of in-flight transactions'
 *    holds' txh_space_towrite (i.e. those transactions that have called
 *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 *
 *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 *
 * Note that all of these values are inflated by spa_get_worst_case_asize(),
 * which means that we may get ERESTART well before we are actually in danger
 * of running out of space, but this also mitigates any small inaccuracies
 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 * to the MOS).
 *
 * Note that due to this algorithm, it is possible to exceed the allowed
 * usage by one transaction.  Also, as we approach the allowed usage,
 * we will allow a very limited amount of changes into each TXG, thus
 * decreasing performance.
 */
static int
dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (SET_ERROR(EIO));

		return (SET_ERROR(ERESTART));
	}

	if (!tx->tx_waited &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (SET_ERROR(ERESTART));
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += refcount_count(&txh->txh_space_towrite);
		tohold += refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	return (0);
}
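/*
 * Note on the inflation mentioned above: spa_get_worst_case_asize()
 * scales the logical write estimate by the spa_asize_inflation tunable,
 * whose default (24 at the time of this revision) accounts for worst-case
 * RAID-Z parity, ditto-block copies, and metaslab-alignment overhead.
 * So a 1MB towrite estimate would tempreserve roughly 24MB of worst-case
 * allocated space.
 */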
static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	/*
	 * Walk the transaction's hold list, removing the hold on the
	 * associated dnode, and notifying waiters if the refcount drops to 0.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
	    txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}
/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
 *
 * (3)	TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
 *	has already been called on behalf of this operation (though
 *	most likely on a different tx).
 */
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
	    txg_how == TXG_WAITED);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	/* If we might wait, we must not hold the config lock. */
	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));

	if (txg_how == TXG_WAITED)
		tx->tx_waited = B_TRUE;

	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}
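/*
 * Sketch of the TXG_NOWAIT retry pattern described above, as a lock-holding
 * caller would use it (locks, labels, and names are hypothetical):
 *
 *	boolean_t waited = B_FALSE;
 * top:
 *	... acquire locks ...
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, object);
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error != 0) {
 *		... drop locks ...
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	... modify DMU state, then dmu_tx_commit(tx) ...
 */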
void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	if (tx->tx_wait_dirty) {
		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.
		 */
		mutex_enter(&dp->dp_lock);
		while (dp->dp_dirty_total >= zfs_dirty_data_max)
			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
		uint64_t dirty = dp->dp_dirty_total;
		mutex_exit(&dp->dp_lock);

		dmu_tx_delay(tx, dirty);

		tx->tx_wait_dirty = B_FALSE;

		/*
		 * Note: setting tx_waited only has effect if the caller
		 * used TXG_WAIT.  Otherwise they are going to destroy
		 * this tx and try again.  The common case, zfs_write(),
		 * uses TXG_WAIT.
		 */
		tx->tx_waited = B_TRUE;
	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		/*
		 * If the pool is suspended we need to wait until it
		 * is resumed.  Note that it's possible that the pool
		 * has become active after this thread has tried to
		 * obtain a tx.  If that's the case then tx_lasttried_txg
		 * would not have been set.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		/*
		 * A dnode is assigned to the quiescing txg.  Wait for its
		 * transaction to complete.
		 */
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
}

static void
dmu_tx_destroy(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		refcount_destroy_many(&txh->txh_space_towrite,
		    refcount_count(&txh->txh_space_towrite));
		refcount_destroy_many(&txh->txh_memory_tohold,
		    refcount_count(&txh->txh_memory_tohold));
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
	kmem_free(tx, sizeof (dmu_tx_t));
}

void
dmu_tx_commit(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;

		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	dmu_tx_destroy(tx);
}
void
dmu_tx_abort(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg == 0);

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);

	dmu_tx_destroy(tx);
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
	ASSERT(tx->tx_pool != NULL);
	return (tx->tx_pool);
}

void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}

/*
 * Call all the commit callbacks on a list, with a given error code.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_head(cb_list)) != NULL) {
		list_remove(cb_list, dcb);
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}
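/*
 * Sketch: registering a commit callback on a transaction.  The callback
 * fires once the txg commits, or with ECANCELED if the tx is aborted
 * (my_done and arg are hypothetical):
 *
 *	static void
 *	my_done(void *arg, int error)
 *	{
 *		... release resources tied to the write ...
 *	}
 *
 *	dmu_tx_callback_register(tx, my_done, arg);
 */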
/*
 * Interface to hold a bunch of attributes.
 * Used for creating new files.
 * attrsize is the total size of all attributes
 * to be added during object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */

/*
 * Hold the necessary attribute name for attribute registration.
 * It should be a very rare case where this is needed.  If it does
 * happen it would only happen on the first write to the file system.
 */
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
	if (!sa->sa_need_attr_registration)
		return;

	for (int i = 0; i != sa->sa_num_attrs; i++) {
		if (!sa->sa_attr_table[i].sa_registered) {
			if (sa->sa_reg_attr_obj)
				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
			else
				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
		}
	}
}

void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
	    tx->tx_objset, object, THT_SPILL, 0, 0);

	(void) refcount_add_many(&txh->txh_space_towrite,
	    SPA_OLD_MAXBLOCKSIZE, FTAG);
}

void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}
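/*
 * Sketch: a ZPL-style create sizes the hold from the attributes it will
 * write at object creation (the extra term is hypothetical):
 *
 *	dmu_tx_hold_sa_create(tx, ZFS_SA_BASE_ATTR_SIZE + acl_bytes);
 */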
/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 *
 * may_grow indicates that the update may increase the size of the
 * attribute data, in which case the layout ZAP and (potentially) the
 * spill block must also be held.
 */
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
	uint64_t object;
	sa_os_t *sa = tx->tx_objset->os_sa;

	ASSERT(hdl != NULL);

	object = sa_handle_object(hdl);

	dmu_tx_hold_bonus(tx, object);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
		ASSERT(tx->tx_txg == 0);
		dmu_tx_hold_spill(tx, object);
	} else {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		if (dn->dn_have_spill) {
			ASSERT(tx->tx_txg == 0);
			dmu_tx_hold_spill(tx, object);
		}
		DB_DNODE_EXIT(db);
	}
}