dmu_tx.c revision 168404
155682Smarkm/* 2233294Sstas * CDDL HEADER START 3233294Sstas * 4233294Sstas * The contents of this file are subject to the terms of the 555682Smarkm * Common Development and Distribution License (the "License"). 6233294Sstas * You may not use this file except in compliance with the License. 755682Smarkm * 8233294Sstas * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9233294Sstas * or http://www.opensolaris.org/os/licensing. 10233294Sstas * See the License for the specific language governing permissions 1155682Smarkm * and limitations under the License. 12233294Sstas * 13233294Sstas * When distributing Covered Code, include this CDDL HEADER in each 1455682Smarkm * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15233294Sstas * If applicable, add the following below this CDDL HEADER, with the 16233294Sstas * fields enclosed by brackets "[]" replaced with your own identifying 17233294Sstas * information: Portions Copyright [yyyy] [name of copyright owner] 1855682Smarkm * 19233294Sstas * CDDL HEADER END 20233294Sstas */ 21233294Sstas/* 22233294Sstas * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23233294Sstas * Use is subject to license terms. 24233294Sstas */ 25233294Sstas 26233294Sstas#pragma ident "%Z%%M% %I% %E% SMI" 27233294Sstas 28233294Sstas#include <sys/dmu.h> 29233294Sstas#include <sys/dmu_impl.h> 30233294Sstas#include <sys/dbuf.h> 31233294Sstas#include <sys/dmu_tx.h> 32233294Sstas#include <sys/dmu_objset.h> 33233294Sstas#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ 3455682Smarkm#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ 3555682Smarkm#include <sys/dsl_pool.h> 3655682Smarkm#include <sys/zap_impl.h> /* for fzap_default_block_shift */ 3755682Smarkm#include <sys/spa.h> 38233294Sstas#include <sys/zfs_context.h> 39233294Sstas 40233294Sstastypedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, 41233294Sstas uint64_t arg1, uint64_t arg2); 42233294Sstas 43233294Sstas 44233294Sstasdmu_tx_t * 45233294Sstasdmu_tx_create_dd(dsl_dir_t *dd) 46233294Sstas{ 47233294Sstas dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); 48233294Sstas tx->tx_dir = dd; 4955682Smarkm if (dd) 50233294Sstas tx->tx_pool = dd->dd_pool; 5155682Smarkm list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), 5255682Smarkm offsetof(dmu_tx_hold_t, txh_node)); 5355682Smarkm#ifdef ZFS_DEBUG 5455682Smarkm refcount_create(&tx->tx_space_written); 5555682Smarkm refcount_create(&tx->tx_space_freed); 5655682Smarkm#endif 57178825Sdfr return (tx); 5855682Smarkm} 5955682Smarkm 6055682Smarkmdmu_tx_t * 61233294Sstasdmu_tx_create(objset_t *os) 62233294Sstas{ 63233294Sstas dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); 64233294Sstas tx->tx_objset = os; 65233294Sstas tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); 66233294Sstas return (tx); 67233294Sstas} 68233294Sstas 69233294Sstasdmu_tx_t * 70233294Sstasdmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) 71233294Sstas{ 72233294Sstas dmu_tx_t *tx = dmu_tx_create_dd(NULL); 73233294Sstas 74233294Sstas ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 7555682Smarkm tx->tx_pool = dp; 7655682Smarkm tx->tx_txg = txg; 7755682Smarkm tx->tx_anyobj = TRUE; 7855682Smarkm 7955682Smarkm return (tx); 80127808Snectar} 81127808Snectar 82127808Snectarint 83127808Snectardmu_tx_is_syncing(dmu_tx_t *tx) 8478527Sassar{ 85233294Sstas return (tx->tx_anyobj); 86233294Sstas} 8755682Smarkm 8878527Sassarint 8955682Smarkmdmu_tx_private_ok(dmu_tx_t *tx) 9055682Smarkm{ 9155682Smarkm return (tx->tx_anyobj); 9255682Smarkm} 9355682Smarkm 9455682Smarkmstatic dmu_tx_hold_t * 9555682Smarkmdmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, 96127808Snectar enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) 9755682Smarkm{ 9855682Smarkm dmu_tx_hold_t *txh; 99127808Snectar dnode_t *dn = NULL; 10055682Smarkm int err; 10155682Smarkm 10255682Smarkm if (object != DMU_NEW_OBJECT) { 103127808Snectar err = dnode_hold(os->os, object, tx, &dn); 10455682Smarkm if (err) { 10555682Smarkm tx->tx_err = err; 10655682Smarkm return (NULL); 10755682Smarkm } 10855682Smarkm 109178825Sdfr if (err == 0 && tx->tx_txg != 0) { 110233294Sstas mutex_enter(&dn->dn_mtx); 111233294Sstas /* 112233294Sstas * dn->dn_assigned_txg == tx->tx_txg doesn't pose a 113233294Sstas * problem, but there's no way for it to happen (for 114233294Sstas * now, at least). 115233294Sstas */ 116233294Sstas ASSERT(dn->dn_assigned_txg == 0); 117233294Sstas dn->dn_assigned_txg = tx->tx_txg; 118233294Sstas (void) refcount_add(&dn->dn_tx_holds, tx); 119233294Sstas mutex_exit(&dn->dn_mtx); 120233294Sstas } 121233294Sstas } 122233294Sstas 123233294Sstas txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); 124178825Sdfr txh->txh_tx = tx; 125178825Sdfr txh->txh_dnode = dn; 126178825Sdfr#ifdef ZFS_DEBUG 127178825Sdfr txh->txh_type = type; 128178825Sdfr txh->txh_arg1 = arg1; 129178825Sdfr txh->txh_arg2 = arg2; 130178825Sdfr#endif 131233294Sstas list_insert_tail(&tx->tx_holds, txh); 132233294Sstas 133233294Sstas return (txh); 134233294Sstas} 135233294Sstas 136233294Sstasvoid 137233294Sstasdmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) 138233294Sstas{ 139233294Sstas /* 140233294Sstas * If we're syncing, they can manipulate any object anyhow, and 141233294Sstas * the hold on the dnode_t can cause problems. 142233294Sstas */ 143233294Sstas if (!dmu_tx_is_syncing(tx)) { 144233294Sstas (void) dmu_tx_hold_object_impl(tx, os, 145178825Sdfr object, THT_NEWOBJECT, 0, 0); 146178825Sdfr } 147178825Sdfr} 148178825Sdfr 149178825Sdfrstatic int 150178825Sdfrdmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) 151178825Sdfr{ 152233294Sstas int err; 153233294Sstas dmu_buf_impl_t *db; 154233294Sstas 155233294Sstas rw_enter(&dn->dn_struct_rwlock, RW_READER); 156233294Sstas db = dbuf_hold_level(dn, level, blkid, FTAG); 157233294Sstas rw_exit(&dn->dn_struct_rwlock); 158233294Sstas if (db == NULL) 159233294Sstas return (EIO); 160233294Sstas err = dbuf_read(db, zio, DB_RF_CANFAIL); 161233294Sstas dbuf_rele(db, FTAG); 162233294Sstas return (err); 163233294Sstas} 164178825Sdfr 165178825Sdfr/* ARGSUSED */ 166178825Sdfrstatic void 167178825Sdfrdmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 168178825Sdfr{ 169178825Sdfr dnode_t *dn = txh->txh_dnode; 170233294Sstas uint64_t start, end, i; 171233294Sstas int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; 172233294Sstas int err = 0; 173233294Sstas 174233294Sstas if (len == 0) 175233294Sstas return; 176233294Sstas 177233294Sstas min_bs = SPA_MINBLOCKSHIFT; 178233294Sstas max_bs = SPA_MAXBLOCKSHIFT; 179233294Sstas min_ibs = DN_MIN_INDBLKSHIFT; 180233294Sstas max_ibs = DN_MAX_INDBLKSHIFT; 181233294Sstas 182233294Sstas 183233294Sstas /* 184233294Sstas * For i/o error checking, read the first and last level-0 185233294Sstas * blocks (if they are not aligned), and all the level-1 blocks. 186233294Sstas */ 187178825Sdfr 188178825Sdfr if (dn) { 189233294Sstas if (dn->dn_maxblkid == 0) { 190178825Sdfr err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 191178825Sdfr if (err) 192178825Sdfr goto out; 193178825Sdfr } else { 194178825Sdfr zio_t *zio = zio_root(dn->dn_objset->os_spa, 195178825Sdfr NULL, NULL, ZIO_FLAG_CANFAIL); 196178825Sdfr 197178825Sdfr /* first level-0 block */ 198233294Sstas start = off >> dn->dn_datablkshift; 199178825Sdfr if (P2PHASE(off, dn->dn_datablksz) || 200178825Sdfr len < dn->dn_datablksz) { 201178825Sdfr err = dmu_tx_check_ioerr(zio, dn, 0, start); 202233294Sstas if (err) 203233294Sstas goto out; 204233294Sstas } 205233294Sstas 206178825Sdfr /* last level-0 block */ 207178825Sdfr end = (off+len-1) >> dn->dn_datablkshift; 208178825Sdfr if (end != start && 209178825Sdfr P2PHASE(off+len, dn->dn_datablksz)) { 210178825Sdfr err = dmu_tx_check_ioerr(zio, dn, 0, end); 211178825Sdfr if (err) 212178825Sdfr goto out; 213178825Sdfr } 214178825Sdfr 215178825Sdfr /* level-1 blocks */ 216178825Sdfr if (dn->dn_nlevels > 1) { 217178825Sdfr start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; 218233294Sstas end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; 219233294Sstas for (i = start+1; i < end; i++) { 220178825Sdfr err = dmu_tx_check_ioerr(zio, dn, 1, i); 221178825Sdfr if (err) 222178825Sdfr goto out; 223178825Sdfr } 224178825Sdfr } 225178825Sdfr 226178825Sdfr err = zio_wait(zio); 227178825Sdfr if (err) 228178825Sdfr goto out; 229178825Sdfr } 230178825Sdfr } 231178825Sdfr 232178825Sdfr /* 233233294Sstas * If there's more than one block, the blocksize can't change, 234233294Sstas * so we can make a more precise estimate. Alternatively, 235233294Sstas * if the dnode's ibs is larger than max_ibs, always use that. 236233294Sstas * This ensures that if we reduce DN_MAX_INDBLKSHIFT, 237178825Sdfr * the code will still work correctly on existing pools. 238178825Sdfr */ 239178825Sdfr if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { 240178825Sdfr min_ibs = max_ibs = dn->dn_indblkshift; 241178825Sdfr if (dn->dn_datablkshift != 0) 242178825Sdfr min_bs = max_bs = dn->dn_datablkshift; 243178825Sdfr } 244178825Sdfr 245178825Sdfr /* 246178825Sdfr * 'end' is the last thing we will access, not one past. 247178825Sdfr * This way we won't overflow when accessing the last byte. 248178825Sdfr */ 249178825Sdfr start = P2ALIGN(off, 1ULL << max_bs); 250178825Sdfr end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; 251178825Sdfr txh->txh_space_towrite += end - start + 1; 252178825Sdfr 253178825Sdfr start >>= min_bs; 254178825Sdfr end >>= min_bs; 255233294Sstas 256233294Sstas epbs = min_ibs - SPA_BLKPTRSHIFT; 257233294Sstas 258233294Sstas /* 259178825Sdfr * The object contains at most 2^(64 - min_bs) blocks, 260178825Sdfr * and each indirect level maps 2^epbs. 261178825Sdfr */ 262178825Sdfr for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { 263178825Sdfr start >>= epbs; 264178825Sdfr end >>= epbs; 265178825Sdfr /* 266233294Sstas * If we increase the number of levels of indirection, 267178825Sdfr * we'll need new blkid=0 indirect blocks. If start == 0, 268178825Sdfr * we're already accounting for that blocks; and if end == 0, 269178825Sdfr * we can't increase the number of levels beyond that. 270233294Sstas */ 271178825Sdfr if (start != 0 && end != 0) 272178825Sdfr txh->txh_space_towrite += 1ULL << max_ibs; 273178825Sdfr txh->txh_space_towrite += (end - start + 1) << max_ibs; 274178825Sdfr } 275178825Sdfr 276178825Sdfr ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); 277178825Sdfr 278178825Sdfrout: 279178825Sdfr if (err) 280178825Sdfr txh->txh_tx->tx_err = err; 281178825Sdfr} 282178825Sdfr 283178825Sdfrstatic void 284233294Sstasdmu_tx_count_dnode(dmu_tx_hold_t *txh) 285178825Sdfr{ 286178825Sdfr dnode_t *dn = txh->txh_dnode; 287178825Sdfr dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; 288178825Sdfr uint64_t space = mdn->dn_datablksz + 289178825Sdfr ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); 290178825Sdfr 291178825Sdfr if (dn && dn->dn_dbuf->db_blkptr && 292178825Sdfr dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 293178825Sdfr dn->dn_dbuf->db_blkptr->blk_birth)) { 294178825Sdfr txh->txh_space_tooverwrite += space; 295178825Sdfr } else { 296178825Sdfr txh->txh_space_towrite += space; 297178825Sdfr } 298178825Sdfr} 299178825Sdfr 300178825Sdfrvoid 301178825Sdfrdmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) 302233294Sstas{ 303233294Sstas dmu_tx_hold_t *txh; 304233294Sstas 305233294Sstas ASSERT(tx->tx_txg == 0); 306178825Sdfr ASSERT(len < DMU_MAX_ACCESS); 307178825Sdfr ASSERT(len == 0 || UINT64_MAX - off >= len - 1); 308178825Sdfr 309178825Sdfr txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 310178825Sdfr object, THT_WRITE, off, len); 311233294Sstas if (txh == NULL) 312233294Sstas return; 313233294Sstas 314233294Sstas dmu_tx_count_write(txh, off, len); 315178825Sdfr dmu_tx_count_dnode(txh); 316178825Sdfr} 317178825Sdfr 318178825Sdfrstatic void 319178825Sdfrdmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 320178825Sdfr{ 321178825Sdfr uint64_t blkid, nblks; 322178825Sdfr uint64_t space = 0; 323178825Sdfr dnode_t *dn = txh->txh_dnode; 324178825Sdfr dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 325178825Sdfr spa_t *spa = txh->txh_tx->tx_pool->dp_spa; 326178825Sdfr int dirty; 327178825Sdfr 328233294Sstas /* 329233294Sstas * We don't need to use any locking to check for dirtyness 330233294Sstas * because it's OK if we get stale data -- the dnode may become 331233294Sstas * dirty immediately after our check anyway. This is just a 332233294Sstas * means to avoid the expensive count when we aren't sure we 333233294Sstas * need it. We need to be able to deal with a dirty dnode. 334233294Sstas */ 335233294Sstas dirty = list_link_active(&dn->dn_dirty_link[0]) | 336233294Sstas list_link_active(&dn->dn_dirty_link[1]) | 337233294Sstas list_link_active(&dn->dn_dirty_link[2]) | 338233294Sstas list_link_active(&dn->dn_dirty_link[3]); 339178825Sdfr if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) 340178825Sdfr return; 341233294Sstas 342178825Sdfr /* 343178825Sdfr * the struct_rwlock protects us against dn_phys->dn_nlevels 344178825Sdfr * changing, in case (against all odds) we manage to dirty & 345178825Sdfr * sync out the changes after we check for being dirty. 346178825Sdfr * also, dbuf_hold_impl() wants us to have the struct_rwlock. 347178825Sdfr * 348178825Sdfr * It's fine to use dn_datablkshift rather than the dn_phys 349178825Sdfr * equivalent because if it is changing, maxblkid==0 and we will 350178825Sdfr * bail. 351178825Sdfr */ 352178825Sdfr rw_enter(&dn->dn_struct_rwlock, RW_READER); 353178825Sdfr if (dn->dn_phys->dn_maxblkid == 0) { 354178825Sdfr if (off == 0 && len >= dn->dn_datablksz) { 355233294Sstas blkid = 0; 356233294Sstas nblks = 1; 357178825Sdfr } else { 358178825Sdfr rw_exit(&dn->dn_struct_rwlock); 359178825Sdfr return; 360178825Sdfr } 361178825Sdfr } else { 362178825Sdfr blkid = off >> dn->dn_datablkshift; 363178825Sdfr nblks = (off + len) >> dn->dn_datablkshift; 364178825Sdfr 365233294Sstas if (blkid >= dn->dn_phys->dn_maxblkid) { 366233294Sstas rw_exit(&dn->dn_struct_rwlock); 367233294Sstas return; 368233294Sstas } 369178825Sdfr if (blkid + nblks > dn->dn_phys->dn_maxblkid) 370178825Sdfr nblks = dn->dn_phys->dn_maxblkid - blkid; 371178825Sdfr 372178825Sdfr /* don't bother after 128,000 blocks */ 373233294Sstas nblks = MIN(nblks, 128*1024); 374233294Sstas } 375233294Sstas 376233294Sstas if (dn->dn_phys->dn_nlevels == 1) { 377233294Sstas int i; 378233294Sstas for (i = 0; i < nblks; i++) { 379233294Sstas blkptr_t *bp = dn->dn_phys->dn_blkptr; 380233294Sstas ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); 381233294Sstas bp += blkid + i; 382233294Sstas if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { 383233294Sstas dprintf_bp(bp, "can free old%s", ""); 384233294Sstas space += bp_get_dasize(spa, bp); 385233294Sstas } 386233294Sstas } 387233294Sstas nblks = 0; 388233294Sstas } 389233294Sstas 390233294Sstas while (nblks) { 391233294Sstas dmu_buf_impl_t *dbuf; 392233294Sstas int err, epbs, blkoff, tochk; 393233294Sstas 394233294Sstas epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 395233294Sstas blkoff = P2PHASE(blkid, 1<<epbs); 396233294Sstas tochk = MIN((1<<epbs) - blkoff, nblks); 397233294Sstas 398233294Sstas err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); 399233294Sstas if (err == 0) { 400233294Sstas int i; 401233294Sstas blkptr_t *bp; 402233294Sstas 403233294Sstas err = dbuf_read(dbuf, NULL, 404233294Sstas DB_RF_HAVESTRUCT | DB_RF_CANFAIL); 405233294Sstas if (err != 0) { 406233294Sstas txh->txh_tx->tx_err = err; 407233294Sstas dbuf_rele(dbuf, FTAG); 408233294Sstas break; 409233294Sstas } 410233294Sstas 411233294Sstas bp = dbuf->db.db_data; 412233294Sstas bp += blkoff; 413233294Sstas 414233294Sstas for (i = 0; i < tochk; i++) { 415233294Sstas if (dsl_dataset_block_freeable(ds, 416233294Sstas bp[i].blk_birth)) { 417233294Sstas dprintf_bp(&bp[i], 418233294Sstas "can free old%s", ""); 419233294Sstas space += bp_get_dasize(spa, &bp[i]); 420233294Sstas } 421233294Sstas } 422233294Sstas dbuf_rele(dbuf, FTAG); 423233294Sstas } 424233294Sstas if (err && err != ENOENT) { 425233294Sstas txh->txh_tx->tx_err = err; 426233294Sstas break; 427233294Sstas } 428233294Sstas 429233294Sstas blkid += tochk; 430233294Sstas nblks -= tochk; 431233294Sstas } 432233294Sstas rw_exit(&dn->dn_struct_rwlock); 433233294Sstas 434233294Sstas txh->txh_space_tofree += space; 435233294Sstas} 436233294Sstas 437233294Sstasvoid 438233294Sstasdmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) 439233294Sstas{ 440233294Sstas dmu_tx_hold_t *txh; 441233294Sstas dnode_t *dn; 442233294Sstas uint64_t start, end, i; 443233294Sstas int err, shift; 444233294Sstas zio_t *zio; 445233294Sstas 446233294Sstas ASSERT(tx->tx_txg == 0); 447233294Sstas 448233294Sstas txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 449233294Sstas object, THT_FREE, off, len); 450233294Sstas if (txh == NULL) 451233294Sstas return; 452233294Sstas dn = txh->txh_dnode; 453233294Sstas 454233294Sstas /* first block */ 455233294Sstas if (off != 0) 456233294Sstas dmu_tx_count_write(txh, off, 1); 457233294Sstas /* last block */ 458233294Sstas if (len != DMU_OBJECT_END) 459233294Sstas dmu_tx_count_write(txh, off+len, 1); 460233294Sstas 461233294Sstas if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) 462233294Sstas return; 463233294Sstas if (len == DMU_OBJECT_END) 464233294Sstas len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; 465233294Sstas 466233294Sstas /* 467233294Sstas * For i/o error checking, read the first and last level-0 468233294Sstas * blocks, and all the level-1 blocks. The above count_write's 469233294Sstas * will take care of the level-0 blocks. 470233294Sstas */ 471233294Sstas if (dn->dn_nlevels > 1) { 472233294Sstas shift = dn->dn_datablkshift + dn->dn_indblkshift - 473233294Sstas SPA_BLKPTRSHIFT; 474233294Sstas start = off >> shift; 475233294Sstas end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; 476233294Sstas 477233294Sstas zio = zio_root(tx->tx_pool->dp_spa, 478233294Sstas NULL, NULL, ZIO_FLAG_CANFAIL); 479233294Sstas for (i = start; i <= end; i++) { 480233294Sstas uint64_t ibyte = i << shift; 481233294Sstas err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); 482233294Sstas i = ibyte >> shift; 483233294Sstas if (err == ESRCH) 484233294Sstas break; 485233294Sstas if (err) { 486233294Sstas tx->tx_err = err; 487233294Sstas return; 488233294Sstas } 489233294Sstas 490233294Sstas err = dmu_tx_check_ioerr(zio, dn, 1, i); 491233294Sstas if (err) { 492233294Sstas tx->tx_err = err; 493233294Sstas return; 494233294Sstas } 495233294Sstas } 496233294Sstas err = zio_wait(zio); 497233294Sstas if (err) { 498233294Sstas tx->tx_err = err; 499233294Sstas return; 500233294Sstas } 501233294Sstas } 502233294Sstas 503233294Sstas dmu_tx_count_dnode(txh); 504233294Sstas dmu_tx_count_free(txh, off, len); 505233294Sstas} 506233294Sstas 507233294Sstasvoid 508233294Sstasdmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) 509233294Sstas{ 510233294Sstas dmu_tx_hold_t *txh; 511233294Sstas dnode_t *dn; 512233294Sstas uint64_t nblocks; 513233294Sstas int epbs, err; 514233294Sstas 515233294Sstas ASSERT(tx->tx_txg == 0); 516233294Sstas 517233294Sstas txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 518233294Sstas object, THT_ZAP, add, (uintptr_t)name); 519233294Sstas if (txh == NULL) 520233294Sstas return; 521233294Sstas dn = txh->txh_dnode; 522233294Sstas 523233294Sstas dmu_tx_count_dnode(txh); 524233294Sstas 525233294Sstas if (dn == NULL) { 526233294Sstas /* 527233294Sstas * We will be able to fit a new object's entries into one leaf 528233294Sstas * block. So there will be at most 2 blocks total, 529233294Sstas * including the header block. 530233294Sstas */ 531233294Sstas dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); 532233294Sstas return; 533233294Sstas } 534233294Sstas 535233294Sstas ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); 536233294Sstas 537233294Sstas if (dn->dn_maxblkid == 0 && !add) { 538233294Sstas /* 539233294Sstas * If there is only one block (i.e. this is a micro-zap) 540233294Sstas * and we are not adding anything, the accounting is simple. 541233294Sstas */ 542233294Sstas err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 543233294Sstas if (err) { 544233294Sstas tx->tx_err = err; 545233294Sstas return; 546233294Sstas } 547233294Sstas 548233294Sstas /* 549233294Sstas * Use max block size here, since we don't know how much 550233294Sstas * the size will change between now and the dbuf dirty call. 551233294Sstas */ 552233294Sstas if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 553233294Sstas dn->dn_phys->dn_blkptr[0].blk_birth)) 554233294Sstas txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; 555233294Sstas else 556233294Sstas txh->txh_space_towrite += SPA_MAXBLOCKSIZE; 557233294Sstas return; 558233294Sstas } 559233294Sstas 560233294Sstas if (dn->dn_maxblkid > 0 && name) { 561233294Sstas /* 562233294Sstas * access the name in this fat-zap so that we'll check 563233294Sstas * for i/o errors to the leaf blocks, etc. 564233294Sstas */ 565233294Sstas err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, 566233294Sstas 8, 0, NULL); 567233294Sstas if (err == EIO) { 568233294Sstas tx->tx_err = err; 569233294Sstas return; 570233294Sstas } 571233294Sstas } 572233294Sstas 573233294Sstas /* 574233294Sstas * 3 blocks overwritten: target leaf, ptrtbl block, header block 575233294Sstas * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks 576233294Sstas */ 577233294Sstas dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, 578233294Sstas (3 + add ? 3 : 0) << dn->dn_datablkshift); 579233294Sstas 580233294Sstas /* 581233294Sstas * If the modified blocks are scattered to the four winds, 582233294Sstas * we'll have to modify an indirect twig for each. 583233294Sstas */ 584233294Sstas epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 585233294Sstas for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) 586233294Sstas txh->txh_space_towrite += 3 << dn->dn_indblkshift; 587233294Sstas} 588233294Sstas 589233294Sstasvoid 590233294Sstasdmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 591233294Sstas{ 592233294Sstas dmu_tx_hold_t *txh; 593233294Sstas 594233294Sstas ASSERT(tx->tx_txg == 0); 595233294Sstas 596233294Sstas txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 597233294Sstas object, THT_BONUS, 0, 0); 598233294Sstas if (txh) 599233294Sstas dmu_tx_count_dnode(txh); 600233294Sstas} 601233294Sstas 602233294Sstasvoid 603233294Sstasdmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 604233294Sstas{ 605233294Sstas dmu_tx_hold_t *txh; 606233294Sstas ASSERT(tx->tx_txg == 0); 607233294Sstas 608233294Sstas txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 609233294Sstas DMU_NEW_OBJECT, THT_SPACE, space, 0); 610233294Sstas 611233294Sstas txh->txh_space_towrite += space; 612233294Sstas} 613233294Sstas 614233294Sstasint 615233294Sstasdmu_tx_holds(dmu_tx_t *tx, uint64_t object) 616233294Sstas{ 617233294Sstas dmu_tx_hold_t *txh; 618233294Sstas int holds = 0; 619233294Sstas 620233294Sstas /* 621233294Sstas * By asserting that the tx is assigned, we're counting the 622233294Sstas * number of dn_tx_holds, which is the same as the number of 623233294Sstas * dn_holds. Otherwise, we'd be counting dn_holds, but 624233294Sstas * dn_tx_holds could be 0. 625233294Sstas */ 626233294Sstas ASSERT(tx->tx_txg != 0); 627233294Sstas 628233294Sstas /* if (tx->tx_anyobj == TRUE) */ 629233294Sstas /* return (0); */ 630233294Sstas 631233294Sstas for (txh = list_head(&tx->tx_holds); txh; 632233294Sstas txh = list_next(&tx->tx_holds, txh)) { 633233294Sstas if (txh->txh_dnode && txh->txh_dnode->dn_object == object) 634233294Sstas holds++; 635233294Sstas } 636233294Sstas 637233294Sstas return (holds); 638233294Sstas} 639233294Sstas 640233294Sstas#ifdef ZFS_DEBUG 641233294Sstasvoid 642233294Sstasdmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) 643233294Sstas{ 644233294Sstas dmu_tx_hold_t *txh; 645233294Sstas int match_object = FALSE, match_offset = FALSE; 646233294Sstas dnode_t *dn = db->db_dnode; 647233294Sstas 648233294Sstas ASSERT(tx->tx_txg != 0); 649233294Sstas ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); 650233294Sstas ASSERT3U(dn->dn_object, ==, db->db.db_object); 651233294Sstas 652233294Sstas if (tx->tx_anyobj) 653233294Sstas return; 654233294Sstas 655233294Sstas /* XXX No checking on the meta dnode for now */ 656233294Sstas if (db->db.db_object == DMU_META_DNODE_OBJECT) 657233294Sstas return; 658233294Sstas 659233294Sstas for (txh = list_head(&tx->tx_holds); txh; 660233294Sstas txh = list_next(&tx->tx_holds, txh)) { 661233294Sstas ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); 662233294Sstas if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) 663233294Sstas match_object = TRUE; 664233294Sstas if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { 665233294Sstas int datablkshift = dn->dn_datablkshift ? 666233294Sstas dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; 667233294Sstas int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 668233294Sstas int shift = datablkshift + epbs * db->db_level; 669233294Sstas uint64_t beginblk = shift >= 64 ? 0 : 670233294Sstas (txh->txh_arg1 >> shift); 671233294Sstas uint64_t endblk = shift >= 64 ? 0 : 672233294Sstas ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); 673233294Sstas uint64_t blkid = db->db_blkid; 674233294Sstas 675233294Sstas /* XXX txh_arg2 better not be zero... */ 676233294Sstas 677233294Sstas dprintf("found txh type %x beginblk=%llx endblk=%llx\n", 678233294Sstas txh->txh_type, beginblk, endblk); 679233294Sstas 680233294Sstas switch (txh->txh_type) { 681233294Sstas case THT_WRITE: 682233294Sstas if (blkid >= beginblk && blkid <= endblk) 683233294Sstas match_offset = TRUE; 684233294Sstas /* 685233294Sstas * We will let this hold work for the bonus 686233294Sstas * buffer so that we don't need to hold it 687233294Sstas * when creating a new object. 688233294Sstas */ 689233294Sstas if (blkid == DB_BONUS_BLKID) 690233294Sstas match_offset = TRUE; 691233294Sstas /* 692233294Sstas * They might have to increase nlevels, 693233294Sstas * thus dirtying the new TLIBs. Or the 694233294Sstas * might have to change the block size, 695233294Sstas * thus dirying the new lvl=0 blk=0. 696233294Sstas */ 697233294Sstas if (blkid == 0) 698233294Sstas match_offset = TRUE; 699233294Sstas break; 700233294Sstas case THT_FREE: 701233294Sstas if (blkid == beginblk && 702233294Sstas (txh->txh_arg1 != 0 || 703233294Sstas dn->dn_maxblkid == 0)) 704233294Sstas match_offset = TRUE; 705233294Sstas if (blkid == endblk && 706233294Sstas txh->txh_arg2 != DMU_OBJECT_END) 707233294Sstas match_offset = TRUE; 708233294Sstas break; 709233294Sstas case THT_BONUS: 710233294Sstas if (blkid == DB_BONUS_BLKID) 711233294Sstas match_offset = TRUE; 712233294Sstas break; 713233294Sstas case THT_ZAP: 714233294Sstas match_offset = TRUE; 715233294Sstas break; 716320907Sdelphij case THT_NEWOBJECT: 717320907Sdelphij match_object = TRUE; 718233294Sstas break; 719233294Sstas default: 720233294Sstas ASSERT(!"bad txh_type"); 721233294Sstas } 722233294Sstas } 723233294Sstas if (match_object && match_offset) 724233294Sstas return; 725233294Sstas } 726233294Sstas panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", 727233294Sstas (u_longlong_t)db->db.db_object, db->db_level, 728233294Sstas (u_longlong_t)db->db_blkid); 729233294Sstas} 730233294Sstas#endif 731233294Sstas 732233294Sstasstatic int 733233294Sstasdmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) 734233294Sstas{ 735233294Sstas dmu_tx_hold_t *txh; 736233294Sstas uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; 737233294Sstas 738233294Sstas ASSERT3U(tx->tx_txg, ==, 0); 739233294Sstas if (tx->tx_err) 740233294Sstas return (tx->tx_err); 741233294Sstas 742233294Sstas tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); 743233294Sstas tx->tx_needassign_txh = NULL; 744233294Sstas 745233294Sstas /* 746233294Sstas * NB: No error returns are allowed after txg_hold_open, but 747233294Sstas * before processing the dnode holds, due to the 748233294Sstas * dmu_tx_unassign() logic. 749233294Sstas */ 750233294Sstas 751233294Sstas towrite = tofree = tooverwrite = 0; 752233294Sstas for (txh = list_head(&tx->tx_holds); txh; 753233294Sstas txh = list_next(&tx->tx_holds, txh)) { 754233294Sstas dnode_t *dn = txh->txh_dnode; 755233294Sstas if (dn != NULL) { 756233294Sstas mutex_enter(&dn->dn_mtx); 757233294Sstas if (dn->dn_assigned_txg == tx->tx_txg - 1) { 758233294Sstas mutex_exit(&dn->dn_mtx); 759233294Sstas tx->tx_needassign_txh = txh; 760233294Sstas return (ERESTART); 761233294Sstas } 762233294Sstas if (dn->dn_assigned_txg == 0) 763233294Sstas dn->dn_assigned_txg = tx->tx_txg; 764233294Sstas ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 765233294Sstas (void) refcount_add(&dn->dn_tx_holds, tx); 766233294Sstas mutex_exit(&dn->dn_mtx); 767233294Sstas } 768233294Sstas towrite += txh->txh_space_towrite; 769233294Sstas tofree += txh->txh_space_tofree; 770233294Sstas tooverwrite += txh->txh_space_tooverwrite; 771233294Sstas } 772233294Sstas 773233294Sstas /* 774233294Sstas * NB: This check must be after we've held the dnodes, so that 775233294Sstas * the dmu_tx_unassign() logic will work properly 776233294Sstas */ 777233294Sstas if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) 778233294Sstas return (ERESTART); 779233294Sstas 780233294Sstas /* 781233294Sstas * If a snapshot has been taken since we made our estimates, 782233294Sstas * assume that we won't be able to free or overwrite anything. 783233294Sstas */ 784233294Sstas if (tx->tx_objset && 785233294Sstas dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > 786233294Sstas tx->tx_lastsnap_txg) { 787233294Sstas towrite += tooverwrite; 788233294Sstas tooverwrite = tofree = 0; 789233294Sstas } 790233294Sstas 791233294Sstas /* 792233294Sstas * Convert logical size to worst-case allocated size. 793233294Sstas */ 794233294Sstas fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; 795233294Sstas lsize = towrite + tooverwrite; 796233294Sstas asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); 797233294Sstas 798233294Sstas#ifdef ZFS_DEBUG 799233294Sstas tx->tx_space_towrite = asize; 800233294Sstas tx->tx_space_tofree = tofree; 801233294Sstas tx->tx_space_tooverwrite = tooverwrite; 802233294Sstas#endif 803233294Sstas 804233294Sstas if (tx->tx_dir && asize != 0) { 805233294Sstas int err = dsl_dir_tempreserve_space(tx->tx_dir, 806233294Sstas lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); 807233294Sstas if (err) 808233294Sstas return (err); 809233294Sstas } 810233294Sstas 811233294Sstas return (0); 812233294Sstas} 813233294Sstas 814233294Sstasstatic void 815233294Sstasdmu_tx_unassign(dmu_tx_t *tx) 816233294Sstas{ 817233294Sstas dmu_tx_hold_t *txh; 818233294Sstas 819233294Sstas if (tx->tx_txg == 0) 820233294Sstas return; 821233294Sstas 822233294Sstas txg_rele_to_quiesce(&tx->tx_txgh); 823233294Sstas 824233294Sstas for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; 825233294Sstas txh = list_next(&tx->tx_holds, txh)) { 826233294Sstas dnode_t *dn = txh->txh_dnode; 827233294Sstas 828233294Sstas if (dn == NULL) 829233294Sstas continue; 830233294Sstas mutex_enter(&dn->dn_mtx); 831233294Sstas ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 832233294Sstas 833233294Sstas if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 834233294Sstas dn->dn_assigned_txg = 0; 835233294Sstas cv_broadcast(&dn->dn_notxholds); 836233294Sstas } 837233294Sstas mutex_exit(&dn->dn_mtx); 838233294Sstas } 839233294Sstas 840233294Sstas txg_rele_to_sync(&tx->tx_txgh); 841233294Sstas 842233294Sstas tx->tx_lasttried_txg = tx->tx_txg; 843233294Sstas tx->tx_txg = 0; 844233294Sstas} 845233294Sstas 846233294Sstas/* 847233294Sstas * Assign tx to a transaction group. txg_how can be one of: 848233294Sstas * 849233294Sstas * (1) TXG_WAIT. If the current open txg is full, waits until there's 850233294Sstas * a new one. This should be used when you're not holding locks. 851 * If will only fail if we're truly out of space (or over quota). 852 * 853 * (2) TXG_NOWAIT. If we can't assign into the current open txg without 854 * blocking, returns immediately with ERESTART. This should be used 855 * whenever you're holding locks. On an ERESTART error, the caller 856 * should drop locks, do a dmu_tx_wait(tx), and try again. 857 * 858 * (3) A specific txg. Use this if you need to ensure that multiple 859 * transactions all sync in the same txg. Like TXG_NOWAIT, it 860 * returns ERESTART if it can't assign you into the requested txg. 861 */ 862int 863dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) 864{ 865 int err; 866 867 ASSERT(tx->tx_txg == 0); 868 ASSERT(txg_how != 0); 869 ASSERT(!dsl_pool_sync_context(tx->tx_pool)); 870 871 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { 872 dmu_tx_unassign(tx); 873 874 if (err != ERESTART || txg_how != TXG_WAIT) 875 return (err); 876 877 dmu_tx_wait(tx); 878 } 879 880 txg_rele_to_quiesce(&tx->tx_txgh); 881 882 return (0); 883} 884 885void 886dmu_tx_wait(dmu_tx_t *tx) 887{ 888 ASSERT(tx->tx_txg == 0); 889 ASSERT(tx->tx_lasttried_txg != 0); 890 891 if (tx->tx_needassign_txh) { 892 dnode_t *dn = tx->tx_needassign_txh->txh_dnode; 893 894 mutex_enter(&dn->dn_mtx); 895 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) 896 cv_wait(&dn->dn_notxholds, &dn->dn_mtx); 897 mutex_exit(&dn->dn_mtx); 898 tx->tx_needassign_txh = NULL; 899 } else { 900 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); 901 } 902} 903 904void 905dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) 906{ 907#ifdef ZFS_DEBUG 908 if (tx->tx_dir == NULL || delta == 0) 909 return; 910 911 if (delta > 0) { 912 ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, 913 tx->tx_space_towrite); 914 (void) refcount_add_many(&tx->tx_space_written, delta, NULL); 915 } else { 916 (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); 917 } 918#endif 919} 920 921void 922dmu_tx_commit(dmu_tx_t *tx) 923{ 924 dmu_tx_hold_t *txh; 925 926 ASSERT(tx->tx_txg != 0); 927 928 while (txh = list_head(&tx->tx_holds)) { 929 dnode_t *dn = txh->txh_dnode; 930 931 list_remove(&tx->tx_holds, txh); 932 kmem_free(txh, sizeof (dmu_tx_hold_t)); 933 if (dn == NULL) 934 continue; 935 mutex_enter(&dn->dn_mtx); 936 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 937 938 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 939 dn->dn_assigned_txg = 0; 940 cv_broadcast(&dn->dn_notxholds); 941 } 942 mutex_exit(&dn->dn_mtx); 943 dnode_rele(dn, tx); 944 } 945 946 if (tx->tx_tempreserve_cookie) 947 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); 948 949 if (tx->tx_anyobj == FALSE) 950 txg_rele_to_sync(&tx->tx_txgh); 951#ifdef ZFS_DEBUG 952 dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", 953 tx->tx_space_towrite, refcount_count(&tx->tx_space_written), 954 tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); 955 refcount_destroy_many(&tx->tx_space_written, 956 refcount_count(&tx->tx_space_written)); 957 refcount_destroy_many(&tx->tx_space_freed, 958 refcount_count(&tx->tx_space_freed)); 959#endif 960 kmem_free(tx, sizeof (dmu_tx_t)); 961} 962 963void 964dmu_tx_abort(dmu_tx_t *tx) 965{ 966 dmu_tx_hold_t *txh; 967 968 ASSERT(tx->tx_txg == 0); 969 970 while (txh = list_head(&tx->tx_holds)) { 971 dnode_t *dn = txh->txh_dnode; 972 973 list_remove(&tx->tx_holds, txh); 974 kmem_free(txh, sizeof (dmu_tx_hold_t)); 975 if (dn != NULL) 976 dnode_rele(dn, tx); 977 } 978#ifdef ZFS_DEBUG 979 refcount_destroy_many(&tx->tx_space_written, 980 refcount_count(&tx->tx_space_written)); 981 refcount_destroy_many(&tx->tx_space_freed, 982 refcount_count(&tx->tx_space_freed)); 983#endif 984 kmem_free(tx, sizeof (dmu_tx_t)); 985} 986 987uint64_t 988dmu_tx_get_txg(dmu_tx_t *tx) 989{ 990 ASSERT(tx->tx_txg != 0); 991 return (tx->tx_txg); 992} 993