dmu_tx.c revision 321549
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23226512Smm * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24307290Smav * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
25296519Smav * Copyright (c) 2014 Integros [integros.com]
26226512Smm */
27168404Spjd
28168404Spjd#include <sys/dmu.h>
29168404Spjd#include <sys/dmu_impl.h>
30168404Spjd#include <sys/dbuf.h>
31168404Spjd#include <sys/dmu_tx.h>
32168404Spjd#include <sys/dmu_objset.h>
33321547Smav#include <sys/dsl_dataset.h>
34321547Smav#include <sys/dsl_dir.h>
35168404Spjd#include <sys/dsl_pool.h>
36321547Smav#include <sys/zap_impl.h>
37168404Spjd#include <sys/spa.h>
38219089Spjd#include <sys/sa.h>
39219089Spjd#include <sys/sa_impl.h>
40168404Spjd#include <sys/zfs_context.h>
41219089Spjd#include <sys/varargs.h>
42168404Spjd
43168404Spjdtypedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
44168404Spjd    uint64_t arg1, uint64_t arg2);
45168404Spjd
46168404Spjd
47168404Spjddmu_tx_t *
48168404Spjddmu_tx_create_dd(dsl_dir_t *dd)
49168404Spjd{
50168404Spjd	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
51168404Spjd	tx->tx_dir = dd;
52248571Smm	if (dd != NULL)
53168404Spjd		tx->tx_pool = dd->dd_pool;
54168404Spjd	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
55168404Spjd	    offsetof(dmu_tx_hold_t, txh_node));
56219089Spjd	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
57219089Spjd	    offsetof(dmu_tx_callback_t, dcb_node));
58258632Savg	tx->tx_start = gethrtime();
59168404Spjd	return (tx);
60168404Spjd}
61168404Spjd
62168404Spjddmu_tx_t *
63168404Spjddmu_tx_create(objset_t *os)
64168404Spjd{
65219089Spjd	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
66168404Spjd	tx->tx_objset = os;
67168404Spjd	return (tx);
68168404Spjd}
69168404Spjd
70168404Spjddmu_tx_t *
71168404Spjddmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
72168404Spjd{
73168404Spjd	dmu_tx_t *tx = dmu_tx_create_dd(NULL);
74168404Spjd
75168404Spjd	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
76168404Spjd	tx->tx_pool = dp;
77168404Spjd	tx->tx_txg = txg;
78168404Spjd	tx->tx_anyobj = TRUE;
79168404Spjd
80168404Spjd	return (tx);
81168404Spjd}
82168404Spjd
83168404Spjdint
84168404Spjddmu_tx_is_syncing(dmu_tx_t *tx)
85168404Spjd{
86168404Spjd	return (tx->tx_anyobj);
87168404Spjd}
88168404Spjd
89168404Spjdint
90168404Spjddmu_tx_private_ok(dmu_tx_t *tx)
91168404Spjd{
92168404Spjd	return (tx->tx_anyobj);
93168404Spjd}
94168404Spjd
95168404Spjdstatic dmu_tx_hold_t *
96321549Smavdmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
97321549Smav    uint64_t arg1, uint64_t arg2)
98168404Spjd{
99168404Spjd	dmu_tx_hold_t *txh;
100168404Spjd
101321549Smav	if (dn != NULL) {
102321549Smav		(void) refcount_add(&dn->dn_holds, tx);
103321549Smav		if (tx->tx_txg != 0) {
104168404Spjd			mutex_enter(&dn->dn_mtx);
105168404Spjd			/*
106168404Spjd			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
107168404Spjd			 * problem, but there's no way for it to happen (for
108168404Spjd			 * now, at least).
109168404Spjd			 */
110168404Spjd			ASSERT(dn->dn_assigned_txg == 0);
111168404Spjd			dn->dn_assigned_txg = tx->tx_txg;
112168404Spjd			(void) refcount_add(&dn->dn_tx_holds, tx);
113168404Spjd			mutex_exit(&dn->dn_mtx);
114168404Spjd		}
115168404Spjd	}
116168404Spjd
117168404Spjd	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
118168404Spjd	txh->txh_tx = tx;
119168404Spjd	txh->txh_dnode = dn;
120307049Smav	refcount_create(&txh->txh_space_towrite);
121307049Smav	refcount_create(&txh->txh_memory_tohold);
122168404Spjd	txh->txh_type = type;
123168404Spjd	txh->txh_arg1 = arg1;
124168404Spjd	txh->txh_arg2 = arg2;
125168404Spjd	list_insert_tail(&tx->tx_holds, txh);
126168404Spjd
127168404Spjd	return (txh);
128168404Spjd}
129168404Spjd
130321549Smavstatic dmu_tx_hold_t *
131321549Smavdmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
132321549Smav    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
133321549Smav{
134321549Smav	dnode_t *dn = NULL;
135321549Smav	dmu_tx_hold_t *txh;
136321549Smav	int err;
137321549Smav
138321549Smav	if (object != DMU_NEW_OBJECT) {
139321549Smav		err = dnode_hold(os, object, FTAG, &dn);
140321549Smav		if (err != 0) {
141321549Smav			tx->tx_err = err;
142321549Smav			return (NULL);
143321549Smav		}
144321549Smav	}
145321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
146321549Smav	if (dn != NULL)
147321549Smav		dnode_rele(dn, FTAG);
148321549Smav	return (txh);
149321549Smav}
150321549Smav
151168404Spjdvoid
152321549Smavdmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
153168404Spjd{
154168404Spjd	/*
155168404Spjd	 * If we're syncing, they can manipulate any object anyhow, and
156168404Spjd	 * the hold on the dnode_t can cause problems.
157168404Spjd	 */
158321549Smav	if (!dmu_tx_is_syncing(tx))
159321549Smav		(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
160168404Spjd}
161168404Spjd
162321547Smav/*
163321547Smav * This function reads specified data from disk.  The specified data will
164321547Smav * be needed to perform the transaction -- i.e., it will be read after
165321547Smav * we do dmu_tx_assign().  There are two reasons that we read the data now
166321547Smav * (before dmu_tx_assign()):
167321547Smav *
168321547Smav * 1. Reading it now has potentially better performance.  The transaction
169321547Smav * has not yet been assigned, so the TXG is not held open, and also the
170321547Smav * caller typically has fewer locks held when calling dmu_tx_hold_*() than
171321547Smav * after the transaction has been assigned.  This reduces the lock (and txg)
172321547Smav * hold times, thus reducing lock contention.
173321547Smav *
174321547Smav * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
175321547Smav * that are detected before they start making changes to the DMU state
176321547Smav * (i.e. now).  Once the transaction has been assigned, and some DMU
177321547Smav * state has been changed, it can be difficult to recover from an i/o
178321547Smav * error (e.g. to undo the changes already made in memory at the DMU
179321547Smav * layer).  Typically code to do so does not exist in the caller -- it
180321547Smav * assumes that the data has already been cached and thus i/o errors are
181321547Smav * not possible.
182321547Smav *
183321547Smav * It has been observed that the i/o initiated here can be a performance
184321547Smav * problem, and it appears to be optional, because we don't look at the
185321547Smav * data which is read.  However, removing this read would only serve to
186321547Smav * move the work elsewhere (after the dmu_tx_assign()), where it may
187321547Smav * have a greater impact on performance (in addition to the impact on
188321547Smav * fault tolerance noted above).
189321547Smav */
190168404Spjdstatic int
191168404Spjddmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
192168404Spjd{
193168404Spjd	int err;
194168404Spjd	dmu_buf_impl_t *db;
195168404Spjd
196168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
197168404Spjd	db = dbuf_hold_level(dn, level, blkid, FTAG);
198168404Spjd	rw_exit(&dn->dn_struct_rwlock);
199168404Spjd	if (db == NULL)
200249195Smm		return (SET_ERROR(EIO));
201185029Spjd	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
202168404Spjd	dbuf_rele(db, FTAG);
203168404Spjd	return (err);
204168404Spjd}
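
/*
 * Illustrative sketch (not part of this file): the usual flow a DMU
 * consumer follows, showing where the reads issued via dmu_tx_check_ioerr()
 * happen.  The object, offset, length and buffer below are hypothetical.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);   (reads may be issued here)
 *	error = dmu_tx_assign(tx, TXG_WAIT);       (returns any latched tx_err)
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);
 *
 * Any i/o error detected while establishing the holds is stored in
 * tx->tx_err and reported by dmu_tx_assign(), before any DMU state has
 * been modified.
 */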
205168404Spjd
206168404Spjd/* ARGSUSED */
207168404Spjdstatic void
208168404Spjddmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
209168404Spjd{
210168404Spjd	dnode_t *dn = txh->txh_dnode;
211168404Spjd	int err = 0;
212168404Spjd
213168404Spjd	if (len == 0)
214168404Spjd		return;
215168404Spjd
216321547Smav	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);
217168404Spjd
218321547Smav	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
219321547Smav		err = SET_ERROR(EFBIG);
220168404Spjd
221321547Smav	if (dn == NULL)
222321547Smav		return;
223168404Spjd
224321547Smav	/*
225321547Smav	 * For i/o error checking, read the blocks that will be needed
226321547Smav	 * to perform the write: the first and last level-0 blocks (if
227321547Smav	 * they are not aligned, i.e. if they are partial-block writes),
228321547Smav	 * and all the level-1 blocks.
229321547Smav	 */
230321547Smav	if (dn->dn_maxblkid == 0) {
231321547Smav		if (off < dn->dn_datablksz &&
232321547Smav		    (off > 0 || len < dn->dn_datablksz)) {
233321547Smav			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
234321547Smav			if (err != 0) {
235321547Smav				txh->txh_tx->tx_err = err;
236168404Spjd			}
237321547Smav		}
238321547Smav	} else {
239321547Smav		zio_t *zio = zio_root(dn->dn_objset->os_spa,
240321547Smav		    NULL, NULL, ZIO_FLAG_CANFAIL);
241168404Spjd
242321547Smav		/* first level-0 block */
243321547Smav		uint64_t start = off >> dn->dn_datablkshift;
244321547Smav		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
245321547Smav			err = dmu_tx_check_ioerr(zio, dn, 0, start);
246321547Smav			if (err != 0) {
247321547Smav				txh->txh_tx->tx_err = err;
248168404Spjd			}
249168404Spjd		}
250168404Spjd
251321547Smav		/* last level-0 block */
252321547Smav		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
253321547Smav		if (end != start && end <= dn->dn_maxblkid &&
254321547Smav		    P2PHASE(off + len, dn->dn_datablksz)) {
255321547Smav			err = dmu_tx_check_ioerr(zio, dn, 0, end);
256321547Smav			if (err != 0) {
257219089Spjd				txh->txh_tx->tx_err = err;
258209962Smm			}
259321547Smav		}
260219089Spjd
261321547Smav		/* level-1 blocks */
262321547Smav		if (dn->dn_nlevels > 1) {
263321547Smav			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
264321547Smav			for (uint64_t i = (start >> shft) + 1;
265321547Smav			    i < end >> shft; i++) {
266321547Smav				err = dmu_tx_check_ioerr(zio, dn, 1, i);
267321547Smav				if (err != 0) {
268321547Smav					txh->txh_tx->tx_err = err;
269307049Smav				}
270209962Smm			}
271209962Smm		}
272168404Spjd
273321547Smav		err = zio_wait(zio);
274321547Smav		if (err != 0) {
275321547Smav			txh->txh_tx->tx_err = err;
276209962Smm		}
277168404Spjd	}
278168404Spjd}
279168404Spjd
280168404Spjdstatic void
281168404Spjddmu_tx_count_dnode(dmu_tx_hold_t *txh)
282168404Spjd{
283321547Smav	(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
284168404Spjd}
285168404Spjd
286168404Spjdvoid
287168404Spjddmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
288168404Spjd{
289168404Spjd	dmu_tx_hold_t *txh;
290168404Spjd
291321547Smav	ASSERT0(tx->tx_txg);
292321547Smav	ASSERT3U(len, <=, DMU_MAX_ACCESS);
293168404Spjd	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
294168404Spjd
295168404Spjd	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
296168404Spjd	    object, THT_WRITE, off, len);
297321549Smav	if (txh != NULL) {
298321549Smav		dmu_tx_count_write(txh, off, len);
299321549Smav		dmu_tx_count_dnode(txh);
300321549Smav	}
301321549Smav}
302168404Spjd
303321549Smavvoid
304321549Smavdmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
305321549Smav{
306321549Smav	dmu_tx_hold_t *txh;
307321549Smav
308321549Smav	ASSERT0(tx->tx_txg);
309321549Smav	ASSERT3U(len, <=, DMU_MAX_ACCESS);
310321549Smav	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
311321549Smav
312321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
313321549Smav	if (txh != NULL) {
314321549Smav		dmu_tx_count_write(txh, off, len);
315321549Smav		dmu_tx_count_dnode(txh);
316321549Smav	}
317168404Spjd}
318168404Spjd
319268464Sdelphij/*
320268464Sdelphij * This function marks the transaction as being a "net free".  The end
321268464Sdelphij * result is that refquotas will be disabled for this transaction, and
322268464Sdelphij * this transaction will be able to use half of the pool space overhead
323268464Sdelphij * (see dsl_pool_adjustedsize()).  Therefore this function should only
324268464Sdelphij * be called for transactions that we expect will not cause a net increase
325268464Sdelphij * in the amount of space used (but it's OK if that is occasionally not true).
326268464Sdelphij */
327168404Spjdvoid
328268464Sdelphijdmu_tx_mark_netfree(dmu_tx_t *tx)
329268464Sdelphij{
330321547Smav	tx->tx_netfree = B_TRUE;
331268464Sdelphij}
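
/*
 * Illustrative sketch (hypothetical caller, not from this file): a
 * transaction that only frees space would typically be marked net-free
 * right after its holds are set up, e.g. when freeing an entire object:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 *	dmu_tx_mark_netfree(tx);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	...
 *
 * With tx_netfree set, dmu_tx_try_assign() passes B_TRUE as the netfree
 * argument to dsl_dir_tempreserve_space(), which is what disables the
 * refquota enforcement described above.
 */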
332268464Sdelphij
333321549Smavstatic void
334321549Smavdmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
335168404Spjd{
336321549Smav	dmu_tx_t *tx;
337321549Smav	dnode_t *dn;
338253821Sdelphij	int err;
340168404Spjd
341321549Smav	tx = txh->txh_tx;
342168404Spjd	ASSERT(tx->tx_txg == 0);
343168404Spjd
344321549Smav	dn = txh->txh_dnode;
345258632Savg	dmu_tx_count_dnode(txh);
346168404Spjd
347321547Smav	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
348168404Spjd		return;
349168404Spjd	if (len == DMU_OBJECT_END)
350321547Smav		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
351168404Spjd
353168404Spjd	/*
354253821Sdelphij	 * For i/o error checking, we read the first and last level-0
355253821Sdelphij	 * blocks if they are not aligned, and all the level-1 blocks.
356253821Sdelphij	 *
357253821Sdelphij	 * Note:  dbuf_free_range() assumes that we have not instantiated
358253821Sdelphij	 * any level-0 dbufs that will be completely freed.  Therefore we must
359253821Sdelphij	 * exercise care to not read or count the first and last blocks
360253821Sdelphij	 * if they are blocksize-aligned.
361168404Spjd	 */
362253821Sdelphij	if (dn->dn_datablkshift == 0) {
363254753Sdelphij		if (off != 0 || len < dn->dn_datablksz)
364256259Savg			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
365253821Sdelphij	} else {
366253821Sdelphij		/* first block will be modified if it is not aligned */
367253821Sdelphij		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
368253821Sdelphij			dmu_tx_count_write(txh, off, 1);
369253821Sdelphij		/* last block will be modified if it is not aligned */
370253821Sdelphij		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
371321547Smav			dmu_tx_count_write(txh, off + len, 1);
372253821Sdelphij	}
373253821Sdelphij
374253821Sdelphij	/*
375253821Sdelphij	 * Check level-1 blocks.
376253821Sdelphij	 */
377168404Spjd	if (dn->dn_nlevels > 1) {
378253821Sdelphij		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
379168404Spjd		    SPA_BLKPTRSHIFT;
380253821Sdelphij		uint64_t start = off >> shift;
381253821Sdelphij		uint64_t end = (off + len) >> shift;
382168404Spjd
383253821Sdelphij		ASSERT(dn->dn_indblkshift != 0);
384253821Sdelphij
385259576Spjd		/*
386259576Spjd		 * dnode_reallocate() can result in an object with indirect
387259576Spjd		 * blocks having an odd data block size.  In this case,
388259576Spjd		 * just check the single block.
389259576Spjd		 */
390259576Spjd		if (dn->dn_datablkshift == 0)
391259576Spjd			start = end = 0;
392259576Spjd
393321547Smav		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
394168404Spjd		    NULL, NULL, ZIO_FLAG_CANFAIL);
395253821Sdelphij		for (uint64_t i = start; i <= end; i++) {
396168404Spjd			uint64_t ibyte = i << shift;
397185029Spjd			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
398168404Spjd			i = ibyte >> shift;
399284593Savg			if (err == ESRCH || i > end)
400168404Spjd				break;
401321547Smav			if (err != 0) {
402168404Spjd				tx->tx_err = err;
403321547Smav				(void) zio_wait(zio);
404168404Spjd				return;
405168404Spjd			}
406168404Spjd
407321547Smav			(void) refcount_add_many(&txh->txh_memory_tohold,
408321547Smav			    1 << dn->dn_indblkshift, FTAG);
409321547Smav
410168404Spjd			err = dmu_tx_check_ioerr(zio, dn, 1, i);
411321547Smav			if (err != 0) {
412168404Spjd				tx->tx_err = err;
413321547Smav				(void) zio_wait(zio);
414168404Spjd				return;
415168404Spjd			}
416168404Spjd		}
417168404Spjd		err = zio_wait(zio);
418321547Smav		if (err != 0) {
419168404Spjd			tx->tx_err = err;
420168404Spjd			return;
421168404Spjd		}
422168404Spjd	}
423168404Spjd}
424168404Spjd
425168404Spjdvoid
426321549Smavdmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
427168404Spjd{
428321549Smav	dmu_tx_hold_t *txh;
429321549Smav
430321549Smav	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
431321549Smav	    object, THT_FREE, off, len);
432321549Smav	if (txh != NULL)
433321549Smav		(void) dmu_tx_hold_free_impl(txh, off, len);
434321549Smav}
435321549Smav
436321549Smavvoid
437321549Smavdmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
438321549Smav{
439321549Smav	dmu_tx_hold_t *txh;
440321549Smav
441321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
442321549Smav	if (txh != NULL)
443321549Smav		(void) dmu_tx_hold_free_impl(txh, off, len);
444321549Smav}
445321549Smav
446321549Smavstatic void
447321549Smavdmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, int add, const char *name)
448321549Smav{
449321549Smav	dmu_tx_t *tx = txh->txh_tx;
450321549Smav	dnode_t *dn;
451307049Smav	int err;
452168404Spjd
453168404Spjd	ASSERT(tx->tx_txg == 0);
454168404Spjd
455321549Smav	dn = txh->txh_dnode;
456168404Spjd
457168404Spjd	dmu_tx_count_dnode(txh);
458168404Spjd
459321547Smav	/*
460321547Smav	 * Modifying an almost-full microzap is around the worst case (128KB).
461321547Smav	 *
462321547Smav	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
463321547Smav	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
464321547Smav	 * - 4 new blocks written if adding:
465321547Smav	 *    - 2 blocks for possibly split leaves,
466321547Smav	 *    - 2 grown ptrtbl blocks
467321547Smav	 */
468321547Smav	(void) refcount_add_many(&txh->txh_space_towrite,
469321547Smav	    MZAP_MAX_BLKSZ, FTAG);
470321547Smav
471321547Smav	if (dn == NULL)
472168404Spjd		return;
473168404Spjd
474236884Smm	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
475168404Spjd
476321547Smav	if (dn->dn_maxblkid == 0 || name == NULL) {
477168404Spjd		/*
478321547Smav		 * This is a microzap (only one block), or we don't know
479321547Smav		 * the name.  Check the first block for i/o errors.
480168404Spjd		 */
481168404Spjd		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
482321547Smav		if (err != 0) {
483168404Spjd			tx->tx_err = err;
484168404Spjd		}
485321547Smav	} else {
486168404Spjd		/*
487321547Smav		 * Access the name so that we'll check for i/o errors to
488321547Smav		 * the leaf blocks, etc.  We ignore ENOENT, as this name
489321547Smav		 * may not yet exist.
490168404Spjd		 */
491307290Smav		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
492321547Smav		if (err == EIO || err == ECKSUM || err == ENXIO) {
493168404Spjd			tx->tx_err = err;
494168404Spjd		}
495168404Spjd	}
496168404Spjd}
497168404Spjd
498168404Spjdvoid
499321549Smavdmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
500321549Smav{
501321549Smav	dmu_tx_hold_t *txh;
502321549Smav
503321549Smav	ASSERT0(tx->tx_txg);
504321549Smav
505321549Smav	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
506321549Smav	    object, THT_ZAP, add, (uintptr_t)name);
507321549Smav	if (txh != NULL)
508321549Smav		dmu_tx_hold_zap_impl(txh, add, name);
509321549Smav}
510321549Smav
511321549Smavvoid
512321549Smavdmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
513321549Smav{
514321549Smav	dmu_tx_hold_t *txh;
515321549Smav
516321549Smav	ASSERT0(tx->tx_txg);
517321549Smav	ASSERT(dn != NULL);
518321549Smav
519321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
520321549Smav	if (txh != NULL)
521321549Smav		dmu_tx_hold_zap_impl(txh, add, name);
522321549Smav}
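
/*
 * Illustrative sketch (hypothetical objects and name, not from this file):
 * a caller adding a directory entry would typically hold the directory's
 * ZAP by name together with the object being created, e.g.:
 *
 *	dmu_tx_hold_zap(tx, dir_zap_object, B_TRUE, entry_name);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *
 * Passing the name lets dmu_tx_hold_zap_impl() above probe the relevant
 * leaf blocks for i/o errors; passing NULL only checks the first block.
 */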
523321549Smav
524321549Smavvoid
525168404Spjddmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
526168404Spjd{
527168404Spjd	dmu_tx_hold_t *txh;
528168404Spjd
529168404Spjd	ASSERT(tx->tx_txg == 0);
530168404Spjd
531168404Spjd	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
532168404Spjd	    object, THT_BONUS, 0, 0);
533168404Spjd	if (txh)
534168404Spjd		dmu_tx_count_dnode(txh);
535168404Spjd}
536168404Spjd
537168404Spjdvoid
538321549Smavdmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
539321549Smav{
540321549Smav	dmu_tx_hold_t *txh;
541321549Smav
542321549Smav	ASSERT0(tx->tx_txg);
543321549Smav
544321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
545321549Smav	if (txh)
546321549Smav		dmu_tx_count_dnode(txh);
547321549Smav}
548321549Smav
549321549Smavvoid
550168404Spjddmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
551168404Spjd{
552168404Spjd	dmu_tx_hold_t *txh;
553168404Spjd	ASSERT(tx->tx_txg == 0);
554168404Spjd
555168404Spjd	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
556168404Spjd	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
557168404Spjd
558307049Smav	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
559168404Spjd}
560168404Spjd
561168404Spjd#ifdef ZFS_DEBUG
562168404Spjdvoid
563168404Spjddmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
564168404Spjd{
565321547Smav	boolean_t match_object = B_FALSE;
566321547Smav	boolean_t match_offset = B_FALSE;
567168404Spjd
568219089Spjd	DB_DNODE_ENTER(db);
569321547Smav	dnode_t *dn = DB_DNODE(db);
570168404Spjd	ASSERT(tx->tx_txg != 0);
571219089Spjd	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
572168404Spjd	ASSERT3U(dn->dn_object, ==, db->db.db_object);
573168404Spjd
574219089Spjd	if (tx->tx_anyobj) {
575219089Spjd		DB_DNODE_EXIT(db);
576168404Spjd		return;
577219089Spjd	}
578168404Spjd
579168404Spjd	/* XXX No checking on the meta dnode for now */
580219089Spjd	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
581219089Spjd		DB_DNODE_EXIT(db);
582168404Spjd		return;
583219089Spjd	}
584168404Spjd
585321547Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
586168404Spjd	    txh = list_next(&tx->tx_holds, txh)) {
587168404Spjd		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
588168404Spjd		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
589168404Spjd			match_object = TRUE;
590168404Spjd		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
591168404Spjd			int datablkshift = dn->dn_datablkshift ?
592168404Spjd			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
593168404Spjd			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
594168404Spjd			int shift = datablkshift + epbs * db->db_level;
595168404Spjd			uint64_t beginblk = shift >= 64 ? 0 :
596168404Spjd			    (txh->txh_arg1 >> shift);
597168404Spjd			uint64_t endblk = shift >= 64 ? 0 :
598168404Spjd			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
599168404Spjd			uint64_t blkid = db->db_blkid;
600168404Spjd
601168404Spjd			/* XXX txh_arg2 better not be zero... */
602168404Spjd
603168404Spjd			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
604168404Spjd			    txh->txh_type, beginblk, endblk);
605168404Spjd
606168404Spjd			switch (txh->txh_type) {
607168404Spjd			case THT_WRITE:
608168404Spjd				if (blkid >= beginblk && blkid <= endblk)
609168404Spjd					match_offset = TRUE;
610168404Spjd				/*
611168404Spjd				 * We will let this hold work for the bonus
612219089Spjd				 * or spill buffer so that we don't need to
613219089Spjd				 * hold it when creating a new object.
614168404Spjd				 */
615219089Spjd				if (blkid == DMU_BONUS_BLKID ||
616219089Spjd				    blkid == DMU_SPILL_BLKID)
617168404Spjd					match_offset = TRUE;
618168404Spjd				/*
619168404Spjd				 * They might have to increase nlevels,
620168404Spjd				 * thus dirtying the new TLIBs.  Or they
621168404Spjd				 * might have to change the block size,
622168404Spjd				 * thus dirtying the new lvl=0 blk=0.
623168404Spjd				 */
624168404Spjd				if (blkid == 0)
625168404Spjd					match_offset = TRUE;
626168404Spjd				break;
627168404Spjd			case THT_FREE:
628185029Spjd				/*
629185029Spjd				 * We will dirty all the level 1 blocks in
630185029Spjd				 * the free range and perhaps the first and
631185029Spjd				 * last level 0 block.
632185029Spjd				 */
633185029Spjd				if (blkid >= beginblk && (blkid <= endblk ||
634185029Spjd				    txh->txh_arg2 == DMU_OBJECT_END))
635168404Spjd					match_offset = TRUE;
636168404Spjd				break;
637219089Spjd			case THT_SPILL:
638219089Spjd				if (blkid == DMU_SPILL_BLKID)
639219089Spjd					match_offset = TRUE;
640219089Spjd				break;
641168404Spjd			case THT_BONUS:
642219089Spjd				if (blkid == DMU_BONUS_BLKID)
643168404Spjd					match_offset = TRUE;
644168404Spjd				break;
645168404Spjd			case THT_ZAP:
646168404Spjd				match_offset = TRUE;
647168404Spjd				break;
648168404Spjd			case THT_NEWOBJECT:
649168404Spjd				match_object = TRUE;
650168404Spjd				break;
651168404Spjd			default:
652168404Spjd				ASSERT(!"bad txh_type");
653168404Spjd			}
654168404Spjd		}
655219089Spjd		if (match_object && match_offset) {
656219089Spjd			DB_DNODE_EXIT(db);
657168404Spjd			return;
658219089Spjd		}
659168404Spjd	}
660219089Spjd	DB_DNODE_EXIT(db);
661168404Spjd	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
662168404Spjd	    (u_longlong_t)db->db.db_object, db->db_level,
663168404Spjd	    (u_longlong_t)db->db_blkid);
664168404Spjd}
665168404Spjd#endif
666168404Spjd
667258632Savg/*
668258632Savg * If we can't do 10 iops, something is wrong.  Let us go ahead
669258632Savg * and hit zfs_dirty_data_max.
670258632Savg */
671258632Savghrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
672258632Savgint zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
673258632Savg
674258632Savg/*
675258632Savg * We delay transactions when we've determined that the backend storage
676258632Savg * isn't able to accommodate the rate of incoming writes.
677258632Savg *
678258632Savg * If there is already a transaction waiting, we delay relative to when
679258632Savg * that transaction finishes waiting.  This way the calculated min_time
680258632Savg * is independent of the number of threads concurrently executing
681258632Savg * transactions.
682258632Savg *
683258632Savg * If we are the only waiter, wait relative to when the transaction
684258632Savg * started, rather than the current time.  This credits the transaction for
685258632Savg * "time already served", e.g. reading indirect blocks.
686258632Savg *
687258632Savg * The minimum time for a transaction to take is calculated as:
688258632Savg *     min_time = scale * (dirty - min) / (max - dirty)
689258632Savg *     min_time is then capped at zfs_delay_max_ns.
690258632Savg *
691258632Savg * The delay has two degrees of freedom that can be adjusted via tunables.
692258632Savg * The percentage of dirty data at which we start to delay is defined by
693258632Savg * zfs_delay_min_dirty_percent. This should typically be at or above
694258632Savg * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
695258632Savg * delay after writing at full speed has failed to keep up with the incoming
696258632Savg * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
697258632Savg * speaking, this variable determines the amount of delay at the midpoint of
698258632Savg * the curve.
699258632Savg *
700258632Savg * delay
701258632Savg *  10ms +-------------------------------------------------------------*+
702258632Savg *       |                                                             *|
703258632Savg *   9ms +                                                             *+
704258632Savg *       |                                                             *|
705258632Savg *   8ms +                                                             *+
706258632Savg *       |                                                            * |
707258632Savg *   7ms +                                                            * +
708258632Savg *       |                                                            * |
709258632Savg *   6ms +                                                            * +
710258632Savg *       |                                                            * |
711258632Savg *   5ms +                                                           *  +
712258632Savg *       |                                                           *  |
713258632Savg *   4ms +                                                           *  +
714258632Savg *       |                                                           *  |
715258632Savg *   3ms +                                                          *   +
716258632Savg *       |                                                          *   |
717258632Savg *   2ms +                                              (midpoint) *    +
718258632Savg *       |                                                  |    **     |
719258632Savg *   1ms +                                                  v ***       +
720258632Savg *       |             zfs_delay_scale ---------->     ********         |
721258632Savg *     0 +-------------------------------------*********----------------+
722258632Savg *       0%                    <- zfs_dirty_data_max ->               100%
723258632Savg *
724258632Savg * Note that since the delay is added to the outstanding time remaining on the
725258632Savg * most recent transaction, the delay is effectively the inverse of IOPS.
726258632Savg * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
727258632Savg * was chosen such that small changes in the amount of accumulated dirty data
728258632Savg * in the first 3/4 of the curve yield relatively small differences in the
729258632Savg * amount of delay.
730258632Savg *
731258632Savg * The effects can be easier to understand when the amount of delay is
732258632Savg * represented on a log scale:
733258632Savg *
734258632Savg * delay
735258632Savg * 100ms +-------------------------------------------------------------++
736258632Savg *       +                                                              +
737258632Savg *       |                                                              |
738258632Savg *       +                                                             *+
739258632Savg *  10ms +                                                             *+
740258632Savg *       +                                                           ** +
741258632Savg *       |                                              (midpoint)  **  |
742258632Savg *       +                                                  |     **    +
743258632Savg *   1ms +                                                  v ****      +
744258632Savg *       +             zfs_delay_scale ---------->        *****         +
745258632Savg *       |                                             ****             |
746258632Savg *       +                                          ****                +
747258632Savg * 100us +                                        **                    +
748258632Savg *       +                                       *                      +
749258632Savg *       |                                      *                       |
750258632Savg *       +                                     *                        +
751258632Savg *  10us +                                     *                        +
752258632Savg *       +                                                              +
753258632Savg *       |                                                              |
754258632Savg *       +                                                              +
755258632Savg *       +--------------------------------------------------------------+
756258632Savg *       0%                    <- zfs_dirty_data_max ->               100%
757258632Savg *
758258632Savg * Note here that only as the amount of dirty data approaches its limit does
759258632Savg * the delay start to increase rapidly. The goal of a properly tuned system
760258632Savg * should be to keep the amount of dirty data out of that range by first
761258632Savg * ensuring that the appropriate limits are set for the I/O scheduler to reach
762258632Savg * optimal throughput on the backend storage, and then by changing the value
763258632Savg * of zfs_delay_scale to increase the steepness of the curve.
764258632Savg */
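/*
 * Worked example of the min_time formula above (all tunable values here are
 * assumed only for illustration): with zfs_dirty_data_max = 4GB and
 * zfs_delay_min_dirty_percent = 60, delay_min_bytes is 2.4GB.  If a
 * transaction arrives when 3.2GB is dirty, then with zfs_delay_scale at its
 * default of 500,000:
 *
 *	min_tx_time = 500000 * (3.2G - 2.4G) / (4G - 3.2G) = 500000ns = 500us
 *
 * which is the midpoint of the curve, i.e. roughly 2000 write transactions
 * per second as noted above.
 */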
765258632Savgstatic void
766258632Savgdmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
767258632Savg{
768258632Savg	dsl_pool_t *dp = tx->tx_pool;
769258632Savg	uint64_t delay_min_bytes =
770258632Savg	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
771258632Savg	hrtime_t wakeup, min_tx_time, now;
772258632Savg
773258632Savg	if (dirty <= delay_min_bytes)
774258632Savg		return;
775258632Savg
776258632Savg	/*
777258632Savg	 * The caller has already waited until we are under the max.
778258632Savg	 * We make them pass us the amount of dirty data so we don't
779258632Savg	 * have to handle the case of it being >= the max, which could
780258632Savg	 * cause a divide-by-zero if it's == the max.
781258632Savg	 */
782258632Savg	ASSERT3U(dirty, <, zfs_dirty_data_max);
783258632Savg
784258632Savg	now = gethrtime();
785258632Savg	min_tx_time = zfs_delay_scale *
786258632Savg	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
787258632Savg	if (now > tx->tx_start + min_tx_time)
788258632Savg		return;
789258632Savg
790258632Savg	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
791258632Savg
792258632Savg	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
793258632Savg	    uint64_t, min_tx_time);
794258632Savg
795258632Savg	mutex_enter(&dp->dp_lock);
796258632Savg	wakeup = MAX(tx->tx_start + min_tx_time,
797258632Savg	    dp->dp_last_wakeup + min_tx_time);
798258632Savg	dp->dp_last_wakeup = wakeup;
799258632Savg	mutex_exit(&dp->dp_lock);
800258632Savg
801258632Savg#ifdef _KERNEL
802258632Savg#ifdef illumos
803258632Savg	mutex_enter(&curthread->t_delay_lock);
804258632Savg	while (cv_timedwait_hires(&curthread->t_delay_cv,
805258632Savg	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
806258632Savg	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
807258632Savg		continue;
808258632Savg	mutex_exit(&curthread->t_delay_lock);
809258632Savg#else
810258632Savg	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
811258632Savg	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
812258632Savg#endif
813258632Savg#else
814258632Savg	hrtime_t delta = wakeup - gethrtime();
815258632Savg	struct timespec ts;
816258632Savg	ts.tv_sec = delta / NANOSEC;
817258632Savg	ts.tv_nsec = delta % NANOSEC;
818258632Savg	(void) nanosleep(&ts, NULL);
819258632Savg#endif
820258632Savg}
821258632Savg
822321547Smav/*
823321547Smav * This routine attempts to assign the transaction to a transaction group.
824321547Smav * To do so, we must determine if there is sufficient free space on disk.
825321547Smav *
826321547Smav * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
827321547Smav * on it), then it is assumed that there is sufficient free space,
828321547Smav * unless there's insufficient slop space in the pool (see the comment
829321547Smav * above spa_slop_shift in spa_misc.c).
830321547Smav *
831321547Smav * If it is not a "netfree" transaction, then if the data already on disk
832321547Smav * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
833321547Smav * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
834321547Smav * plus the rough estimate of this transaction's changes, may exceed the
835321547Smav * allowed usage, then this will fail with ERESTART, which will cause the
836321547Smav * caller to wait for the pending changes to be written to disk (by waiting
837321547Smav * for the next TXG to open), and then check the space usage again.
838321547Smav *
839321547Smav * The rough estimate of pending changes is comprised of the sum of:
840321547Smav *
841321547Smav *  - this transaction's holds' txh_space_towrite
842321547Smav *
843321547Smav *  - dd_tempreserved[], which is the sum of in-flight transactions'
844321547Smav *    holds' txh_space_towrite (i.e. those transactions that have called
845321547Smav *    dmu_tx_assign() but not yet called dmu_tx_commit()).
846321547Smav *
847321547Smav *  - dd_space_towrite[], which is the amount of dirtied dbufs.
848321547Smav *
849321547Smav * Note that all of these values are inflated by spa_get_worst_case_asize(),
850321547Smav * which means that we may get ERESTART well before we are actually in danger
851321547Smav * of running out of space, but this also mitigates any small inaccuracies
852321547Smav * in the rough estimate (e.g. txh_space_towrite doesn't take into account
853321547Smav * indirect blocks, and dd_space_towrite[] doesn't take into account changes
854321547Smav * to the MOS).
855321547Smav *
856321547Smav * Note that due to this algorithm, it is possible to exceed the allowed
857321547Smav * usage by one transaction.  Also, as we approach the allowed usage,
858321547Smav * we will allow a very limited amount of changes into each TXG, thus
859321547Smav * decreasing performance.
860321547Smav */
861168404Spjdstatic int
862248571Smmdmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
863168404Spjd{
864185029Spjd	spa_t *spa = tx->tx_pool->dp_spa;
865168404Spjd
866240415Smm	ASSERT0(tx->tx_txg);
867185029Spjd
868168404Spjd	if (tx->tx_err)
869168404Spjd		return (tx->tx_err);
870168404Spjd
871185029Spjd	if (spa_suspended(spa)) {
872185029Spjd		/*
873185029Spjd		 * If the user has indicated a blocking failure mode
874185029Spjd		 * then return ERESTART which will block in dmu_tx_wait().
875185029Spjd		 * Otherwise, return EIO so that an error can get
876185029Spjd		 * propagated back to the VOP calls.
877185029Spjd		 *
878185029Spjd		 * Note that we always honor the txg_how flag regardless
879185029Spjd		 * of the failuremode setting.
880185029Spjd		 */
881185029Spjd		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
882185029Spjd		    txg_how != TXG_WAIT)
883249195Smm			return (SET_ERROR(EIO));
884185029Spjd
885249195Smm		return (SET_ERROR(ERESTART));
886185029Spjd	}
887185029Spjd
888258632Savg	if (!tx->tx_waited &&
889258632Savg	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
890258632Savg		tx->tx_wait_dirty = B_TRUE;
891258632Savg		return (SET_ERROR(ERESTART));
892258632Savg	}
893258632Savg
894168404Spjd	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
895168404Spjd	tx->tx_needassign_txh = NULL;
896168404Spjd
897168404Spjd	/*
898168404Spjd	 * NB: No error returns are allowed after txg_hold_open, but
899168404Spjd	 * before processing the dnode holds, due to the
900168404Spjd	 * dmu_tx_unassign() logic.
901168404Spjd	 */
902168404Spjd
903321547Smav	uint64_t towrite = 0;
904321547Smav	uint64_t tohold = 0;
905321547Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
906168404Spjd	    txh = list_next(&tx->tx_holds, txh)) {
907168404Spjd		dnode_t *dn = txh->txh_dnode;
908168404Spjd		if (dn != NULL) {
909168404Spjd			mutex_enter(&dn->dn_mtx);
910168404Spjd			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
911168404Spjd				mutex_exit(&dn->dn_mtx);
912168404Spjd				tx->tx_needassign_txh = txh;
913249195Smm				return (SET_ERROR(ERESTART));
914168404Spjd			}
915168404Spjd			if (dn->dn_assigned_txg == 0)
916168404Spjd				dn->dn_assigned_txg = tx->tx_txg;
917168404Spjd			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
918168404Spjd			(void) refcount_add(&dn->dn_tx_holds, tx);
919168404Spjd			mutex_exit(&dn->dn_mtx);
920168404Spjd		}
921307049Smav		towrite += refcount_count(&txh->txh_space_towrite);
922307049Smav		tohold += refcount_count(&txh->txh_memory_tohold);
923168404Spjd	}
924168404Spjd
925185029Spjd	/* needed allocation: worst-case estimate of write space */
926321547Smav	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
927185029Spjd	/* calculate memory footprint estimate */
928321547Smav	uint64_t memory = towrite + tohold;
929168404Spjd
930321547Smav	if (tx->tx_dir != NULL && asize != 0) {
931185029Spjd		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
932321547Smav		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
933321547Smav		if (err != 0)
934168404Spjd			return (err);
935168404Spjd	}
936168404Spjd
937168404Spjd	return (0);
938168404Spjd}
939168404Spjd
940168404Spjdstatic void
941168404Spjddmu_tx_unassign(dmu_tx_t *tx)
942168404Spjd{
943168404Spjd	if (tx->tx_txg == 0)
944168404Spjd		return;
945168404Spjd
946168404Spjd	txg_rele_to_quiesce(&tx->tx_txgh);
947168404Spjd
948251629Sdelphij	/*
949251629Sdelphij	 * Walk the transaction's hold list, removing the hold on the
950251629Sdelphij	 * associated dnode, and notifying waiters if the refcount drops to 0.
951251629Sdelphij	 */
952321547Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
953321547Smav	    txh != tx->tx_needassign_txh;
954168404Spjd	    txh = list_next(&tx->tx_holds, txh)) {
955168404Spjd		dnode_t *dn = txh->txh_dnode;
956168404Spjd
957168404Spjd		if (dn == NULL)
958168404Spjd			continue;
959168404Spjd		mutex_enter(&dn->dn_mtx);
960168404Spjd		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
961168404Spjd
962168404Spjd		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
963168404Spjd			dn->dn_assigned_txg = 0;
964168404Spjd			cv_broadcast(&dn->dn_notxholds);
965168404Spjd		}
966168404Spjd		mutex_exit(&dn->dn_mtx);
967168404Spjd	}
968168404Spjd
969168404Spjd	txg_rele_to_sync(&tx->tx_txgh);
970168404Spjd
971168404Spjd	tx->tx_lasttried_txg = tx->tx_txg;
972168404Spjd	tx->tx_txg = 0;
973168404Spjd}
974168404Spjd
975168404Spjd/*
976168404Spjd * Assign tx to a transaction group.  txg_how can be one of:
977168404Spjd *
978168404Spjd * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
979168404Spjd *	a new one.  This should be used when you're not holding locks.
980248571Smm *	It will only fail if we're truly out of space (or over quota).
981168404Spjd *
982168404Spjd * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
983168404Spjd *	blocking, returns immediately with ERESTART.  This should be used
984168404Spjd *	whenever you're holding locks.  On an ERESTART error, the caller
985168404Spjd *	should drop locks, do a dmu_tx_wait(tx), and try again.
986258632Savg *
987258632Savg * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
988258632Savg *      has already been called on behalf of this operation (though
989258632Savg *      most likely on a different tx).
990168404Spjd */
991168404Spjdint
992248571Smmdmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
993168404Spjd{
994168404Spjd	int err;
995168404Spjd
996168404Spjd	ASSERT(tx->tx_txg == 0);
997258632Savg	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
998258632Savg	    txg_how == TXG_WAITED);
999168404Spjd	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1000168404Spjd
1001248571Smm	/* If we might wait, we must not hold the config lock. */
1002248571Smm	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1003248571Smm
1004258632Savg	if (txg_how == TXG_WAITED)
1005258632Savg		tx->tx_waited = B_TRUE;
1006258632Savg
1007168404Spjd	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1008168404Spjd		dmu_tx_unassign(tx);
1009168404Spjd
1010168404Spjd		if (err != ERESTART || txg_how != TXG_WAIT)
1011168404Spjd			return (err);
1012168404Spjd
1013168404Spjd		dmu_tx_wait(tx);
1014168404Spjd	}
1015168404Spjd
1016168404Spjd	txg_rele_to_quiesce(&tx->tx_txgh);
1017168404Spjd
1018168404Spjd	return (0);
1019168404Spjd}
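
/*
 * Illustrative sketch (hypothetical caller, not from this file): the
 * retry loop implied by cases (2) and (3) above, for callers that hold
 * locks which must be dropped before waiting:
 *
 *	boolean_t waited = B_FALSE;
 * top:
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_...(tx, ...);
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error == ERESTART) {
 *		(drop locks)
 *		dmu_tx_wait(tx);
 *		dmu_tx_abort(tx);
 *		waited = B_TRUE;
 *		(reacquire locks)
 *		goto top;
 *	} else if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	(make changes)
 *	dmu_tx_commit(tx);
 */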
1020168404Spjd
1021168404Spjdvoid
1022168404Spjddmu_tx_wait(dmu_tx_t *tx)
1023168404Spjd{
1024185029Spjd	spa_t *spa = tx->tx_pool->dp_spa;
1025258632Savg	dsl_pool_t *dp = tx->tx_pool;
1026185029Spjd
1027168404Spjd	ASSERT(tx->tx_txg == 0);
1028248571Smm	ASSERT(!dsl_pool_config_held(tx->tx_pool));
1029168404Spjd
1030258632Savg	if (tx->tx_wait_dirty) {
1031258632Savg		/*
1032258632Savg		 * dmu_tx_try_assign() has determined that we need to wait
1033258632Savg		 * because we've consumed much or all of the dirty buffer
1034258632Savg		 * space.
1035258632Savg		 */
1036258632Savg		mutex_enter(&dp->dp_lock);
1037258632Savg		while (dp->dp_dirty_total >= zfs_dirty_data_max)
1038258632Savg			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1039258632Savg		uint64_t dirty = dp->dp_dirty_total;
1040258632Savg		mutex_exit(&dp->dp_lock);
1041258632Savg
1042258632Savg		dmu_tx_delay(tx, dirty);
1043258632Savg
1044258632Savg		tx->tx_wait_dirty = B_FALSE;
1045258632Savg
1046258632Savg		/*
1047258632Savg		 * Note: setting tx_waited only has effect if the caller
1048258632Savg		 * used TXG_WAIT.  Otherwise they are going to destroy
1049258632Savg		 * this tx and try again.  The common case, zfs_write(),
1050258632Savg		 * uses TXG_WAIT.
1051258632Savg		 */
1052258632Savg		tx->tx_waited = B_TRUE;
1053258632Savg	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1054258632Savg		/*
1055258632Savg		 * If the pool is suspended we need to wait until it
1056258632Savg		 * is resumed.  Note that it's possible that the pool
1057258632Savg		 * has become active after this thread has tried to
1058258632Savg		 * obtain a tx.  If that's the case then tx_lasttried_txg
1059258632Savg		 * would not have been set.
1060258632Savg		 */
1061258632Savg		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1062185029Spjd	} else if (tx->tx_needassign_txh) {
1063258632Savg		/*
1064258632Savg		 * A dnode is assigned to the quiescing txg.  Wait for its
1065258632Savg		 * transaction to complete.
1066258632Savg		 */
1067168404Spjd		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1068168404Spjd
1069168404Spjd		mutex_enter(&dn->dn_mtx);
1070168404Spjd		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1071168404Spjd			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1072168404Spjd		mutex_exit(&dn->dn_mtx);
1073168404Spjd		tx->tx_needassign_txh = NULL;
1074168404Spjd	} else {
1075168404Spjd		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1076168404Spjd	}
1077168404Spjd}
1078168404Spjd
1079307049Smavstatic void
1080307049Smavdmu_tx_destroy(dmu_tx_t *tx)
1081307049Smav{
1082307049Smav	dmu_tx_hold_t *txh;
1083307049Smav
1084307049Smav	while ((txh = list_head(&tx->tx_holds)) != NULL) {
1085307049Smav		dnode_t *dn = txh->txh_dnode;
1086307049Smav
1087307049Smav		list_remove(&tx->tx_holds, txh);
1088307049Smav		refcount_destroy_many(&txh->txh_space_towrite,
1089307049Smav		    refcount_count(&txh->txh_space_towrite));
1090307049Smav		refcount_destroy_many(&txh->txh_memory_tohold,
1091307049Smav		    refcount_count(&txh->txh_memory_tohold));
1092307049Smav		kmem_free(txh, sizeof (dmu_tx_hold_t));
1093307049Smav		if (dn != NULL)
1094307049Smav			dnode_rele(dn, tx);
1095307049Smav	}
1096307049Smav
1097307049Smav	list_destroy(&tx->tx_callbacks);
1098307049Smav	list_destroy(&tx->tx_holds);
1099307049Smav	kmem_free(tx, sizeof (dmu_tx_t));
1100307049Smav}
1101307049Smav
1102168404Spjdvoid
1103168404Spjddmu_tx_commit(dmu_tx_t *tx)
1104168404Spjd{
1105168404Spjd	ASSERT(tx->tx_txg != 0);
1106168404Spjd
1107251629Sdelphij	/*
1108251629Sdelphij	 * Go through the transaction's hold list and remove holds on
1109251629Sdelphij	 * associated dnodes, notifying waiters if no holds remain.
1110251629Sdelphij	 */
1111307049Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1112307049Smav	    txh = list_next(&tx->tx_holds, txh)) {
1113168404Spjd		dnode_t *dn = txh->txh_dnode;
1114168404Spjd
1115168404Spjd		if (dn == NULL)
1116168404Spjd			continue;
1117307049Smav
1118168404Spjd		mutex_enter(&dn->dn_mtx);
1119168404Spjd		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1120168404Spjd
1121168404Spjd		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1122168404Spjd			dn->dn_assigned_txg = 0;
1123168404Spjd			cv_broadcast(&dn->dn_notxholds);
1124168404Spjd		}
1125168404Spjd		mutex_exit(&dn->dn_mtx);
1126168404Spjd	}
1127168404Spjd
1128168404Spjd	if (tx->tx_tempreserve_cookie)
1129168404Spjd		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1130168404Spjd
1131219089Spjd	if (!list_is_empty(&tx->tx_callbacks))
1132219089Spjd		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1133219089Spjd
1134168404Spjd	if (tx->tx_anyobj == FALSE)
1135168404Spjd		txg_rele_to_sync(&tx->tx_txgh);
1136219089Spjd
1137307049Smav	dmu_tx_destroy(tx);
1138168404Spjd}
1139168404Spjd
1140168404Spjdvoid
1141168404Spjddmu_tx_abort(dmu_tx_t *tx)
1142168404Spjd{
1143168404Spjd	ASSERT(tx->tx_txg == 0);
1144168404Spjd
1145219089Spjd	/*
1146219089Spjd	 * Call any registered callbacks with an error code.
1147219089Spjd	 */
1148219089Spjd	if (!list_is_empty(&tx->tx_callbacks))
1149219089Spjd		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1150219089Spjd
1151307049Smav	dmu_tx_destroy(tx);
1152168404Spjd}
1153168404Spjd
1154168404Spjduint64_t
1155168404Spjddmu_tx_get_txg(dmu_tx_t *tx)
1156168404Spjd{
1157168404Spjd	ASSERT(tx->tx_txg != 0);
1158168404Spjd	return (tx->tx_txg);
1159168404Spjd}
1160219089Spjd
1161248571Smmdsl_pool_t *
1162248571Smmdmu_tx_pool(dmu_tx_t *tx)
1163248571Smm{
1164248571Smm	ASSERT(tx->tx_pool != NULL);
1165248571Smm	return (tx->tx_pool);
1166248571Smm}
1167248571Smm
1168219089Spjdvoid
1169219089Spjddmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1170219089Spjd{
1171219089Spjd	dmu_tx_callback_t *dcb;
1172219089Spjd
1173219089Spjd	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1174219089Spjd
1175219089Spjd	dcb->dcb_func = func;
1176219089Spjd	dcb->dcb_data = data;
1177219089Spjd
1178219089Spjd	list_insert_tail(&tx->tx_callbacks, dcb);
1179219089Spjd}
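
/*
 * Illustrative sketch (hypothetical callback, not from this file): commit
 * callbacks must match dmu_tx_callback_func_t.  They are invoked with
 * error == 0 once the txg the transaction was assigned to has synced, or
 * with a nonzero error (e.g. ECANCELED from dmu_tx_abort()) if the
 * transaction never reached disk.
 *
 *	static void
 *	example_commit_cb(void *data, int error)
 *	{
 *		(data is the pointer passed to dmu_tx_callback_register())
 *	}
 *
 *	dmu_tx_callback_register(tx, example_commit_cb, my_data);
 */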
1180219089Spjd
1181219089Spjd/*
1182219089Spjd * Call all the commit callbacks on a list, with a given error code.
1183219089Spjd */
1184219089Spjdvoid
1185219089Spjddmu_tx_do_callbacks(list_t *cb_list, int error)
1186219089Spjd{
1187219089Spjd	dmu_tx_callback_t *dcb;
1188219089Spjd
1189307049Smav	while ((dcb = list_head(cb_list)) != NULL) {
1190219089Spjd		list_remove(cb_list, dcb);
1191219089Spjd		dcb->dcb_func(dcb->dcb_data, error);
1192219089Spjd		kmem_free(dcb, sizeof (dmu_tx_callback_t));
1193219089Spjd	}
1194219089Spjd}
1195219089Spjd
1196219089Spjd/*
1197219089Spjd * Interface to hold a bunch of attributes.
1198219089Spjd * Used for creating new files.
1199219089Spjd * attrsize is the total size of all attributes
1200219089Spjd * to be added during object creation.
1201219089Spjd *
1202219089Spjd * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
1203219089Spjd */
1204219089Spjd
1205219089Spjd/*
1206219089Spjd * Hold the necessary attribute name for attribute registration.
1207219089Spjd * This should be a very rare case.  If it does happen, it would
1208219089Spjd * only happen on the first write to the file system.
1209219089Spjd */
1210219089Spjdstatic void
1211219089Spjddmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1212219089Spjd{
1213219089Spjd	if (!sa->sa_need_attr_registration)
1214219089Spjd		return;
1215219089Spjd
1216321547Smav	for (int i = 0; i != sa->sa_num_attrs; i++) {
1217219089Spjd		if (!sa->sa_attr_table[i].sa_registered) {
1218219089Spjd			if (sa->sa_reg_attr_obj)
1219219089Spjd				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1220219089Spjd				    B_TRUE, sa->sa_attr_table[i].sa_name);
1221219089Spjd			else
1222219089Spjd				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1223219089Spjd				    B_TRUE, sa->sa_attr_table[i].sa_name);
1224219089Spjd		}
1225219089Spjd	}
1226219089Spjd}
1227219089Spjd
1228219089Spjdvoid
1229219089Spjddmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1230219089Spjd{
1231321547Smav	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
1232321547Smav	    tx->tx_objset, object, THT_SPILL, 0, 0);
1233219089Spjd
1234321547Smav	(void) refcount_add_many(&txh->txh_space_towrite,
1235321547Smav	    SPA_OLD_MAXBLOCKSIZE, FTAG);
1236219089Spjd}
1237219089Spjd
1238219089Spjdvoid
1239219089Spjddmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1240219089Spjd{
1241219089Spjd	sa_os_t *sa = tx->tx_objset->os_sa;
1242219089Spjd
1243219089Spjd	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1244219089Spjd
1245219089Spjd	if (tx->tx_objset->os_sa->sa_master_obj == 0)
1246219089Spjd		return;
1247219089Spjd
1248321547Smav	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
1249219089Spjd		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1250321547Smav	} else {
1251219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1252219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1253219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1254219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1255219089Spjd	}
1256219089Spjd
1257219089Spjd	dmu_tx_sa_registration_hold(sa, tx);
1258219089Spjd
1259219089Spjd	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1260219089Spjd		return;
1261219089Spjd
1262219089Spjd	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1263219089Spjd	    THT_SPILL, 0, 0);
1264219089Spjd}
1265219089Spjd
1266219089Spjd/*
1267219089Spjd * Hold SA attribute.
1268219089Spjd *
1269219089Spjd * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1270219089Spjd *
1271219089Spjd * may_grow indicates that the attribute data may grow or the SA layout
1272219089Spjd * may change, in which case the layout ZAP and the spill block are also
1273219089Spjd * held (see below).
1274219089Spjd */
1275219089Spjdvoid
1276219089Spjddmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1277219089Spjd{
1278219089Spjd	uint64_t object;
1279219089Spjd	sa_os_t *sa = tx->tx_objset->os_sa;
1280219089Spjd
1281219089Spjd	ASSERT(hdl != NULL);
1282219089Spjd
1283219089Spjd	object = sa_handle_object(hdl);
1284219089Spjd
1285219089Spjd	dmu_tx_hold_bonus(tx, object);
1286219089Spjd
1287219089Spjd	if (tx->tx_objset->os_sa->sa_master_obj == 0)
1288219089Spjd		return;
1289219089Spjd
1290219089Spjd	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1291219089Spjd	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1292219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1293219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1294219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1295219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1296219089Spjd	}
1297219089Spjd
1298219089Spjd	dmu_tx_sa_registration_hold(sa, tx);
1299219089Spjd
1300219089Spjd	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1301219089Spjd		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1302219089Spjd
1303219089Spjd	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1304219089Spjd		ASSERT(tx->tx_txg == 0);
1305219089Spjd		dmu_tx_hold_spill(tx, object);
1306219089Spjd	} else {
1307219089Spjd		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1308219089Spjd		dnode_t *dn;
1309219089Spjd
1310219089Spjd		DB_DNODE_ENTER(db);
1311219089Spjd		dn = DB_DNODE(db);
1312219089Spjd		if (dn->dn_have_spill) {
1313219089Spjd			ASSERT(tx->tx_txg == 0);
1314219089Spjd			dmu_tx_hold_spill(tx, object);
1315219089Spjd		}
1316219089Spjd		DB_DNODE_EXIT(db);
1317219089Spjd	}
1318219089Spjd}