dmu_tx.c revision 321549
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23226512Smm * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24307290Smav * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
25296519Smav * Copyright (c) 2014 Integros [integros.com]
26226512Smm */
27168404Spjd
28168404Spjd#include <sys/dmu.h>
29168404Spjd#include <sys/dmu_impl.h>
30168404Spjd#include <sys/dbuf.h>
31168404Spjd#include <sys/dmu_tx.h>
32168404Spjd#include <sys/dmu_objset.h>
33321547Smav#include <sys/dsl_dataset.h>
34321547Smav#include <sys/dsl_dir.h>
35168404Spjd#include <sys/dsl_pool.h>
36321547Smav#include <sys/zap_impl.h>
37168404Spjd#include <sys/spa.h>
38219089Spjd#include <sys/sa.h>
39219089Spjd#include <sys/sa_impl.h>
40168404Spjd#include <sys/zfs_context.h>
41219089Spjd#include <sys/varargs.h>
42168404Spjd
43168404Spjdtypedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
44168404Spjd    uint64_t arg1, uint64_t arg2);
45168404Spjd
46168404Spjd
47168404Spjddmu_tx_t *
48168404Spjddmu_tx_create_dd(dsl_dir_t *dd)
49168404Spjd{
50168404Spjd	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
51168404Spjd	tx->tx_dir = dd;
52248571Smm	if (dd != NULL)
53168404Spjd		tx->tx_pool = dd->dd_pool;
54168404Spjd	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
55168404Spjd	    offsetof(dmu_tx_hold_t, txh_node));
56219089Spjd	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
57219089Spjd	    offsetof(dmu_tx_callback_t, dcb_node));
58258632Savg	tx->tx_start = gethrtime();
59168404Spjd	return (tx);
60168404Spjd}
61168404Spjd
62168404Spjddmu_tx_t *
63168404Spjddmu_tx_create(objset_t *os)
64168404Spjd{
65219089Spjd	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
66168404Spjd	tx->tx_objset = os;
67168404Spjd	return (tx);
68168404Spjd}
69168404Spjd
70168404Spjddmu_tx_t *
71168404Spjddmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
72168404Spjd{
73168404Spjd	dmu_tx_t *tx = dmu_tx_create_dd(NULL);
74168404Spjd
75168404Spjd	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
76168404Spjd	tx->tx_pool = dp;
77168404Spjd	tx->tx_txg = txg;
78168404Spjd	tx->tx_anyobj = TRUE;
79168404Spjd
80168404Spjd	return (tx);
81168404Spjd}
82168404Spjd
83168404Spjdint
84168404Spjddmu_tx_is_syncing(dmu_tx_t *tx)
85168404Spjd{
86168404Spjd	return (tx->tx_anyobj);
87168404Spjd}
88168404Spjd
89168404Spjdint
90168404Spjddmu_tx_private_ok(dmu_tx_t *tx)
91168404Spjd{
92168404Spjd	return (tx->tx_anyobj);
93168404Spjd}
94168404Spjd
95168404Spjdstatic dmu_tx_hold_t *
96321549Smavdmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
97321549Smav    uint64_t arg1, uint64_t arg2)
98168404Spjd{
99168404Spjd	dmu_tx_hold_t *txh;
100168404Spjd
101321549Smav	if (dn != NULL) {
102321549Smav		(void) refcount_add(&dn->dn_holds, tx);
103321549Smav		if (tx->tx_txg != 0) {
104168404Spjd			mutex_enter(&dn->dn_mtx);
105168404Spjd			/*
106168404Spjd			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
107168404Spjd			 * problem, but there's no way for it to happen (for
108168404Spjd			 * now, at least).
109168404Spjd			 */
110168404Spjd			ASSERT(dn->dn_assigned_txg == 0);
111168404Spjd			dn->dn_assigned_txg = tx->tx_txg;
112168404Spjd			(void) refcount_add(&dn->dn_tx_holds, tx);
113168404Spjd			mutex_exit(&dn->dn_mtx);
114168404Spjd		}
115168404Spjd	}
116168404Spjd
117168404Spjd	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
118168404Spjd	txh->txh_tx = tx;
119168404Spjd	txh->txh_dnode = dn;
120307049Smav	refcount_create(&txh->txh_space_towrite);
121307049Smav	refcount_create(&txh->txh_memory_tohold);
122168404Spjd	txh->txh_type = type;
123168404Spjd	txh->txh_arg1 = arg1;
124168404Spjd	txh->txh_arg2 = arg2;
125168404Spjd	list_insert_tail(&tx->tx_holds, txh);
126168404Spjd
127168404Spjd	return (txh);
128168404Spjd}
129168404Spjd
130321549Smavstatic dmu_tx_hold_t *
131321549Smavdmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
132321549Smav    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
133321549Smav{
134321549Smav	dnode_t *dn = NULL;
135321549Smav	dmu_tx_hold_t *txh;
136321549Smav	int err;
137321549Smav
138321549Smav	if (object != DMU_NEW_OBJECT) {
139321549Smav		err = dnode_hold(os, object, FTAG, &dn);
140321549Smav		if (err != 0) {
141321549Smav			tx->tx_err = err;
142321549Smav			return (NULL);
143321549Smav		}
144321549Smav	}
145321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
146321549Smav	if (dn != NULL)
147321549Smav		dnode_rele(dn, FTAG);
148321549Smav	return (txh);
149321549Smav}
150321549Smav
151168404Spjdvoid
152321549Smavdmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
153168404Spjd{
154168404Spjd	/*
155168404Spjd	 * If we're syncing, they can manipulate any object anyhow, and
156168404Spjd	 * the hold on the dnode_t can cause problems.
157168404Spjd	 */
158321549Smav	if (!dmu_tx_is_syncing(tx))
159321549Smav		(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
160168404Spjd}
161168404Spjd
162321547Smav/*
163321547Smav * This function reads specified data from disk.  The specified data will
164321547Smav * be needed to perform the transaction -- i.e., it will be read after
165321547Smav * we do dmu_tx_assign().  There are two reasons that we read the data now
166321547Smav * (before dmu_tx_assign()):
167321547Smav *
168321547Smav * 1. Reading it now has potentially better performance.  The transaction
169321547Smav * has not yet been assigned, so the TXG is not held open, and also the
170321547Smav * caller typically has fewer locks held when calling dmu_tx_hold_*() than
171321547Smav * after the transaction has been assigned.  This reduces the lock (and txg)
172321547Smav * hold times, thus reducing lock contention.
173321547Smav *
174321547Smav * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
175321547Smav * that are detected before they start making changes to the DMU state
176321547Smav * (i.e. now).  Once the transaction has been assigned, and some DMU
177321547Smav * state has been changed, it can be difficult to recover from an i/o
178321547Smav * error (e.g. to undo the changes already made in memory at the DMU
179321547Smav * layer).  Typically code to do so does not exist in the caller -- it
180321547Smav * assumes that the data has already been cached and thus i/o errors are
181321547Smav * not possible.
182321547Smav *
183321547Smav * It has been observed that the i/o initiated here can be a performance
184321547Smav * problem, and it appears to be optional, because we don't look at the
185321547Smav * data which is read.  However, removing this read would only serve to
186321547Smav * move the work elsewhere (after the dmu_tx_assign()), where it may
187321547Smav * have a greater impact on performance (in addition to the impact on
188321547Smav * fault tolerance noted above).
189321547Smav */
190168404Spjdstatic int
191168404Spjddmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
192168404Spjd{
193168404Spjd	int err;
194168404Spjd	dmu_buf_impl_t *db;
195168404Spjd
196168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
197168404Spjd	db = dbuf_hold_level(dn, level, blkid, FTAG);
198168404Spjd	rw_exit(&dn->dn_struct_rwlock);
199168404Spjd	if (db == NULL)
200249195Smm		return (SET_ERROR(EIO));
201185029Spjd	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
202168404Spjd	dbuf_rele(db, FTAG);
203168404Spjd	return (err);
204168404Spjd}
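
/*
 * Illustrative sketch (not part of this file): the usual flow a DMU
 * consumer follows, showing where the reads issued via dmu_tx_check_ioerr()
 * happen.  The object, offset, length and buffer below are hypothetical.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);   (reads may be issued here)
 *	error = dmu_tx_assign(tx, TXG_WAIT);       (returns any latched tx_err)
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);
 *
 * Any i/o error detected while establishing the holds is stored in
 * tx->tx_err and reported by dmu_tx_assign(), before any DMU state has
 * been modified.
 */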
205168404Spjd
206168404Spjd/* ARGSUSED */
207168404Spjdstatic void
208168404Spjddmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
209168404Spjd{
210168404Spjd	dnode_t *dn = txh->txh_dnode;
211168404Spjd	int err = 0;
212168404Spjd
213168404Spjd	if (len == 0)
214168404Spjd		return;
215168404Spjd
216321547Smav	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);
217168404Spjd
218321547Smav	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
219321547Smav		err = SET_ERROR(EFBIG);
220168404Spjd
221321547Smav	if (dn == NULL)
222321547Smav		return;
223168404Spjd
224321547Smav	/*
225321547Smav	 * For i/o error checking, read the blocks that will be needed
226321547Smav	 * to perform the write: the first and last level-0 blocks (if
227321547Smav	 * they are not aligned, i.e. if they are partial-block writes),
228321547Smav	 * and all the level-1 blocks.
229321547Smav	 */
230321547Smav	if (dn->dn_maxblkid == 0) {
231321547Smav		if (off < dn->dn_datablksz &&
232321547Smav		    (off > 0 || len < dn->dn_datablksz)) {
233321547Smav			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
234321547Smav			if (err != 0) {
235321547Smav				txh->txh_tx->tx_err = err;
236168404Spjd			}
237321547Smav		}
238321547Smav	} else {
239321547Smav		zio_t *zio = zio_root(dn->dn_objset->os_spa,
240321547Smav		    NULL, NULL, ZIO_FLAG_CANFAIL);
241168404Spjd
242321547Smav		/* first level-0 block */
243321547Smav		uint64_t start = off >> dn->dn_datablkshift;
244321547Smav		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
245321547Smav			err = dmu_tx_check_ioerr(zio, dn, 0, start);
246321547Smav			if (err != 0) {
247321547Smav				txh->txh_tx->tx_err = err;
248168404Spjd			}
249168404Spjd		}
250168404Spjd
251321547Smav		/* last level-0 block */
252321547Smav		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
253321547Smav		if (end != start && end <= dn->dn_maxblkid &&
254321547Smav		    P2PHASE(off + len, dn->dn_datablksz)) {
255321547Smav			err = dmu_tx_check_ioerr(zio, dn, 0, end);
256321547Smav			if (err != 0) {
257219089Spjd				txh->txh_tx->tx_err = err;
258209962Smm			}
259321547Smav		}
260219089Spjd
261321547Smav		/* level-1 blocks */
262321547Smav		if (dn->dn_nlevels > 1) {
263321547Smav			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
264321547Smav			for (uint64_t i = (start >> shft) + 1;
265321547Smav			    i < end >> shft; i++) {
266321547Smav				err = dmu_tx_check_ioerr(zio, dn, 1, i);
267321547Smav				if (err != 0) {
268321547Smav					txh->txh_tx->tx_err = err;
269307049Smav				}
270209962Smm			}
271209962Smm		}
272168404Spjd
273321547Smav		err = zio_wait(zio);
274321547Smav		if (err != 0) {
275321547Smav			txh->txh_tx->tx_err = err;
276209962Smm		}
277168404Spjd	}
278168404Spjd}
279168404Spjd
280168404Spjdstatic void
281168404Spjddmu_tx_count_dnode(dmu_tx_hold_t *txh)
282168404Spjd{
283321547Smav	(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
284168404Spjd}
285168404Spjd
286168404Spjdvoid
287168404Spjddmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
288168404Spjd{
289168404Spjd	dmu_tx_hold_t *txh;
290168404Spjd
291321547Smav	ASSERT0(tx->tx_txg);
292321547Smav	ASSERT3U(len, <=, DMU_MAX_ACCESS);
293168404Spjd	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
294168404Spjd
295168404Spjd	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
296168404Spjd	    object, THT_WRITE, off, len);
297321549Smav	if (txh != NULL) {
298321549Smav		dmu_tx_count_write(txh, off, len);
299321549Smav		dmu_tx_count_dnode(txh);
300321549Smav	}
301321549Smav}
302168404Spjd
303321549Smavvoid
304321549Smavdmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
305321549Smav{
306321549Smav	dmu_tx_hold_t *txh;
307321549Smav
308321549Smav	ASSERT0(tx->tx_txg);
309321549Smav	ASSERT3U(len, <=, DMU_MAX_ACCESS);
310321549Smav	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
311321549Smav
312321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
313321549Smav	if (txh != NULL) {
314321549Smav		dmu_tx_count_write(txh, off, len);
315321549Smav		dmu_tx_count_dnode(txh);
316321549Smav	}
317168404Spjd}
318168404Spjd
319268464Sdelphij/*
320268464Sdelphij * This function marks the transaction as being a "net free".  The end
321268464Sdelphij * result is that refquotas will be disabled for this transaction, and
322268464Sdelphij * this transaction will be able to use half of the pool space overhead
323268464Sdelphij * (see dsl_pool_adjustedsize()).  Therefore this function should only
324268464Sdelphij * be called for transactions that we expect will not cause a net increase
325268464Sdelphij * in the amount of space used (but it's OK if that is occasionally not true).
326268464Sdelphij */
327168404Spjdvoid
328268464Sdelphijdmu_tx_mark_netfree(dmu_tx_t *tx)
329268464Sdelphij{
330321547Smav	tx->tx_netfree = B_TRUE;
331268464Sdelphij}
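
/*
 * Illustrative sketch (hypothetical caller, not from this file): a
 * transaction that only frees space would typically be marked net-free
 * right after its holds are set up, e.g. when freeing an entire object:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 *	dmu_tx_mark_netfree(tx);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	...
 *
 * With tx_netfree set, dmu_tx_try_assign() passes B_TRUE as the netfree
 * argument to dsl_dir_tempreserve_space(), which is what disables the
 * refquota enforcement described above.
 */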
332268464Sdelphij
333321549Smavstatic void
334321549Smavdmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
335168404Spjd{
336321549Smav	dmu_tx_t *tx;
337321549Smav	dnode_t *dn;
338253821Sdelphij	int err;
340168404Spjd
341321549Smav	tx = txh->txh_tx;
342168404Spjd	ASSERT(tx->tx_txg == 0);
343168404Spjd
344321549Smav	dn = txh->txh_dnode;
345258632Savg	dmu_tx_count_dnode(txh);
346168404Spjd
347321547Smav	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
348168404Spjd		return;
349168404Spjd	if (len == DMU_OBJECT_END)
350321547Smav		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
351168404Spjd
353168404Spjd	/*
354253821Sdelphij	 * For i/o error checking, we read the first and last level-0
355253821Sdelphij	 * blocks if they are not aligned, and all the level-1 blocks.
356253821Sdelphij	 *
357253821Sdelphij	 * Note:  dbuf_free_range() assumes that we have not instantiated
358253821Sdelphij	 * any level-0 dbufs that will be completely freed.  Therefore we must
359253821Sdelphij	 * exercise care to not read or count the first and last blocks
360253821Sdelphij	 * if they are blocksize-aligned.
361168404Spjd	 */
362253821Sdelphij	if (dn->dn_datablkshift == 0) {
363254753Sdelphij		if (off != 0 || len < dn->dn_datablksz)
364256259Savg			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
365253821Sdelphij	} else {
366253821Sdelphij		/* first block will be modified if it is not aligned */
367253821Sdelphij		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
368253821Sdelphij			dmu_tx_count_write(txh, off, 1);
369253821Sdelphij		/* last block will be modified if it is not aligned */
370253821Sdelphij		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
371321547Smav			dmu_tx_count_write(txh, off + len, 1);
372253821Sdelphij	}
373253821Sdelphij
374253821Sdelphij	/*
375253821Sdelphij	 * Check level-1 blocks.
376253821Sdelphij	 */
377168404Spjd	if (dn->dn_nlevels > 1) {
378253821Sdelphij		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
379168404Spjd		    SPA_BLKPTRSHIFT;
380253821Sdelphij		uint64_t start = off >> shift;
381253821Sdelphij		uint64_t end = (off + len) >> shift;
382168404Spjd
383253821Sdelphij		ASSERT(dn->dn_indblkshift != 0);
384253821Sdelphij
385259576Spjd		/*
386259576Spjd		 * dnode_reallocate() can result in an object with indirect
387259576Spjd		 * blocks having an odd data block size.  In this case,
388259576Spjd		 * just check the single block.
389259576Spjd		 */
390259576Spjd		if (dn->dn_datablkshift == 0)
391259576Spjd			start = end = 0;
392259576Spjd
393321547Smav		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
394168404Spjd		    NULL, NULL, ZIO_FLAG_CANFAIL);
395253821Sdelphij		for (uint64_t i = start; i <= end; i++) {
396168404Spjd			uint64_t ibyte = i << shift;
397185029Spjd			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
398168404Spjd			i = ibyte >> shift;
399284593Savg			if (err == ESRCH || i > end)
400168404Spjd				break;
401321547Smav			if (err != 0) {
402168404Spjd				tx->tx_err = err;
403321547Smav				(void) zio_wait(zio);
404168404Spjd				return;
405168404Spjd			}
406168404Spjd
407321547Smav			(void) refcount_add_many(&txh->txh_memory_tohold,
408321547Smav			    1 << dn->dn_indblkshift, FTAG);
409321547Smav
410168404Spjd			err = dmu_tx_check_ioerr(zio, dn, 1, i);
411321547Smav			if (err != 0) {
412168404Spjd				tx->tx_err = err;
413321547Smav				(void) zio_wait(zio);
414168404Spjd				return;
415168404Spjd			}
416168404Spjd		}
417168404Spjd		err = zio_wait(zio);
418321547Smav		if (err != 0) {
419168404Spjd			tx->tx_err = err;
420168404Spjd			return;
421168404Spjd		}
422168404Spjd	}
423168404Spjd}
424168404Spjd
425168404Spjdvoid
426321549Smavdmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
427168404Spjd{
428321549Smav	dmu_tx_hold_t *txh;
429321549Smav
430321549Smav	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
431321549Smav	    object, THT_FREE, off, len);
432321549Smav	if (txh != NULL)
433321549Smav		(void) dmu_tx_hold_free_impl(txh, off, len);
434321549Smav}
435321549Smav
436321549Smavvoid
437321549Smavdmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
438321549Smav{
439321549Smav	dmu_tx_hold_t *txh;
440321549Smav
441321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
442321549Smav	if (txh != NULL)
443321549Smav		(void) dmu_tx_hold_free_impl(txh, off, len);
444321549Smav}
445321549Smav
446321549Smavstatic void
447321549Smavdmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, int add, const char *name)
448321549Smav{
449321549Smav	dmu_tx_t *tx = txh->txh_tx;
450321549Smav	dnode_t *dn;
451307049Smav	int err;
452168404Spjd
453168404Spjd	ASSERT(tx->tx_txg == 0);
454168404Spjd
455321549Smav	dn = txh->txh_dnode;
456168404Spjd
457168404Spjd	dmu_tx_count_dnode(txh);
458168404Spjd
459321547Smav	/*
460321547Smav	 * Modifying an almost-full microzap is around the worst case (128KB).
461321547Smav	 *
462321547Smav	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
463321547Smav	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
464321547Smav	 * - 4 new blocks written if adding:
465321547Smav	 *    - 2 blocks for possibly split leaves,
466321547Smav	 *    - 2 grown ptrtbl blocks
467321547Smav	 */
468321547Smav	(void) refcount_add_many(&txh->txh_space_towrite,
469321547Smav	    MZAP_MAX_BLKSZ, FTAG);
470321547Smav
471321547Smav	if (dn == NULL)
472168404Spjd		return;
473168404Spjd
474236884Smm	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
475168404Spjd
476321547Smav	if (dn->dn_maxblkid == 0 || name == NULL) {
477168404Spjd		/*
478321547Smav		 * This is a microzap (only one block), or we don't know
479321547Smav		 * the name.  Check the first block for i/o errors.
480168404Spjd		 */
481168404Spjd		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
482321547Smav		if (err != 0) {
483168404Spjd			tx->tx_err = err;
484168404Spjd		}
485321547Smav	} else {
486168404Spjd		/*
487321547Smav		 * Access the name so that we'll check for i/o errors to
488321547Smav		 * the leaf blocks, etc.  We ignore ENOENT, as this name
489321547Smav		 * may not yet exist.
490168404Spjd		 */
491307290Smav		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
492321547Smav		if (err == EIO || err == ECKSUM || err == ENXIO) {
493168404Spjd			tx->tx_err = err;
494168404Spjd		}
495168404Spjd	}
496168404Spjd}
497168404Spjd
498168404Spjdvoid
499321549Smavdmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
500321549Smav{
501321549Smav	dmu_tx_hold_t *txh;
502321549Smav
503321549Smav	ASSERT0(tx->tx_txg);
504321549Smav
505321549Smav	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
506321549Smav	    object, THT_ZAP, add, (uintptr_t)name);
507321549Smav	if (txh != NULL)
508321549Smav		dmu_tx_hold_zap_impl(txh, add, name);
509321549Smav}
510321549Smav
511321549Smavvoid
512321549Smavdmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
513321549Smav{
514321549Smav	dmu_tx_hold_t *txh;
515321549Smav
516321549Smav	ASSERT0(tx->tx_txg);
517321549Smav	ASSERT(dn != NULL);
518321549Smav
519321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
520321549Smav	if (txh != NULL)
521321549Smav		dmu_tx_hold_zap_impl(txh, add, name);
522321549Smav}
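
/*
 * Illustrative sketch (hypothetical objects and name, not from this file):
 * a caller adding a directory entry would typically hold the directory's
 * ZAP by name together with the object being created, e.g.:
 *
 *	dmu_tx_hold_zap(tx, dir_zap_object, B_TRUE, entry_name);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *
 * Passing the name lets dmu_tx_hold_zap_impl() above probe the relevant
 * leaf blocks for i/o errors; passing NULL only checks the first block.
 */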
523321549Smav
524321549Smavvoid
525168404Spjddmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
526168404Spjd{
527168404Spjd	dmu_tx_hold_t *txh;
528168404Spjd
529168404Spjd	ASSERT(tx->tx_txg == 0);
530168404Spjd
531168404Spjd	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
532168404Spjd	    object, THT_BONUS, 0, 0);
533168404Spjd	if (txh)
534168404Spjd		dmu_tx_count_dnode(txh);
535168404Spjd}
536168404Spjd
537168404Spjdvoid
538321549Smavdmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
539321549Smav{
540321549Smav	dmu_tx_hold_t *txh;
541321549Smav
542321549Smav	ASSERT0(tx->tx_txg);
543321549Smav
544321549Smav	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
545321549Smav	if (txh)
546321549Smav		dmu_tx_count_dnode(txh);
547321549Smav}
548321549Smav
549321549Smavvoid
550168404Spjddmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
551168404Spjd{
552168404Spjd	dmu_tx_hold_t *txh;
553168404Spjd	ASSERT(tx->tx_txg == 0);
554168404Spjd
555168404Spjd	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
556168404Spjd	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
557168404Spjd
558307049Smav	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
559168404Spjd}
560168404Spjd
561168404Spjd#ifdef ZFS_DEBUG
562168404Spjdvoid
563168404Spjddmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
564168404Spjd{
565321547Smav	boolean_t match_object = B_FALSE;
566321547Smav	boolean_t match_offset = B_FALSE;
567168404Spjd
568219089Spjd	DB_DNODE_ENTER(db);
569321547Smav	dnode_t *dn = DB_DNODE(db);
570168404Spjd	ASSERT(tx->tx_txg != 0);
571219089Spjd	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
572168404Spjd	ASSERT3U(dn->dn_object, ==, db->db.db_object);
573168404Spjd
574219089Spjd	if (tx->tx_anyobj) {
575219089Spjd		DB_DNODE_EXIT(db);
576168404Spjd		return;
577219089Spjd	}
578168404Spjd
579168404Spjd	/* XXX No checking on the meta dnode for now */
580219089Spjd	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
581219089Spjd		DB_DNODE_EXIT(db);
582168404Spjd		return;
583219089Spjd	}
584168404Spjd
585321547Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
586168404Spjd	    txh = list_next(&tx->tx_holds, txh)) {
587168404Spjd		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
588168404Spjd		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
589168404Spjd			match_object = TRUE;
590168404Spjd		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
591168404Spjd			int datablkshift = dn->dn_datablkshift ?
592168404Spjd			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
593168404Spjd			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
594168404Spjd			int shift = datablkshift + epbs * db->db_level;
595168404Spjd			uint64_t beginblk = shift >= 64 ? 0 :
596168404Spjd			    (txh->txh_arg1 >> shift);
597168404Spjd			uint64_t endblk = shift >= 64 ? 0 :
598168404Spjd			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
599168404Spjd			uint64_t blkid = db->db_blkid;
600168404Spjd
601168404Spjd			/* XXX txh_arg2 better not be zero... */
602168404Spjd
603168404Spjd			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
604168404Spjd			    txh->txh_type, beginblk, endblk);
605168404Spjd
606168404Spjd			switch (txh->txh_type) {
607168404Spjd			case THT_WRITE:
608168404Spjd				if (blkid >= beginblk && blkid <= endblk)
609168404Spjd					match_offset = TRUE;
610168404Spjd				/*
611168404Spjd				 * We will let this hold work for the bonus
612219089Spjd				 * or spill buffer so that we don't need to
613219089Spjd				 * hold it when creating a new object.
614168404Spjd				 */
615219089Spjd				if (blkid == DMU_BONUS_BLKID ||
616219089Spjd				    blkid == DMU_SPILL_BLKID)
617168404Spjd					match_offset = TRUE;
618168404Spjd				/*
619168404Spjd				 * They might have to increase nlevels,
620168404Spjd				 * thus dirtying the new TLIBs.  Or they
621168404Spjd				 * might have to change the block size,
622168404Spjd				 * thus dirtying the new lvl=0 blk=0.
623168404Spjd				 */
624168404Spjd				if (blkid == 0)
625168404Spjd					match_offset = TRUE;
626168404Spjd				break;
627168404Spjd			case THT_FREE:
628185029Spjd				/*
629185029Spjd				 * We will dirty all the level 1 blocks in
630185029Spjd				 * the free range and perhaps the first and
631185029Spjd				 * last level 0 block.
632185029Spjd				 */
633185029Spjd				if (blkid >= beginblk && (blkid <= endblk ||
634185029Spjd				    txh->txh_arg2 == DMU_OBJECT_END))
635168404Spjd					match_offset = TRUE;
636168404Spjd				break;
637219089Spjd			case THT_SPILL:
638219089Spjd				if (blkid == DMU_SPILL_BLKID)
639219089Spjd					match_offset = TRUE;
640219089Spjd				break;
641168404Spjd			case THT_BONUS:
642219089Spjd				if (blkid == DMU_BONUS_BLKID)
643168404Spjd					match_offset = TRUE;
644168404Spjd				break;
645168404Spjd			case THT_ZAP:
646168404Spjd				match_offset = TRUE;
647168404Spjd				break;
648168404Spjd			case THT_NEWOBJECT:
649168404Spjd				match_object = TRUE;
650168404Spjd				break;
651168404Spjd			default:
652168404Spjd				ASSERT(!"bad txh_type");
653168404Spjd			}
654168404Spjd		}
655219089Spjd		if (match_object && match_offset) {
656219089Spjd			DB_DNODE_EXIT(db);
657168404Spjd			return;
658219089Spjd		}
659168404Spjd	}
660219089Spjd	DB_DNODE_EXIT(db);
661168404Spjd	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
662168404Spjd	    (u_longlong_t)db->db.db_object, db->db_level,
663168404Spjd	    (u_longlong_t)db->db_blkid);
664168404Spjd}
665168404Spjd#endif
666168404Spjd
667258632Savg/*
668258632Savg * If we can't do 10 iops, something is wrong.  Let us go ahead
669258632Savg * and hit zfs_dirty_data_max.
670258632Savg */
671258632Savghrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
672258632Savgint zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
673258632Savg
674258632Savg/*
675258632Savg * We delay transactions when we've determined that the backend storage
676258632Savg * isn't able to accommodate the rate of incoming writes.
677258632Savg *
678258632Savg * If there is already a transaction waiting, we delay relative to when
679258632Savg * that transaction finishes waiting.  This way the calculated min_time
680258632Savg * is independent of the number of threads concurrently executing
681258632Savg * transactions.
682258632Savg *
683258632Savg * If we are the only waiter, wait relative to when the transaction
684258632Savg * started, rather than the current time.  This credits the transaction for
685258632Savg * "time already served", e.g. reading indirect blocks.
686258632Savg *
687258632Savg * The minimum time for a transaction to take is calculated as:
688258632Savg *     min_time = scale * (dirty - min) / (max - dirty)
689258632Savg *     min_time is then capped at zfs_delay_max_ns.
690258632Savg *
691258632Savg * The delay has two degrees of freedom that can be adjusted via tunables.
692258632Savg * The percentage of dirty data at which we start to delay is defined by
693258632Savg * zfs_delay_min_dirty_percent. This should typically be at or above
694258632Savg * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
695258632Savg * delay after writing at full speed has failed to keep up with the incoming
696258632Savg * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
697258632Savg * speaking, this variable determines the amount of delay at the midpoint of
698258632Savg * the curve.
699258632Savg *
700258632Savg * delay
701258632Savg *  10ms +-------------------------------------------------------------*+
702258632Savg *       |                                                             *|
703258632Savg *   9ms +                                                             *+
704258632Savg *       |                                                             *|
705258632Savg *   8ms +                                                             *+
706258632Savg *       |                                                            * |
707258632Savg *   7ms +                                                            * +
708258632Savg *       |                                                            * |
709258632Savg *   6ms +                                                            * +
710258632Savg *       |                                                            * |
711258632Savg *   5ms +                                                           *  +
712258632Savg *       |                                                           *  |
713258632Savg *   4ms +                                                           *  +
714258632Savg *       |                                                           *  |
715258632Savg *   3ms +                                                          *   +
716258632Savg *       |                                                          *   |
717258632Savg *   2ms +                                              (midpoint) *    +
718258632Savg *       |                                                  |    **     |
719258632Savg *   1ms +                                                  v ***       +
720258632Savg *       |             zfs_delay_scale ---------->     ********         |
721258632Savg *     0 +-------------------------------------*********----------------+
722258632Savg *       0%                    <- zfs_dirty_data_max ->               100%
723258632Savg *
724258632Savg * Note that since the delay is added to the outstanding time remaining on the
725258632Savg * most recent transaction, the delay is effectively the inverse of IOPS.
726258632Savg * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
727258632Savg * was chosen such that small changes in the amount of accumulated dirty data
728258632Savg * in the first 3/4 of the curve yield relatively small differences in the
729258632Savg * amount of delay.
730258632Savg *
731258632Savg * The effects can be easier to understand when the amount of delay is
732258632Savg * represented on a log scale:
733258632Savg *
734258632Savg * delay
735258632Savg * 100ms +-------------------------------------------------------------++
736258632Savg *       +                                                              +
737258632Savg *       |                                                              |
738258632Savg *       +                                                             *+
739258632Savg *  10ms +                                                             *+
740258632Savg *       +                                                           ** +
741258632Savg *       |                                              (midpoint)  **  |
742258632Savg *       +                                                  |     **    +
743258632Savg *   1ms +                                                  v ****      +
744258632Savg *       +             zfs_delay_scale ---------->        *****         +
745258632Savg *       |                                             ****             |
746258632Savg *       +                                          ****                +
747258632Savg * 100us +                                        **                    +
748258632Savg *       +                                       *                      +
749258632Savg *       |                                      *                       |
750258632Savg *       +                                     *                        +
751258632Savg *  10us +                                     *                        +
752258632Savg *       +                                                              +
753258632Savg *       |                                                              |
754258632Savg *       +                                                              +
755258632Savg *       +--------------------------------------------------------------+
756258632Savg *       0%                    <- zfs_dirty_data_max ->               100%
757258632Savg *
758258632Savg * Note here that only as the amount of dirty data approaches its limit does
759258632Savg * the delay start to increase rapidly. The goal of a properly tuned system
760258632Savg * should be to keep the amount of dirty data out of that range by first
761258632Savg * ensuring that the appropriate limits are set for the I/O scheduler to reach
762258632Savg * optimal throughput on the backend storage, and then by changing the value
763258632Savg * of zfs_delay_scale to increase the steepness of the curve.
764258632Savg */
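/*
 * Worked example of the min_time formula above (all tunable values here are
 * assumed only for illustration): with zfs_dirty_data_max = 4GB and
 * zfs_delay_min_dirty_percent = 60, delay_min_bytes is 2.4GB.  If a
 * transaction arrives when 3.2GB is dirty, then with zfs_delay_scale at its
 * default of 500,000:
 *
 *	min_tx_time = 500000 * (3.2G - 2.4G) / (4G - 3.2G) = 500000ns = 500us
 *
 * which is the midpoint of the curve, i.e. roughly 2000 write transactions
 * per second as noted above.
 */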
765258632Savgstatic void
766258632Savgdmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
767258632Savg{
768258632Savg	dsl_pool_t *dp = tx->tx_pool;
769258632Savg	uint64_t delay_min_bytes =
770258632Savg	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
771258632Savg	hrtime_t wakeup, min_tx_time, now;
772258632Savg
773258632Savg	if (dirty <= delay_min_bytes)
774258632Savg		return;
775258632Savg
776258632Savg	/*
777258632Savg	 * The caller has already waited until we are under the max.
778258632Savg	 * We make them pass us the amount of dirty data so we don't
779258632Savg	 * have to handle the case of it being >= the max, which could
780258632Savg	 * cause a divide-by-zero if it's == the max.
781258632Savg	 */
782258632Savg	ASSERT3U(dirty, <, zfs_dirty_data_max);
783258632Savg
784258632Savg	now = gethrtime();
785258632Savg	min_tx_time = zfs_delay_scale *
786258632Savg	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
787258632Savg	if (now > tx->tx_start + min_tx_time)
788258632Savg		return;
789258632Savg
790258632Savg	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
791258632Savg
792258632Savg	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
793258632Savg	    uint64_t, min_tx_time);
794258632Savg
795258632Savg	mutex_enter(&dp->dp_lock);
796258632Savg	wakeup = MAX(tx->tx_start + min_tx_time,
797258632Savg	    dp->dp_last_wakeup + min_tx_time);
798258632Savg	dp->dp_last_wakeup = wakeup;
799258632Savg	mutex_exit(&dp->dp_lock);
800258632Savg
801258632Savg#ifdef _KERNEL
802258632Savg#ifdef illumos
803258632Savg	mutex_enter(&curthread->t_delay_lock);
804258632Savg	while (cv_timedwait_hires(&curthread->t_delay_cv,
805258632Savg	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
806258632Savg	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
807258632Savg		continue;
808258632Savg	mutex_exit(&curthread->t_delay_lock);
809258632Savg#else
810258632Savg	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
811258632Savg	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
812258632Savg#endif
813258632Savg#else
814258632Savg	hrtime_t delta = wakeup - gethrtime();
815258632Savg	struct timespec ts;
816258632Savg	ts.tv_sec = delta / NANOSEC;
817258632Savg	ts.tv_nsec = delta % NANOSEC;
818258632Savg	(void) nanosleep(&ts, NULL);
819258632Savg#endif
820258632Savg}
821258632Savg
822321547Smav/*
823321547Smav * This routine attempts to assign the transaction to a transaction group.
824321547Smav * To do so, we must determine if there is sufficient free space on disk.
825321547Smav *
826321547Smav * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
827321547Smav * on it), then it is assumed that there is sufficient free space,
828321547Smav * unless there's insufficient slop space in the pool (see the comment
829321547Smav * above spa_slop_shift in spa_misc.c).
830321547Smav *
831321547Smav * If it is not a "netfree" transaction, then if the data already on disk
832321547Smav * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
833321547Smav * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
834321547Smav * plus the rough estimate of this transaction's changes, may exceed the
835321547Smav * allowed usage, then this will fail with ERESTART, which will cause the
836321547Smav * caller to wait for the pending changes to be written to disk (by waiting
837321547Smav * for the next TXG to open), and then check the space usage again.
838321547Smav *
839321547Smav * The rough estimate of pending changes is comprised of the sum of:
840321547Smav *
841321547Smav *  - this transaction's holds' txh_space_towrite
842321547Smav *
843321547Smav *  - dd_tempreserved[], which is the sum of in-flight transactions'
844321547Smav *    holds' txh_space_towrite (i.e. those transactions that have called
845321547Smav *    dmu_tx_assign() but not yet called dmu_tx_commit()).
846321547Smav *
847321547Smav *  - dd_space_towrite[], which is the amount of dirtied dbufs.
848321547Smav *
849321547Smav * Note that all of these values are inflated by spa_get_worst_case_asize(),
850321547Smav * which means that we may get ERESTART well before we are actually in danger
851321547Smav * of running out of space, but this also mitigates any small inaccuracies
852321547Smav * in the rough estimate (e.g. txh_space_towrite doesn't take into account
853321547Smav * indirect blocks, and dd_space_towrite[] doesn't take into account changes
854321547Smav * to the MOS).
855321547Smav *
856321547Smav * Note that due to this algorithm, it is possible to exceed the allowed
857321547Smav * usage by one transaction.  Also, as we approach the allowed usage,
858321547Smav * we will allow a very limited amount of changes into each TXG, thus
859321547Smav * decreasing performance.
860321547Smav */
861168404Spjdstatic int
862248571Smmdmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
863168404Spjd{
864185029Spjd	spa_t *spa = tx->tx_pool->dp_spa;
865168404Spjd
866240415Smm	ASSERT0(tx->tx_txg);
867185029Spjd
868168404Spjd	if (tx->tx_err)
869168404Spjd		return (tx->tx_err);
870168404Spjd
871185029Spjd	if (spa_suspended(spa)) {
872185029Spjd		/*
873185029Spjd		 * If the user has indicated a blocking failure mode
874185029Spjd		 * then return ERESTART which will block in dmu_tx_wait().
875185029Spjd		 * Otherwise, return EIO so that an error can get
876185029Spjd		 * propagated back to the VOP calls.
877185029Spjd		 *
878185029Spjd		 * Note that we always honor the txg_how flag regardless
879185029Spjd		 * of the failuremode setting.
880185029Spjd		 */
881185029Spjd		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
882185029Spjd		    txg_how != TXG_WAIT)
883249195Smm			return (SET_ERROR(EIO));
884185029Spjd
885249195Smm		return (SET_ERROR(ERESTART));
886185029Spjd	}
887185029Spjd
888258632Savg	if (!tx->tx_waited &&
889258632Savg	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
890258632Savg		tx->tx_wait_dirty = B_TRUE;
891258632Savg		return (SET_ERROR(ERESTART));
892258632Savg	}
893258632Savg
894168404Spjd	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
895168404Spjd	tx->tx_needassign_txh = NULL;
896168404Spjd
897168404Spjd	/*
898168404Spjd	 * NB: No error returns are allowed after txg_hold_open, but
899168404Spjd	 * before processing the dnode holds, due to the
900168404Spjd	 * dmu_tx_unassign() logic.
901168404Spjd	 */
902168404Spjd
903321547Smav	uint64_t towrite = 0;
904321547Smav	uint64_t tohold = 0;
905321547Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
906168404Spjd	    txh = list_next(&tx->tx_holds, txh)) {
907168404Spjd		dnode_t *dn = txh->txh_dnode;
908168404Spjd		if (dn != NULL) {
909168404Spjd			mutex_enter(&dn->dn_mtx);
910168404Spjd			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
911168404Spjd				mutex_exit(&dn->dn_mtx);
912168404Spjd				tx->tx_needassign_txh = txh;
913249195Smm				return (SET_ERROR(ERESTART));
914168404Spjd			}
915168404Spjd			if (dn->dn_assigned_txg == 0)
916168404Spjd				dn->dn_assigned_txg = tx->tx_txg;
917168404Spjd			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
918168404Spjd			(void) refcount_add(&dn->dn_tx_holds, tx);
919168404Spjd			mutex_exit(&dn->dn_mtx);
920168404Spjd		}
921307049Smav		towrite += refcount_count(&txh->txh_space_towrite);
922307049Smav		tohold += refcount_count(&txh->txh_memory_tohold);
923168404Spjd	}
924168404Spjd
925185029Spjd	/* needed allocation: worst-case estimate of write space */
926321547Smav	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
927185029Spjd	/* calculate memory footprint estimate */
928321547Smav	uint64_t memory = towrite + tohold;
929168404Spjd
930321547Smav	if (tx->tx_dir != NULL && asize != 0) {
931185029Spjd		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
932321547Smav		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
933321547Smav		if (err != 0)
934168404Spjd			return (err);
935168404Spjd	}
936168404Spjd
937168404Spjd	return (0);
938168404Spjd}
939168404Spjd
940168404Spjdstatic void
941168404Spjddmu_tx_unassign(dmu_tx_t *tx)
942168404Spjd{
943168404Spjd	if (tx->tx_txg == 0)
944168404Spjd		return;
945168404Spjd
946168404Spjd	txg_rele_to_quiesce(&tx->tx_txgh);
947168404Spjd
948251629Sdelphij	/*
949251629Sdelphij	 * Walk the transaction's hold list, removing the hold on the
950251629Sdelphij	 * associated dnode, and notifying waiters if the refcount drops to 0.
951251629Sdelphij	 */
952321547Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
953321547Smav	    txh != tx->tx_needassign_txh;
954168404Spjd	    txh = list_next(&tx->tx_holds, txh)) {
955168404Spjd		dnode_t *dn = txh->txh_dnode;
956168404Spjd
957168404Spjd		if (dn == NULL)
958168404Spjd			continue;
959168404Spjd		mutex_enter(&dn->dn_mtx);
960168404Spjd		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
961168404Spjd
962168404Spjd		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
963168404Spjd			dn->dn_assigned_txg = 0;
964168404Spjd			cv_broadcast(&dn->dn_notxholds);
965168404Spjd		}
966168404Spjd		mutex_exit(&dn->dn_mtx);
967168404Spjd	}
968168404Spjd
969168404Spjd	txg_rele_to_sync(&tx->tx_txgh);
970168404Spjd
971168404Spjd	tx->tx_lasttried_txg = tx->tx_txg;
972168404Spjd	tx->tx_txg = 0;
973168404Spjd}
974168404Spjd
975168404Spjd/*
976168404Spjd * Assign tx to a transaction group.  txg_how can be one of:
977168404Spjd *
978168404Spjd * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
979168404Spjd *	a new one.  This should be used when you're not holding locks.
980248571Smm *	It will only fail if we're truly out of space (or over quota).
981168404Spjd *
982168404Spjd * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
983168404Spjd *	blocking, returns immediately with ERESTART.  This should be used
984168404Spjd *	whenever you're holding locks.  On an ERESTART error, the caller
985168404Spjd *	should drop locks, do a dmu_tx_wait(tx), and try again.
986258632Savg *
987258632Savg * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
988258632Savg *      has already been called on behalf of this operation (though
989258632Savg *      most likely on a different tx).
990168404Spjd */
991168404Spjdint
992248571Smmdmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
993168404Spjd{
994168404Spjd	int err;
995168404Spjd
996168404Spjd	ASSERT(tx->tx_txg == 0);
997258632Savg	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
998258632Savg	    txg_how == TXG_WAITED);
999168404Spjd	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1000168404Spjd
1001248571Smm	/* If we might wait, we must not hold the config lock. */
1002248571Smm	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1003248571Smm
1004258632Savg	if (txg_how == TXG_WAITED)
1005258632Savg		tx->tx_waited = B_TRUE;
1006258632Savg
1007168404Spjd	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1008168404Spjd		dmu_tx_unassign(tx);
1009168404Spjd
1010168404Spjd		if (err != ERESTART || txg_how != TXG_WAIT)
1011168404Spjd			return (err);
1012168404Spjd
1013168404Spjd		dmu_tx_wait(tx);
1014168404Spjd	}
1015168404Spjd
1016168404Spjd	txg_rele_to_quiesce(&tx->tx_txgh);
1017168404Spjd
1018168404Spjd	return (0);
1019168404Spjd}
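
/*
 * Illustrative sketch (hypothetical caller, not from this file): the
 * retry loop implied by cases (2) and (3) above, for callers that hold
 * locks which must be dropped before waiting:
 *
 *	boolean_t waited = B_FALSE;
 * top:
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_...(tx, ...);
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error == ERESTART) {
 *		(drop locks)
 *		dmu_tx_wait(tx);
 *		dmu_tx_abort(tx);
 *		waited = B_TRUE;
 *		(reacquire locks)
 *		goto top;
 *	} else if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	(make changes)
 *	dmu_tx_commit(tx);
 */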
1020168404Spjd
1021168404Spjdvoid
1022168404Spjddmu_tx_wait(dmu_tx_t *tx)
1023168404Spjd{
1024185029Spjd	spa_t *spa = tx->tx_pool->dp_spa;
1025258632Savg	dsl_pool_t *dp = tx->tx_pool;
1026185029Spjd
1027168404Spjd	ASSERT(tx->tx_txg == 0);
1028248571Smm	ASSERT(!dsl_pool_config_held(tx->tx_pool));
1029168404Spjd
1030258632Savg	if (tx->tx_wait_dirty) {
1031258632Savg		/*
1032258632Savg		 * dmu_tx_try_assign() has determined that we need to wait
1033258632Savg		 * because we've consumed much or all of the dirty buffer
1034258632Savg		 * space.
1035258632Savg		 */
1036258632Savg		mutex_enter(&dp->dp_lock);
1037258632Savg		while (dp->dp_dirty_total >= zfs_dirty_data_max)
1038258632Savg			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1039258632Savg		uint64_t dirty = dp->dp_dirty_total;
1040258632Savg		mutex_exit(&dp->dp_lock);
1041258632Savg
1042258632Savg		dmu_tx_delay(tx, dirty);
1043258632Savg
1044258632Savg		tx->tx_wait_dirty = B_FALSE;
1045258632Savg
1046258632Savg		/*
1047258632Savg		 * Note: setting tx_waited only has effect if the caller
1048258632Savg		 * used TXG_WAIT.  Otherwise they are going to destroy
1049258632Savg		 * this tx and try again.  The common case, zfs_write(),
1050258632Savg		 * uses TXG_WAIT.
1051258632Savg		 */
1052258632Savg		tx->tx_waited = B_TRUE;
1053258632Savg	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1054258632Savg		/*
1055258632Savg		 * If the pool is suspended we need to wait until it
1056258632Savg		 * is resumed.  Note that it's possible that the pool
1057258632Savg		 * has become active after this thread has tried to
1058258632Savg		 * obtain a tx.  If that's the case then tx_lasttried_txg
1059258632Savg		 * would not have been set.
1060258632Savg		 */
1061258632Savg		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1062185029Spjd	} else if (tx->tx_needassign_txh) {
1063258632Savg		/*
1064258632Savg		 * A dnode is assigned to the quiescing txg.  Wait for its
1065258632Savg		 * transaction to complete.
1066258632Savg		 */
1067168404Spjd		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1068168404Spjd
1069168404Spjd		mutex_enter(&dn->dn_mtx);
1070168404Spjd		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1071168404Spjd			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1072168404Spjd		mutex_exit(&dn->dn_mtx);
1073168404Spjd		tx->tx_needassign_txh = NULL;
1074168404Spjd	} else {
1075168404Spjd		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1076168404Spjd	}
1077168404Spjd}
1078168404Spjd
1079307049Smavstatic void
1080307049Smavdmu_tx_destroy(dmu_tx_t *tx)
1081307049Smav{
1082307049Smav	dmu_tx_hold_t *txh;
1083307049Smav
1084307049Smav	while ((txh = list_head(&tx->tx_holds)) != NULL) {
1085307049Smav		dnode_t *dn = txh->txh_dnode;
1086307049Smav
1087307049Smav		list_remove(&tx->tx_holds, txh);
1088307049Smav		refcount_destroy_many(&txh->txh_space_towrite,
1089307049Smav		    refcount_count(&txh->txh_space_towrite));
1090307049Smav		refcount_destroy_many(&txh->txh_memory_tohold,
1091307049Smav		    refcount_count(&txh->txh_memory_tohold));
1092307049Smav		kmem_free(txh, sizeof (dmu_tx_hold_t));
1093307049Smav		if (dn != NULL)
1094307049Smav			dnode_rele(dn, tx);
1095307049Smav	}
1096307049Smav
1097307049Smav	list_destroy(&tx->tx_callbacks);
1098307049Smav	list_destroy(&tx->tx_holds);
1099307049Smav	kmem_free(tx, sizeof (dmu_tx_t));
1100307049Smav}
1101307049Smav
1102168404Spjdvoid
1103168404Spjddmu_tx_commit(dmu_tx_t *tx)
1104168404Spjd{
1105168404Spjd	ASSERT(tx->tx_txg != 0);
1106168404Spjd
1107251629Sdelphij	/*
1108251629Sdelphij	 * Go through the transaction's hold list and remove holds on
1109251629Sdelphij	 * associated dnodes, notifying waiters if no holds remain.
1110251629Sdelphij	 */
1111307049Smav	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1112307049Smav	    txh = list_next(&tx->tx_holds, txh)) {
1113168404Spjd		dnode_t *dn = txh->txh_dnode;
1114168404Spjd
1115168404Spjd		if (dn == NULL)
1116168404Spjd			continue;
1117307049Smav
1118168404Spjd		mutex_enter(&dn->dn_mtx);
1119168404Spjd		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1120168404Spjd
1121168404Spjd		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1122168404Spjd			dn->dn_assigned_txg = 0;
1123168404Spjd			cv_broadcast(&dn->dn_notxholds);
1124168404Spjd		}
1125168404Spjd		mutex_exit(&dn->dn_mtx);
1126168404Spjd	}
1127168404Spjd
1128168404Spjd	if (tx->tx_tempreserve_cookie)
1129168404Spjd		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1130168404Spjd
1131219089Spjd	if (!list_is_empty(&tx->tx_callbacks))
1132219089Spjd		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1133219089Spjd
1134168404Spjd	if (tx->tx_anyobj == FALSE)
1135168404Spjd		txg_rele_to_sync(&tx->tx_txgh);
1136219089Spjd
1137307049Smav	dmu_tx_destroy(tx);
1138168404Spjd}
1139168404Spjd
1140168404Spjdvoid
1141168404Spjddmu_tx_abort(dmu_tx_t *tx)
1142168404Spjd{
1143168404Spjd	ASSERT(tx->tx_txg == 0);
1144168404Spjd
1145219089Spjd	/*
1146219089Spjd	 * Call any registered callbacks with an error code.
1147219089Spjd	 */
1148219089Spjd	if (!list_is_empty(&tx->tx_callbacks))
1149219089Spjd		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1150219089Spjd
1151307049Smav	dmu_tx_destroy(tx);
1152168404Spjd}
1153168404Spjd
1154168404Spjduint64_t
1155168404Spjddmu_tx_get_txg(dmu_tx_t *tx)
1156168404Spjd{
1157168404Spjd	ASSERT(tx->tx_txg != 0);
1158168404Spjd	return (tx->tx_txg);
1159168404Spjd}
1160219089Spjd
1161248571Smmdsl_pool_t *
1162248571Smmdmu_tx_pool(dmu_tx_t *tx)
1163248571Smm{
1164248571Smm	ASSERT(tx->tx_pool != NULL);
1165248571Smm	return (tx->tx_pool);
1166248571Smm}
1167248571Smm
1168219089Spjdvoid
1169219089Spjddmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1170219089Spjd{
1171219089Spjd	dmu_tx_callback_t *dcb;
1172219089Spjd
1173219089Spjd	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1174219089Spjd
1175219089Spjd	dcb->dcb_func = func;
1176219089Spjd	dcb->dcb_data = data;
1177219089Spjd
1178219089Spjd	list_insert_tail(&tx->tx_callbacks, dcb);
1179219089Spjd}
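
/*
 * Illustrative sketch (hypothetical callback, not from this file): commit
 * callbacks must match dmu_tx_callback_func_t.  They are invoked with
 * error == 0 once the txg the transaction was assigned to has synced, or
 * with a nonzero error (e.g. ECANCELED from dmu_tx_abort()) if the
 * transaction never reached disk.
 *
 *	static void
 *	example_commit_cb(void *data, int error)
 *	{
 *		(data is the pointer passed to dmu_tx_callback_register())
 *	}
 *
 *	dmu_tx_callback_register(tx, example_commit_cb, my_data);
 */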
1180219089Spjd
1181219089Spjd/*
1182219089Spjd * Call all the commit callbacks on a list, with a given error code.
1183219089Spjd */
1184219089Spjdvoid
1185219089Spjddmu_tx_do_callbacks(list_t *cb_list, int error)
1186219089Spjd{
1187219089Spjd	dmu_tx_callback_t *dcb;
1188219089Spjd
1189307049Smav	while ((dcb = list_head(cb_list)) != NULL) {
1190219089Spjd		list_remove(cb_list, dcb);
1191219089Spjd		dcb->dcb_func(dcb->dcb_data, error);
1192219089Spjd		kmem_free(dcb, sizeof (dmu_tx_callback_t));
1193219089Spjd	}
1194219089Spjd}
1195219089Spjd
1196219089Spjd/*
1197219089Spjd * Interface to hold a bunch of attributes.
1198219089Spjd * Used for creating new files.
1199219089Spjd * attrsize is the total size of all attributes
1200219089Spjd * to be added during object creation.
1201219089Spjd *
1202219089Spjd * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
1203219089Spjd */
1204219089Spjd
1205219089Spjd/*
1206219089Spjd * Hold the necessary attribute name for attribute registration.
1207219089Spjd * This should be a very rare case.  If it does happen, it would
1208219089Spjd * only happen on the first write to the file system.
1209219089Spjd */
1210219089Spjdstatic void
1211219089Spjddmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1212219089Spjd{
1213219089Spjd	if (!sa->sa_need_attr_registration)
1214219089Spjd		return;
1215219089Spjd
1216321547Smav	for (int i = 0; i != sa->sa_num_attrs; i++) {
1217219089Spjd		if (!sa->sa_attr_table[i].sa_registered) {
1218219089Spjd			if (sa->sa_reg_attr_obj)
1219219089Spjd				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1220219089Spjd				    B_TRUE, sa->sa_attr_table[i].sa_name);
1221219089Spjd			else
1222219089Spjd				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1223219089Spjd				    B_TRUE, sa->sa_attr_table[i].sa_name);
1224219089Spjd		}
1225219089Spjd	}
1226219089Spjd}
1227219089Spjd
1228219089Spjdvoid
1229219089Spjddmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1230219089Spjd{
1231321547Smav	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
1232321547Smav	    tx->tx_objset, object, THT_SPILL, 0, 0);
1233219089Spjd
1234321547Smav	(void) refcount_add_many(&txh->txh_space_towrite,
1235321547Smav	    SPA_OLD_MAXBLOCKSIZE, FTAG);
1236219089Spjd}
1237219089Spjd
1238219089Spjdvoid
1239219089Spjddmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1240219089Spjd{
1241219089Spjd	sa_os_t *sa = tx->tx_objset->os_sa;
1242219089Spjd
1243219089Spjd	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1244219089Spjd
1245219089Spjd	if (tx->tx_objset->os_sa->sa_master_obj == 0)
1246219089Spjd		return;
1247219089Spjd
1248321547Smav	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
1249219089Spjd		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1250321547Smav	} else {
1251219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1252219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1253219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1254219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1255219089Spjd	}
1256219089Spjd
1257219089Spjd	dmu_tx_sa_registration_hold(sa, tx);
1258219089Spjd
1259219089Spjd	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1260219089Spjd		return;
1261219089Spjd
1262219089Spjd	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1263219089Spjd	    THT_SPILL, 0, 0);
1264219089Spjd}
1265219089Spjd
1266219089Spjd/*
1267219089Spjd * Hold SA attribute.
1268219089Spjd *
1269219089Spjd * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1270219089Spjd *
1271219089Spjd * may_grow indicates that the attribute data may grow or the SA layout
1272219089Spjd * may change, in which case the layout ZAP and the spill block are also
1273219089Spjd * held (see below).
1274219089Spjd */
1275219089Spjdvoid
1276219089Spjddmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1277219089Spjd{
1278219089Spjd	uint64_t object;
1279219089Spjd	sa_os_t *sa = tx->tx_objset->os_sa;
1280219089Spjd
1281219089Spjd	ASSERT(hdl != NULL);
1282219089Spjd
1283219089Spjd	object = sa_handle_object(hdl);
1284219089Spjd
1285219089Spjd	dmu_tx_hold_bonus(tx, object);
1286219089Spjd
1287219089Spjd	if (tx->tx_objset->os_sa->sa_master_obj == 0)
1288219089Spjd		return;
1289219089Spjd
1290219089Spjd	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1291219089Spjd	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1292219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1293219089Spjd		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1294219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1295219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1296219089Spjd	}
1297219089Spjd
1298219089Spjd	dmu_tx_sa_registration_hold(sa, tx);
1299219089Spjd
1300219089Spjd	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1301219089Spjd		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1302219089Spjd
1303219089Spjd	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1304219089Spjd		ASSERT(tx->tx_txg == 0);
1305219089Spjd		dmu_tx_hold_spill(tx, object);
1306219089Spjd	} else {
1307219089Spjd		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1308219089Spjd		dnode_t *dn;
1309219089Spjd
1310219089Spjd		DB_DNODE_ENTER(db);
1311219089Spjd		dn = DB_DNODE(db);
1312219089Spjd		if (dn->dn_have_spill) {
1313219089Spjd			ASSERT(tx->tx_txg == 0);
1314219089Spjd			dmu_tx_hold_spill(tx, object);
1315219089Spjd		}
1316219089Spjd		DB_DNODE_EXIT(db);
1317219089Spjd	}
1318219089Spjd}