/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h>
#include <sys/spa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

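/*
 * Allocate an empty transaction against the given dsl_dir (which may be
 * NULL for pool-internal transactions); the pool and the hold/callback
 * lists are initialized and the start time recorded, but no holds are
 * taken yet.
 */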
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd != NULL)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
	    offsetof(dmu_tx_callback_t, dcb_node));
	tx->tx_start = gethrtime();
	return (tx);
}

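/*
 * Create a transaction against the given objset.  This is the normal entry
 * point for DMU consumers; the transaction must later be assigned with
 * dmu_tx_assign() before any modifications are made.
 */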
dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	return (tx);
}

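/*
 * Create a transaction that is already assigned to the given txg.  This is
 * only used from syncing context (e.g. by the DSL), where the txg is
 * already held open; such transactions may touch any object (tx_anyobj)
 * and bypass the usual per-hold space accounting.
 */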
dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	txg_verify(dp->dp_spa, txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

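/*
 * Take a hold on the given dnode (if any) on behalf of this transaction and
 * append a dmu_tx_hold_t describing the intended change (type, arg1, arg2)
 * to the transaction's hold list.
 */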
static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
    uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;

	if (dn != NULL) {
		(void) refcount_add(&dn->dn_holds, tx);
		if (tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
	refcount_create(&txh->txh_space_towrite);
	refcount_create(&txh->txh_memory_tohold);
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dnode_t *dn = NULL;
	dmu_tx_hold_t *txh;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os, object, FTAG, &dn);
		if (err != 0) {
			tx->tx_err = err;
			return (NULL);
		}
	}
	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
	if (dn != NULL)
		dnode_rele(dn, FTAG);
	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx))
		(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
}

/*
 * This function reads specified data from disk.  The specified data will
 * be needed to perform the transaction -- i.e., it will be read after
 * we do dmu_tx_assign().  There are two reasons that we read the data now
 * (before dmu_tx_assign()):
 *
 * 1. Reading it now has potentially better performance.  The transaction
 * has not yet been assigned, so the TXG is not held open, and also the
 * caller typically has fewer locks held when calling dmu_tx_hold_*() than
 * after the transaction has been assigned.  This reduces the lock (and txg)
 * hold times, thus reducing lock contention.
 *
 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
 * that are detected before they start making changes to the DMU state
 * (i.e. now).  Once the transaction has been assigned, and some DMU
 * state has been changed, it can be difficult to recover from an i/o
 * error (e.g. to undo the changes already made in memory at the DMU
 * layer).  Typically code to do so does not exist in the caller -- it
 * assumes that the data has already been cached and thus i/o errors are
 * not possible.
 *
 * It has been observed that the i/o initiated here can be a performance
 * problem, and it appears to be optional, because we don't look at the
 * data which is read.  However, removing this read would only serve to
 * move the work elsewhere (after the dmu_tx_assign()), where it may
 * have a greater impact on performance (in addition to the impact on
 * fault tolerance noted above).
 */
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (SET_ERROR(EIO));
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

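/*
 * Charge the length of the intended write against this hold's space
 * estimate and pre-read the affected blocks so that i/o errors are
 * surfaced early (see the comment above dmu_tx_check_ioerr()).
 */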
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
		err = SET_ERROR(EFBIG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}

static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
}

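/*
 * Declare the intent to write the range [off, off + len) of the given
 * object in this transaction, so that dmu_tx_assign() can reserve enough
 * space for the change.
 */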
void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

void
dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);
	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, 0, 0);
	if (txh == NULL)
		return;

	dnode_t *dn = txh->txh_dnode;
	(void) refcount_add_many(&txh->txh_space_towrite,
	    1ULL << dn->dn_indblkshift, FTAG);
	dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

/*
 * This function marks the transaction as being a "net free".  The end
 * result is that refquotas will be disabled for this transaction, and
 * this transaction will be able to use half of the pool space overhead
 * (see dsl_pool_adjustedsize()).  Therefore this function should only
 * be called for transactions that we expect will not cause a net increase
 * in the amount of space used (but it's OK if that is occasionally not true).
 */
void
dmu_tx_mark_netfree(dmu_tx_t *tx)
{
	tx->tx_netfree = B_TRUE;
}

static void
dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx;
	dnode_t *dn;
	int err;

	tx = txh->txh_tx;
	ASSERT(tx->tx_txg == 0);

	dn = txh->txh_dnode;
	dmu_tx_count_dnode(txh);

	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;


	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note:  dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed.  Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift == 0) {
		if (off != 0 || len < dn->dn_datablksz)
			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	} else {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	}

	/*
	 * Check level-1 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		uint64_t start = off >> shift;
		uint64_t end = (off + len) >> shift;

		ASSERT(dn->dn_indblkshift != 0);

		/*
		 * dnode_reallocate() can result in an object with indirect
		 * blocks having an odd data block size.  In this case,
		 * just check the single block.
		 */
		if (dn->dn_datablkshift == 0)
			start = end = 0;

		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (uint64_t i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH || i > end)
				break;
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}

			(void) refcount_add_many(&txh->txh_memory_tohold,
			    1 << dn->dn_indblkshift, FTAG);

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}
		}
		err = zio_wait(zio);
		if (err != 0) {
			tx->tx_err = err;
			return;
		}
	}
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh != NULL)
		(void) dmu_tx_hold_free_impl(txh, off, len);
}

void
dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
	if (txh != NULL)
		(void) dmu_tx_hold_free_impl(txh, off, len);
}

static void
dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn;
	int err;

	ASSERT(tx->tx_txg == 0);

	dn = txh->txh_dnode;

	dmu_tx_count_dnode(txh);

	/*
	 * Modifying an almost-full microzap is around the worst case (128KB)
	 *
	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * - 4 new blocks written if adding:
	 *    - 2 blocks for possibly split leaves,
	 *    - 2 grown ptrtbl blocks
	 */
	(void) refcount_add_many(&txh->txh_space_towrite,
	    MZAP_MAX_BLKSZ, FTAG);

	if (dn == NULL)
		return;

	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);

	if (dn->dn_maxblkid == 0 || name == NULL) {
		/*
		 * This is a microzap (only one block), or we don't know
		 * the name.  Check the first block for i/o errors.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err != 0) {
			tx->tx_err = err;
		}
	} else {
		/*
		 * Access the name so that we'll check for i/o errors to
		 * the leaf blocks, etc.  We ignore ENOENT, as this name
		 * may not yet exist.
		 */
		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
		if (err == EIO || err == ECKSUM || err == ENXIO) {
			tx->tx_err = err;
		}
	}
}

void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, name);
}

void
dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT(dn != NULL);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, name);
}

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;
	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);

	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
}

#ifdef ZFS_DEBUG
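/*
 * Debug-only verification that a dbuf being dirtied under this transaction
 * is covered by one of the transaction's holds; if no matching hold is
 * found, we panic.
 */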
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

/*
 * If we can't do 10 iops, something is wrong.  Let us go ahead
 * and hit zfs_dirty_data_max.
 */
hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */

/*
 * We delay transactions when we've determined that the backend storage
 * isn't able to accommodate the rate of incoming writes.
 *
 * If there is already a transaction waiting, we delay relative to when
 * that transaction finishes waiting.  This way the calculated min_time
 * is independent of the number of threads concurrently executing
 * transactions.
 *
 * If we are the only waiter, wait relative to when the transaction
 * started, rather than the current time.  This credits the transaction for
 * "time already served", e.g. reading indirect blocks.
 *
 * The minimum time for a transaction to take is calculated as:
 *     min_time = scale * (dirty - min) / (max - dirty)
 *     min_time is then capped at zfs_delay_max_ns.
 *
 * The delay has two degrees of freedom that can be adjusted via tunables.
 * The percentage of dirty data at which we start to delay is defined by
 * zfs_delay_min_dirty_percent. This should typically be at or above
 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 * delay after writing at full speed has failed to keep up with the incoming
 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
 * speaking, this variable determines the amount of delay at the midpoint of
 * the curve.
 *
 * delay
 *  10ms +-------------------------------------------------------------*+
 *       |                                                             *|
 *   9ms +                                                             *+
 *       |                                                             *|
 *   8ms +                                                             *+
 *       |                                                            * |
 *   7ms +                                                            * +
 *       |                                                            * |
 *   6ms +                                                            * +
 *       |                                                            * |
 *   5ms +                                                           *  +
 *       |                                                           *  |
 *   4ms +                                                           *  +
 *       |                                                           *  |
 *   3ms +                                                          *   +
 *       |                                                          *   |
 *   2ms +                                              (midpoint) *    +
 *       |                                                  |    **     |
 *   1ms +                                                  v ***       +
 *       |             zfs_delay_scale ---------->     ********         |
 *     0 +-------------------------------------*********----------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note that since the delay is added to the outstanding time remaining on the
 * most recent transaction, the delay is effectively the inverse of IOPS.
 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
 * was chosen such that small changes in the amount of accumulated dirty data
 * in the first 3/4 of the curve yield relatively small differences in the
 * amount of delay.
 *
 * The effects can be easier to understand when the amount of delay is
 * represented on a log scale:
 *
 * delay
 * 100ms +-------------------------------------------------------------++
 *       +                                                              +
 *       |                                                              |
 *       +                                                             *+
 *  10ms +                                                             *+
 *       +                                                           ** +
 *       |                                              (midpoint)  **  |
 *       +                                                  |     **    +
 *   1ms +                                                  v ****      +
 *       +             zfs_delay_scale ---------->        *****         +
 *       |                                             ****             |
 *       +                                          ****                +
 * 100us +                                        **                    +
 *       +                                       *                      +
 *       |                                      *                       |
 *       +                                     *                        +
 *  10us +                                     *                        +
 *       +                                                              +
 *       |                                                              |
 *       +                                                              +
 *       +--------------------------------------------------------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note here that only as the amount of dirty data approaches its limit does
 * the delay start to increase rapidly. The goal of a properly tuned system
 * should be to keep the amount of dirty data out of that range by first
 * ensuring that the appropriate limits are set for the I/O scheduler to reach
 * optimal throughput on the backend storage, and then by changing the value
 * of zfs_delay_scale to increase the steepness of the curve.
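 *
 * As a rough worked example (with, say, zfs_dirty_data_max = 4GB,
 * zfs_delay_min_dirty_percent = 60 and zfs_delay_scale = 500,000ns),
 * 3GB of dirty data gives a delay floor of
 * min_time = 500000 * (3GB - 2.4GB) / (4GB - 3GB) = 300,000ns, i.e.
 * roughly 300us per transaction.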
 */
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
	dsl_pool_t *dp = tx->tx_pool;
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	hrtime_t wakeup, min_tx_time, now;

	if (dirty <= delay_min_bytes)
		return;

	/*
	 * The caller has already waited until we are under the max.
	 * We make them pass us the amount of dirty data so we don't
	 * have to handle the case of it being >= the max, which could
	 * cause a divide-by-zero if it's == the max.
	 */
	ASSERT3U(dirty, <, zfs_dirty_data_max);

	now = gethrtime();
	min_tx_time = zfs_delay_scale *
	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
	if (now > tx->tx_start + min_tx_time)
		return;

	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);

	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
	    uint64_t, min_tx_time);

	mutex_enter(&dp->dp_lock);
	wakeup = MAX(tx->tx_start + min_tx_time,
	    dp->dp_last_wakeup + min_tx_time);
	dp->dp_last_wakeup = wakeup;
	mutex_exit(&dp->dp_lock);

#ifdef _KERNEL
#ifdef illumos
	mutex_enter(&curthread->t_delay_lock);
	while (cv_timedwait_hires(&curthread->t_delay_cv,
	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
		continue;
	mutex_exit(&curthread->t_delay_lock);
#else
	pause_sbt("dmu_tx_delay", nstosbt(wakeup),
	    nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE);
#endif
#else
	hrtime_t delta = wakeup - gethrtime();
	struct timespec ts;
	ts.tv_sec = delta / NANOSEC;
	ts.tv_nsec = delta % NANOSEC;
	(void) nanosleep(&ts, NULL);
#endif
}

/*
 * This routine attempts to assign the transaction to a transaction group.
 * To do so, we must determine if there is sufficient free space on disk.
 *
 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
 * on it), then it is assumed that there is sufficient free space,
 * unless there's insufficient slop space in the pool (see the comment
 * above spa_slop_shift in spa_misc.c).
 *
 * If it is not a "netfree" transaction, then if the data already on disk
 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
 * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
 * plus the rough estimate of this transaction's changes, may exceed the
 * allowed usage, then this will fail with ERESTART, which will cause the
 * caller to wait for the pending changes to be written to disk (by waiting
 * for the next TXG to open), and then check the space usage again.
 *
 * The rough estimate of pending changes is comprised of the sum of:
 *
 *  - this transaction's holds' txh_space_towrite
 *
 *  - dd_tempreserved[], which is the sum of in-flight transactions'
 *    holds' txh_space_towrite (i.e. those transactions that have called
 *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 *
 *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 *
 * Note that all of these values are inflated by spa_get_worst_case_asize(),
 * which means that we may get ERESTART well before we are actually in danger
 * of running out of space, but this also mitigates any small inaccuracies
 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 * to the MOS).
 *
 * Note that due to this algorithm, it is possible to exceed the allowed
 * usage by one transaction.  Also, as we approach the allowed usage,
 * we will allow a very limited amount of changes into each TXG, thus
 * decreasing performance.
 */
static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    !(txg_how & TXG_WAIT))
			return (SET_ERROR(EIO));

		return (SET_ERROR(ERESTART));
	}

	if (!tx->tx_dirty_delayed &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (SET_ERROR(ERESTART));
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += refcount_count(&txh->txh_space_towrite);
		tohold += refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	return (0);
}

static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	/*
	 * Walk the transaction's hold list, removing the hold on the
	 * associated dnode, and notifying waiters if the refcount drops to 0.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
	    txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group; txg_how is a bitmask:
 *
 * If TXG_WAIT is set and the currently open txg is full, this function
 * will wait until there's a new txg. This should be used when no locks
 * are being held. With this bit set, this function will only fail if
 * we're truly out of space (or over quota).
 *
 * If TXG_WAIT is *not* set and we can't assign into the currently open
 * txg without blocking, this function will return immediately with
 * ERESTART. This should be used whenever locks are being held.  On an
 * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
 * and try again.
 *
 * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
 * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
 * details on the throttle). This is used by the VFS operations, after
 * they have already called dmu_tx_wait() (though most likely on a
 * different tx).
 */
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	/* If we might wait, we must not hold the config lock. */
	IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));

	if ((txg_how & TXG_NOTHROTTLE))
		tx->tx_dirty_delayed = B_TRUE;

	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || !(txg_how & TXG_WAIT))
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}
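
/*
 * For illustration (a sketch of the calling convention, not an interface
 * enforced here), a typical consumer of this file looks roughly like:
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);
 *
 * Callers that hold locks do not set TXG_WAIT and, on ERESTART, drop their
 * locks, call dmu_tx_wait(), and retry, as described above.
 */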

void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	if (tx->tx_wait_dirty) {
		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.
		 */
		mutex_enter(&dp->dp_lock);
		while (dp->dp_dirty_total >= zfs_dirty_data_max)
			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
		uint64_t dirty = dp->dp_dirty_total;
		mutex_exit(&dp->dp_lock);

		dmu_tx_delay(tx, dirty);

		tx->tx_wait_dirty = B_FALSE;

		/*
		 * Note: setting tx_dirty_delayed only has effect if the
		 * caller used TXG_WAIT.  Otherwise they are going to
		 * destroy this tx and try again.  The common case,
		 * zfs_write(), uses TXG_WAIT.
		 */
		tx->tx_dirty_delayed = B_TRUE;
	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		/*
		 * If the pool is suspended we need to wait until it
		 * is resumed.  Note that it's possible that the pool
		 * has become active after this thread has tried to
		 * obtain a tx.  If that's the case then tx_lasttried_txg
		 * would not have been set.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		/*
		 * A dnode is assigned to the quiescing txg.  Wait for its
		 * transaction to complete.
		 */
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		/*
		 * If we have a lot of dirty data just wait until we sync
		 * out a TXG at which point we'll hopefully have synced
		 * a portion of the changes.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	}
}

static void
dmu_tx_destroy(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		refcount_destroy_many(&txh->txh_space_towrite,
		    refcount_count(&txh->txh_space_towrite));
		refcount_destroy_many(&txh->txh_memory_tohold,
		    refcount_count(&txh->txh_memory_tohold));
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
	kmem_free(tx, sizeof (dmu_tx_t));
}

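/*
 * Commit an assigned transaction: drop the dnode holds (waking any waiters),
 * clear the temporary space reservation, hand any registered callbacks to
 * the txg, release the txg hold, and free the transaction.
 */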
void
dmu_tx_commit(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;

		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	dmu_tx_destroy(tx);
}

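/*
 * Abort an unassigned transaction: invoke any registered callbacks with
 * ECANCELED and free the transaction without touching a txg.
 */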
void
dmu_tx_abort(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg == 0);

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);

	dmu_tx_destroy(tx);
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
	ASSERT(tx->tx_pool != NULL);
	return (tx->tx_pool);
}

void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}

/*
 * Call all the commit callbacks on a list, with a given error code.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_head(cb_list)) != NULL) {
		list_remove(cb_list, dcb);
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}

/*
 * Interface to hold a bunch of attributes.
 * Used for creating new files.
 * attrsize is the total size of all attributes
 * to be added during object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */

/*
 * Hold the necessary attribute name for attribute registration.
 * It should be a very rare case where this is needed.  If it does
 * happen it would only happen on the first write to the file system.
 */
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
	if (!sa->sa_need_attr_registration)
		return;

	for (int i = 0; i != sa->sa_num_attrs; i++) {
		if (!sa->sa_attr_table[i].sa_registered) {
			if (sa->sa_reg_attr_obj)
				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
			else
				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
		}
	}
}

void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
	    tx->tx_objset, object, THT_SPILL, 0, 0);

	(void) refcount_add_many(&txh->txh_space_towrite,
	    SPA_OLD_MAXBLOCKSIZE, FTAG);
}

void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}

/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 *
 * variable_size is the total size of all variable sized attributes
 * passed to this function.  It is not the total size of all
 * variable size attributes that *may* exist on this object.
 */
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
	uint64_t object;
	sa_os_t *sa = tx->tx_objset->os_sa;

	ASSERT(hdl != NULL);

	object = sa_handle_object(hdl);

	dmu_tx_hold_bonus(tx, object);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
		ASSERT(tx->tx_txg == 0);
		dmu_tx_hold_spill(tx, object);
	} else {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		if (dn->dn_have_spill) {
			ASSERT(tx->tx_txg == 0);
			dmu_tx_hold_spill(tx, object);
		}
		DB_DNODE_EXIT(db);
	}
}