dmu_tx.c revision 209962
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h> /* for fzap_default_block_shift */
#include <sys/spa.h>
#include <sys/zfs_context.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

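/*
 * Typical consumer pattern (a sketch only; error handling is elided and
 * "object", "off", "len", and "buf" are hypothetical):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);
 *
 * All holds must be declared before dmu_tx_assign(); under ZFS_DEBUG,
 * dirtying a buffer that no hold covers trips dmu_tx_dirty_buf() below.
 */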
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
#ifdef ZFS_DEBUG
	refcount_create(&tx->tx_space_written);
	refcount_create(&tx->tx_space_freed);
#endif
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
	return (tx);
}

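/*
 * Create a transaction already bound to an open txg, for use by the sync
 * context itself (an inference from the code below: tx_anyobj exempts such
 * transactions from the per-object hold checks).
 */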
dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn = NULL;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os->os, object, tx, &dn);
		if (err) {
			tx->tx_err = err;
			return (NULL);
		}

		if (err == 0 && tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
#ifdef ZFS_DEBUG
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
#endif
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		(void) dmu_tx_hold_object_impl(tx, os,
		    object, THT_NEWOBJECT, 0, 0);
	}
}

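/*
 * Read the given block now so that any i/o error surfaces here, at hold
 * time, where it can fail the tx cleanly, rather than in the sync path.
 */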
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (EIO);
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

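/*
 * Walk up the indirect chain from db, charging one indirect block per
 * level.  history[] remembers the last parent counted at each level, so
 * walks starting from adjacent level-0 blocks don't double-count the
 * indirects they share.
 */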
static void
dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
    boolean_t freeable, dmu_buf_impl_t **history)
{
	int i = db->db_level + 1;
	dnode_t *dn = db->db_dnode;

	if (i >= dn->dn_nlevels)
		return;

	db = db->db_parent;
	if (db == NULL) {
		uint64_t lvls = dn->dn_nlevels - i;

		txh->txh_space_towrite += lvls << dn->dn_indblkshift;
		return;
	}

	if (db != history[i]) {
		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
		uint64_t space = 1ULL << dn->dn_indblkshift;

		freeable = (db->db_blkptr && (freeable ||
		    dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
		if (freeable)
			txh->txh_space_tooverwrite += space;
		else
			txh->txh_space_towrite += space;
		if (db->db_blkptr)
			txh->txh_space_tounref += space;
		history[i] = db;
		dmu_tx_count_indirects(txh, db, freeable, history);
	}
}

/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;

	if (dn) {
		dmu_buf_impl_t *last[DN_MAX_LEVELS];
		int nlvls = dn->dn_nlevels;
		int delta;

		/*
		 * For i/o error checking, read the first and last level-0
		 * blocks (if they are not aligned), and all the level-1 blocks.
		 */

		if (dn->dn_maxblkid == 0) {
			delta = dn->dn_datablksz;
			start = (off < dn->dn_datablksz) ? 0 : 1;
			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
				if (err)
					goto out;
				delta -= off;
			}
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (nlvls > 1) {
				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = (start>>shft)+1; i < end>>shft; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
			delta = P2NPHASE(off, dn->dn_datablksz);
		}

		if (dn->dn_maxblkid > 0) {
			/*
			 * The blocksize can't change,
			 * so we can make a more precise estimate.
			 */
			ASSERT(dn->dn_datablkshift != 0);
			min_bs = max_bs = dn->dn_datablkshift;
			min_ibs = max_ibs = dn->dn_indblkshift;
		} else if (dn->dn_indblkshift > max_ibs) {
			/*
			 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
			 * the code will still work correctly on older pools.
			 */
			min_ibs = max_ibs = dn->dn_indblkshift;
		}

		/*
		 * If this write is not off the end of the file
		 * we need to account for overwrites/unref.
		 */
		if (start <= dn->dn_maxblkid)
			bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
		while (start <= dn->dn_maxblkid) {
			spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			dmu_buf_impl_t *db;

			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			db = dbuf_hold_level(dn, 0, start, FTAG);
			rw_exit(&dn->dn_struct_rwlock);
			if (db->db_blkptr && dsl_dataset_block_freeable(ds,
			    db->db_blkptr->blk_birth)) {
				dprintf_bp(db->db_blkptr, "can free old%s", "");
				txh->txh_space_tooverwrite += dn->dn_datablksz;
				txh->txh_space_tounref += dn->dn_datablksz;
				dmu_tx_count_indirects(txh, db, TRUE, last);
			} else {
				txh->txh_space_towrite += dn->dn_datablksz;
				if (db->db_blkptr)
					txh->txh_space_tounref +=
					    bp_get_dasize(spa, db->db_blkptr);
				dmu_tx_count_indirects(txh, db, FALSE, last);
			}
			dbuf_rele(db, FTAG);
			if (++start > end) {
				/*
				 * Account for new indirects appearing
				 * before this IO gets assigned into a txg.
				 */
				bits = 64 - min_bs;
				epbs = min_ibs - SPA_BLKPTRSHIFT;
				for (bits -= epbs * (nlvls - 1);
				    bits >= 0; bits -= epbs)
					txh->txh_fudge += 1ULL << max_ibs;
				goto out;
			}
			off += delta;
			if (len >= delta)
				len -= delta;
			delta = dn->dn_datablksz;
		}
	}

	/*
	 * 'end' is the last thing we will access, not one past.
	 * This way we won't overflow when accessing the last byte.
	 */
	start = P2ALIGN(off, 1ULL << max_bs);
	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
	txh->txh_space_towrite += end - start + 1;

	start >>= min_bs;
	end >>= min_bs;

	epbs = min_ibs - SPA_BLKPTRSHIFT;

	/*
	 * The object contains at most 2^(64 - min_bs) blocks,
	 * and each indirect level maps 2^epbs.
	 */
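	/*
	 * Worked example (values assumed for illustration): with 128K data
	 * blocks (min_bs = 17) and 16K indirects (min_ibs = 14), epbs =
	 * 14 - SPA_BLKPTRSHIFT = 7, so the loop runs for bits = 47, 40,
	 * ..., 5 -- seven levels, each charging (end - start + 1) max-size
	 * indirect blocks to txh_space_towrite.
	 */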
	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
		start >>= epbs;
		end >>= epbs;
		ASSERT3U(end, >=, start);
		txh->txh_space_towrite += (end - start + 1) << max_ibs;
		if (start != 0) {
			/*
			 * We also need a new blkid=0 indirect block
			 * to reference any existing file data.
			 */
			txh->txh_space_towrite += 1ULL << max_ibs;
		}
	}

out:
	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
	    2 * DMU_MAX_ACCESS)
		err = EFBIG;

	if (err)
		txh->txh_tx->tx_err = err;
}

static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	dnode_t *dn = txh->txh_dnode;
	dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
	uint64_t space = mdn->dn_datablksz +
	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);

	if (dn && dn->dn_dbuf->db_blkptr &&
	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
	    dn->dn_dbuf->db_blkptr->blk_birth)) {
		txh->txh_space_tooverwrite += space;
		txh->txh_space_tounref += space;
	} else {
		txh->txh_space_towrite += space;
		if (dn && dn->dn_dbuf->db_blkptr)
			txh->txh_space_tounref += space;
	}
}

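/*
 * Declare a write to [off, off + len) of the given object.  This must be
 * called, before dmu_tx_assign(), for every range the caller will dirty.
 */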
void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);
	ASSERT(len < DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh == NULL)
		return;

	dmu_tx_count_write(txh, off, len);
	dmu_tx_count_dnode(txh);
}

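/*
 * Estimate the space freed (txh_space_tofree), the space unreferenced
 * (txh_space_tounref), and the memory needed to hold level-1 indirect
 * blocks (txh_memory_tohold) when freeing [off, off + len) of this dnode.
 */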
static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	uint64_t blkid, nblks, lastblk;
	uint64_t space = 0, unref = 0, skipped = 0;
	dnode_t *dn = txh->txh_dnode;
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
	int epbs;

	if (dn->dn_nlevels == 0)
		return;

	/*
	 * The struct_rwlock protects us against dn_nlevels
	 * changing, in case (against all odds) we manage to dirty &
	 * sync out the changes after we check for being dirty.
	 * Also, dbuf_hold_level() wants us to have the struct_rwlock.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	if (dn->dn_maxblkid == 0) {
		if (off == 0 && len >= dn->dn_datablksz) {
			blkid = 0;
			nblks = 1;
		} else {
			rw_exit(&dn->dn_struct_rwlock);
			return;
		}
	} else {
		blkid = off >> dn->dn_datablkshift;
		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;

		if (blkid >= dn->dn_maxblkid) {
			rw_exit(&dn->dn_struct_rwlock);
			return;
		}
		if (blkid + nblks > dn->dn_maxblkid)
			nblks = dn->dn_maxblkid - blkid;
	}
	if (dn->dn_nlevels == 1) {
		int i;
		for (i = 0; i < nblks; i++) {
			blkptr_t *bp = dn->dn_phys->dn_blkptr;
			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
			bp += blkid + i;
			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
				dprintf_bp(bp, "can free old%s", "");
				space += bp_get_dasize(spa, bp);
			}
			unref += BP_GET_ASIZE(bp);
		}
		nblks = 0;
	}

	/*
	 * Add in memory requirements of higher-level indirects.
	 * This assumes a worst-possible scenario for dn_nlevels.
	 */
	{
		uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
		int level = (dn->dn_nlevels > 1) ? 2 : 1;

		while (level++ < DN_MAX_LEVELS) {
			txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
			blkcnt = 1 + (blkcnt >> epbs);
		}
		ASSERT(blkcnt <= dn->dn_nblkptr);
	}

	lastblk = blkid + nblks - 1;
	while (nblks) {
		dmu_buf_impl_t *dbuf;
		uint64_t ibyte, new_blkid;
		int epb = 1 << epbs;
		int err, i, blkoff, tochk;
		blkptr_t *bp;

		ibyte = blkid << dn->dn_datablkshift;
		err = dnode_next_offset(dn,
		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
		new_blkid = ibyte >> dn->dn_datablkshift;
		if (err == ESRCH) {
			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
			break;
		}
		if (err) {
			txh->txh_tx->tx_err = err;
			break;
		}
		if (new_blkid > lastblk) {
			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
			break;
		}

		if (new_blkid > blkid) {
			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
			nblks -= new_blkid - blkid;
			blkid = new_blkid;
		}
		blkoff = P2PHASE(blkid, epb);
		tochk = MIN(epb - blkoff, nblks);

		dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);

		txh->txh_memory_tohold += dbuf->db.db_size;
		if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
			txh->txh_tx->tx_err = E2BIG;
			dbuf_rele(dbuf, FTAG);
			break;
		}
		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
			dbuf_rele(dbuf, FTAG);
			break;
		}

		bp = dbuf->db.db_data;
		bp += blkoff;

		for (i = 0; i < tochk; i++) {
			if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
				dprintf_bp(&bp[i], "can free old%s", "");
				space += bp_get_dasize(spa, &bp[i]);
			}
			/* count each entry's asize, not the first one's */
			unref += BP_GET_ASIZE(&bp[i]);
		}
		dbuf_rele(dbuf, FTAG);

		blkid += tochk;
		nblks -= tochk;
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* account for new level 1 indirect blocks that might show up */
	if (skipped > 0) {
		txh->txh_fudge += skipped << dn->dn_indblkshift;
		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
	}
	txh->txh_space_tofree += space;
	txh->txh_space_tounref += unref;
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn;
	uint64_t start, end, i;
	int err, shift;
	zio_t *zio;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh == NULL)
		return;
	dn = txh->txh_dnode;

	/* first block */
	if (off != 0)
		dmu_tx_count_write(txh, off, 1);
	/* last block */
	if (len != DMU_OBJECT_END)
		dmu_tx_count_write(txh, off+len, 1);

	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks, and all the level-1 blocks.  The calls to
	 * dmu_tx_count_write() above have already taken care of
	 * the level-0 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		start = off >> shift;
		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;

		zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH)
				break;
			if (err) {
				tx->tx_err = err;
				return;
			}

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err) {
				tx->tx_err = err;
				return;
			}
		}
		err = zio_wait(zio);
		if (err) {
			tx->tx_err = err;
			return;
		}
	}

	dmu_tx_count_dnode(txh);
	dmu_tx_count_free(txh, off, len);
}

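/*
 * Declare an update to the ZAP object "object" (add != 0 when an entry
 * named "name" may be added).  A sketch of the accounting below: a
 * micro-zap stays a single block, while a fat-zap may dirty its header
 * block, leaf blocks, and an indirect twig per scattered leaf.
 */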
void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn;
	uint64_t nblocks;
	int epbs, err;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh == NULL)
		return;
	dn = txh->txh_dnode;

	dmu_tx_count_dnode(txh);

	if (dn == NULL) {
		/*
		 * We will be able to fit a new object's entries into one leaf
		 * block.  So there will be at most 2 blocks total,
		 * including the header block.
		 */
		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
		return;
	}

	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);

	if (dn->dn_maxblkid == 0 && !add) {
		/*
		 * If there is only one block (i.e. this is a micro-zap)
		 * and we are not adding anything, the accounting is simple.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err) {
			tx->tx_err = err;
			return;
		}

		/*
		 * Use max block size here, since we don't know how much
		 * the size will change between now and the dbuf dirty call.
		 */
		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
		} else {
			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
		}
		if (dn->dn_phys->dn_blkptr[0].blk_birth)
			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
		return;
	}

	if (dn->dn_maxblkid > 0 && name) {
		/*
		 * Access the name in this fat-zap so that we'll check
		 * for i/o errors to the leaf blocks, etc.
		 */
		err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
		    8, 0, NULL);
		if (err == EIO) {
			tx->tx_err = err;
			return;
		}
	}

	err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);

	/*
	 * If the modified blocks are scattered to the four winds,
	 * we'll have to modify an indirect twig for each.
	 */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
		if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
		else
			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
}

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;
	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);

	txh->txh_space_towrite += space;
}

int
dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;
	int holds = 0;

	/*
	 * By asserting that the tx is assigned, we're counting the
	 * number of dn_tx_holds, which is the same as the number of
	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
	 * dn_tx_holds could be 0.
	 */
	ASSERT(tx->tx_txg != 0);

	/* if (tx->tx_anyobj == TRUE) */
		/* return (0); */

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
			holds++;
	}

	return (holds);
}

#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	dmu_tx_hold_t *txh;
	int match_object = FALSE, match_offset = FALSE;
	dnode_t *dn = db->db_dnode;

	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj)
		return;

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT)
		return;

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * buffer so that we don't need to hold it
				 * when creating a new object.
				 */
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset)
			return;
	}
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	dmu_tx_hold_t *txh;
	spa_t *spa = tx->tx_pool->dp_spa;
	uint64_t memory, asize, fsize, usize;
	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;

	ASSERT3U(tx->tx_txg, ==, 0);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (EIO);

		return (ERESTART);
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (ERESTART);
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += txh->txh_space_towrite;
		tofree += txh->txh_space_tofree;
		tooverwrite += txh->txh_space_tooverwrite;
		tounref += txh->txh_space_tounref;
		tohold += txh->txh_memory_tohold;
		fudge += txh->txh_fudge;
	}

	/*
	 * NB: This check must be after we've held the dnodes, so that
	 * the dmu_tx_unassign() logic will work properly.
	 */
	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
		return (ERESTART);

	/*
	 * If a snapshot has been taken since we made our estimates,
	 * assume that we won't be able to free or overwrite anything.
	 */
	if (tx->tx_objset &&
	    dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
	    tx->tx_lastsnap_txg) {
		towrite += tooverwrite;
		tooverwrite = tofree = 0;
	}

	/* needed allocation: worst-case estimate of write space */
	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
	/* freed space estimate: worst-case overwrite + free estimate */
	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
	/* convert unrefd space to worst-case estimate */
	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
	/* calculate memory footprint estimate */
	memory = towrite + tooverwrite + tohold;

#ifdef ZFS_DEBUG
	/*
	 * Add in 'tohold' to account for our dirty holds on this memory
	 * XXX - the "fudge" factor is to account for skipped blocks that
	 * we missed because dnode_next_offset() misses in-core-only blocks.
	 */
	tx->tx_space_towrite = asize +
	    spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
	tx->tx_space_tofree = tofree;
	tx->tx_space_tooverwrite = tooverwrite;
	tx->tx_space_tounref = tounref;
#endif

	if (tx->tx_dir && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
		if (err)
			return (err);
	}

	return (0);
}

static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
 *
 * (3)	A specific txg.  Use this if you need to ensure that multiple
 *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
 *	returns ERESTART if it can't assign you into the requested txg.
 */
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how != 0);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}
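
/*
 * A sketch of the TXG_NOWAIT retry pattern described above; "lock",
 * "object", "off", and "len" are hypothetical:
 *
 *	top:
 *		mutex_enter(&lock);
 *		tx = dmu_tx_create(os);
 *		dmu_tx_hold_write(tx, object, off, len);
 *		error = dmu_tx_assign(tx, TXG_NOWAIT);
 *		if (error) {
 *			mutex_exit(&lock);
 *			if (error == ERESTART) {
 *				dmu_tx_wait(tx);
 *				dmu_tx_abort(tx);
 *				goto top;
 *			}
 *			dmu_tx_abort(tx);
 *			return (error);
 *		}
 *		...
 *		dmu_tx_commit(tx);
 *		mutex_exit(&lock);
 */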

void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT(tx->tx_txg == 0);

	/*
	 * It's possible that the pool has become active after this thread
	 * has tried to obtain a tx.  If that's the case then its
	 * tx_lasttried_txg would not have been assigned.
	 */
	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
}

void
dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
{
#ifdef ZFS_DEBUG
	if (tx->tx_dir == NULL || delta == 0)
		return;

	if (delta > 0) {
		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
		    tx->tx_space_towrite);
		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
	} else {
		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
	}
#endif
}

void
dmu_tx_commit(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg != 0);

	while (txh = list_head(&tx->tx_holds)) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
		dnode_rele(dn, tx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

void
dmu_tx_abort(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	while (txh = list_head(&tx->tx_holds)) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}
