1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/zfs_context.h>
27#include <sys/dmu.h>
28#include <sys/dmu_impl.h>
29#include <sys/dbuf.h>
30#include <sys/dmu_objset.h>
31#include <sys/dsl_dataset.h>
32#include <sys/dsl_dir.h>
33#include <sys/dmu_tx.h>
34#include <sys/spa.h>
35#include <sys/zio.h>
36#include <sys/dmu_zfetch.h>
37
38static void dbuf_destroy(dmu_buf_impl_t *db);
39static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
40static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
41
42/*
43 * Global data structures and functions for the dbuf cache.
44 */
45static kmem_cache_t *dbuf_cache;
46
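/*
 * kmem cache constructor and destructor for dmu_buf_impl_t: set up and
 * tear down the per-dbuf mutex, condition variable, and hold refcount.
 */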
47/* ARGSUSED */
48static int
49dbuf_cons(void *vdb, void *unused, int kmflag)
50{
	dmu_buf_impl_t *db = vdb;
52	bzero(db, sizeof (dmu_buf_impl_t));
53
54	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
55	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
56	refcount_create(&db->db_holds);
57	return (0);
58}
59
60/* ARGSUSED */
61static void
62dbuf_dest(void *vdb, void *unused)
63{
	dmu_buf_impl_t *db = vdb;
65	mutex_destroy(&db->db_mtx);
66	cv_destroy(&db->db_changed);
67	refcount_destroy(&db->db_holds);
68}
69
70/*
71 * dbuf hash table routines
72 */
73static dbuf_hash_table_t dbuf_hash_table;
74
75static uint64_t dbuf_hash_count;
76
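/*
 * Hash a dbuf's identity (objset, object, level, blkid) to an index into
 * the dbuf hash table, using the ZFS CRC64 table with a final fold of the
 * high-order bits.
 */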
77static uint64_t
78dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
79{
80	uintptr_t osv = (uintptr_t)os;
81	uint64_t crc = -1ULL;
82
83	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
84	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
85	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
86	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
87	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
88	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
89	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
90
91	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
92
93	return (crc);
94}
95
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
97
98#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
99	((dbuf)->db.db_object == (obj) &&		\
100	(dbuf)->db_objset == (os) &&			\
101	(dbuf)->db_level == (level) &&			\
102	(dbuf)->db_blkid == (blkid))
103
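/*
 * Look up a dbuf in the hash table.  If it is found and not being evicted,
 * it is returned with db_mtx held; otherwise NULL is returned.
 */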
104dmu_buf_impl_t *
105dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
106{
107	dbuf_hash_table_t *h = &dbuf_hash_table;
108	objset_t *os = dn->dn_objset;
109	uint64_t obj = dn->dn_object;
110	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
111	uint64_t idx = hv & h->hash_table_mask;
112	dmu_buf_impl_t *db;
113
114	mutex_enter(DBUF_HASH_MUTEX(h, idx));
115	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
116		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
117			mutex_enter(&db->db_mtx);
118			if (db->db_state != DB_EVICTING) {
119				mutex_exit(DBUF_HASH_MUTEX(h, idx));
120				return (db);
121			}
122			mutex_exit(&db->db_mtx);
123		}
124	}
125	mutex_exit(DBUF_HASH_MUTEX(h, idx));
126	return (NULL);
127}
128
129/*
130 * Insert an entry into the hash table.  If there is already an element
131 * equal to elem in the hash table, then the already existing element
132 * will be returned and the new element will not be inserted.
133 * Otherwise returns NULL.
134 */
135static dmu_buf_impl_t *
136dbuf_hash_insert(dmu_buf_impl_t *db)
137{
138	dbuf_hash_table_t *h = &dbuf_hash_table;
139	objset_t *os = db->db_objset;
140	uint64_t obj = db->db.db_object;
141	int level = db->db_level;
142	uint64_t blkid = db->db_blkid;
143	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
144	uint64_t idx = hv & h->hash_table_mask;
145	dmu_buf_impl_t *dbf;
146
147	mutex_enter(DBUF_HASH_MUTEX(h, idx));
148	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
149		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
150			mutex_enter(&dbf->db_mtx);
151			if (dbf->db_state != DB_EVICTING) {
152				mutex_exit(DBUF_HASH_MUTEX(h, idx));
153				return (dbf);
154			}
155			mutex_exit(&dbf->db_mtx);
156		}
157	}
158
159	mutex_enter(&db->db_mtx);
160	db->db_hash_next = h->hash_table[idx];
161	h->hash_table[idx] = db;
162	mutex_exit(DBUF_HASH_MUTEX(h, idx));
163	atomic_add_64(&dbuf_hash_count, 1);
164
165	return (NULL);
166}
167
168/*
169 * Remove an entry from the hash table.  This operation will
170 * fail if there are any existing holds on the db.
171 */
172static void
173dbuf_hash_remove(dmu_buf_impl_t *db)
174{
175	dbuf_hash_table_t *h = &dbuf_hash_table;
176	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
177	    db->db_level, db->db_blkid);
178	uint64_t idx = hv & h->hash_table_mask;
179	dmu_buf_impl_t *dbf, **dbp;
180
181	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
184	 */
185	ASSERT(refcount_is_zero(&db->db_holds));
186	ASSERT(db->db_state == DB_EVICTING);
187	ASSERT(!MUTEX_HELD(&db->db_mtx));
188
189	mutex_enter(DBUF_HASH_MUTEX(h, idx));
190	dbp = &h->hash_table[idx];
191	while ((dbf = *dbp) != db) {
192		dbp = &dbf->db_hash_next;
193		ASSERT(dbf != NULL);
194	}
195	*dbp = db->db_hash_next;
196	db->db_hash_next = NULL;
197	mutex_exit(DBUF_HASH_MUTEX(h, idx));
198	atomic_add_64(&dbuf_hash_count, -1);
199}
200
201static arc_evict_func_t dbuf_do_evict;
202
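/*
 * Invoke the user eviction callback, if any, for a level-0 dbuf and clear
 * the associated user state.  Called with db_mtx held.
 */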
203static void
204dbuf_evict_user(dmu_buf_impl_t *db)
205{
206	ASSERT(MUTEX_HELD(&db->db_mtx));
207
208	if (db->db_level != 0 || db->db_evict_func == NULL)
209		return;
210
211	if (db->db_user_data_ptr_ptr)
212		*db->db_user_data_ptr_ptr = db->db.db_data;
213	db->db_evict_func(&db->db, db->db_user_ptr);
214	db->db_user_ptr = NULL;
215	db->db_user_data_ptr_ptr = NULL;
216	db->db_evict_func = NULL;
217}
218
219void
220dbuf_evict(dmu_buf_impl_t *db)
221{
222	ASSERT(MUTEX_HELD(&db->db_mtx));
223	ASSERT(db->db_buf == NULL);
224	ASSERT(db->db_data_pending == NULL);
225
226	dbuf_clear(db);
227	dbuf_destroy(db);
228}
229
230void
231dbuf_init(void)
232{
233	uint64_t hsize = 1ULL << 16;
234	dbuf_hash_table_t *h = &dbuf_hash_table;
235	int i;
236
237	/*
238	 * The hash table is big enough to fill all of physical memory
239	 * with an average 4K block size.  The table will take up
240	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
241	 */
242	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
243		hsize <<= 1;
244
245retry:
246	h->hash_table_mask = hsize - 1;
247	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
248	if (h->hash_table == NULL) {
249		/* XXX - we should really return an error instead of assert */
250		ASSERT(hsize > (1ULL << 10));
251		hsize >>= 1;
252		goto retry;
253	}
254
255	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
256	    sizeof (dmu_buf_impl_t),
257	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
258
259	for (i = 0; i < DBUF_MUTEXES; i++)
260		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
261}
262
263void
264dbuf_fini(void)
265{
266	dbuf_hash_table_t *h = &dbuf_hash_table;
267	int i;
268
269	for (i = 0; i < DBUF_MUTEXES; i++)
270		mutex_destroy(&h->hash_mutexes[i]);
271	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
272	kmem_cache_destroy(dbuf_cache);
273}
274
275/*
276 * Other stuff.
277 */
278
279#ifdef ZFS_DEBUG
280static void
281dbuf_verify(dmu_buf_impl_t *db)
282{
283	dnode_t *dn = db->db_dnode;
284	dbuf_dirty_record_t *dr;
285
286	ASSERT(MUTEX_HELD(&db->db_mtx));
287
288	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
289		return;
290
291	ASSERT(db->db_objset != NULL);
292	if (dn == NULL) {
293		ASSERT(db->db_parent == NULL);
294		ASSERT(db->db_blkptr == NULL);
295	} else {
296		ASSERT3U(db->db.db_object, ==, dn->dn_object);
297		ASSERT3P(db->db_objset, ==, dn->dn_objset);
298		ASSERT3U(db->db_level, <, dn->dn_nlevels);
299		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
300		    list_head(&dn->dn_dbufs));
301	}
302	if (db->db_blkid == DB_BONUS_BLKID) {
303		ASSERT(dn != NULL);
304		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
305		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
306	} else {
307		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
308	}
309
310	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
311		ASSERT(dr->dr_dbuf == db);
312
313	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
314		ASSERT(dr->dr_dbuf == db);
315
316	/*
317	 * We can't assert that db_size matches dn_datablksz because it
318	 * can be momentarily different when another thread is doing
319	 * dnode_set_blksz().
320	 */
321	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
322		dr = db->db_data_pending;
323		/*
324		 * It should only be modified in syncing context, so
325		 * make sure we only have one copy of the data.
326		 */
327		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
328	}
329
330	/* verify db->db_blkptr */
331	if (db->db_blkptr) {
332		if (db->db_parent == dn->dn_dbuf) {
333			/* db is pointed to by the dnode */
334			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
335			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
336				ASSERT(db->db_parent == NULL);
337			else
338				ASSERT(db->db_parent != NULL);
339			ASSERT3P(db->db_blkptr, ==,
340			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
341		} else {
342			/* db is pointed to by an indirect block */
343			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
344			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
345			ASSERT3U(db->db_parent->db.db_object, ==,
346			    db->db.db_object);
347			/*
348			 * dnode_grow_indblksz() can make this fail if we don't
349			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  Is it safe to do this now?
351			 */
352			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
353				ASSERT3P(db->db_blkptr, ==,
354				    ((blkptr_t *)db->db_parent->db.db_data +
355				    db->db_blkid % epb));
356			}
357		}
358	}
359	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
360	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
361	    db->db_state != DB_FILL && !dn->dn_free_txg) {
362		/*
363		 * If the blkptr isn't set but they have nonzero data,
364		 * it had better be dirty, otherwise we'll lose that
365		 * data when we evict this buffer.
366		 */
367		if (db->db_dirtycnt == 0) {
368			uint64_t *buf = db->db.db_data;
369			int i;
370
371			for (i = 0; i < db->db.db_size >> 3; i++) {
372				ASSERT(buf[i] == 0);
373			}
374		}
375	}
376}
377#endif
378
379static void
380dbuf_update_data(dmu_buf_impl_t *db)
381{
382	ASSERT(MUTEX_HELD(&db->db_mtx));
383	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
384		ASSERT(!refcount_is_zero(&db->db_holds));
385		*db->db_user_data_ptr_ptr = db->db.db_data;
386	}
387}
388
389static void
390dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
391{
392	ASSERT(MUTEX_HELD(&db->db_mtx));
393	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
394	db->db_buf = buf;
395	if (buf != NULL) {
396		ASSERT(buf->b_data != NULL);
397		db->db.db_data = buf->b_data;
398		if (!arc_released(buf))
399			arc_set_callback(buf, dbuf_do_evict, db);
400		dbuf_update_data(db);
401	} else {
402		dbuf_evict_user(db);
403		db->db.db_data = NULL;
404		if (db->db_state != DB_NOFILL)
405			db->db_state = DB_UNCACHED;
406	}
407}
408
409/*
410 * Loan out an arc_buf for read.  Return the loaned arc_buf.
411 */
412arc_buf_t *
413dbuf_loan_arcbuf(dmu_buf_impl_t *db)
414{
415	arc_buf_t *abuf;
416
417	mutex_enter(&db->db_mtx);
418	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
419		int blksz = db->db.db_size;
420		mutex_exit(&db->db_mtx);
421		abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
422		bcopy(db->db.db_data, abuf->b_data, blksz);
423	} else {
424		abuf = db->db_buf;
425		arc_loan_inuse_buf(abuf, db);
426		dbuf_set_data(db, NULL);
427		mutex_exit(&db->db_mtx);
428	}
429	return (abuf);
430}
431
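/*
 * Return the block number containing the given byte offset in the dnode.
 */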
432uint64_t
433dbuf_whichblock(dnode_t *dn, uint64_t offset)
434{
435	if (dn->dn_datablkshift) {
436		return (offset >> dn->dn_datablkshift);
437	} else {
438		ASSERT3U(offset, <, dn->dn_datablksz);
439		return (0);
440	}
441}
442
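/*
 * ARC read-completion callback: install the buffer and mark the dbuf
 * CACHED on success, substitute zeroed data if the block was freed while
 * the read was in flight, or drop the buffer on error.
 */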
443static void
444dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
445{
446	dmu_buf_impl_t *db = vdb;
447
448	mutex_enter(&db->db_mtx);
449	ASSERT3U(db->db_state, ==, DB_READ);
450	/*
451	 * All reads are synchronous, so we must have a hold on the dbuf
452	 */
453	ASSERT(refcount_count(&db->db_holds) > 0);
454	ASSERT(db->db_buf == NULL);
455	ASSERT(db->db.db_data == NULL);
456	if (db->db_level == 0 && db->db_freed_in_flight) {
457		/* we were freed in flight; disregard any error */
458		arc_release(buf, db);
459		bzero(buf->b_data, db->db.db_size);
460		arc_buf_freeze(buf);
461		db->db_freed_in_flight = FALSE;
462		dbuf_set_data(db, buf);
463		db->db_state = DB_CACHED;
464	} else if (zio == NULL || zio->io_error == 0) {
465		dbuf_set_data(db, buf);
466		db->db_state = DB_CACHED;
467	} else {
468		ASSERT(db->db_blkid != DB_BONUS_BLKID);
469		ASSERT3P(db->db_buf, ==, NULL);
470		VERIFY(arc_buf_remove_ref(buf, db) == 1);
471		db->db_state = DB_UNCACHED;
472	}
473	cv_broadcast(&db->db_changed);
474	mutex_exit(&db->db_mtx);
475	dbuf_rele(db, NULL);
476}
477
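/*
 * Fill in the dbuf's contents: copy the bonus buffer, synthesize a zeroed
 * buffer for holes and freed blocks, or issue an ARC read.  Called with
 * db_mtx held; the mutex is dropped before returning.
 */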
478static void
479dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
480{
481	dnode_t *dn = db->db_dnode;
482	zbookmark_t zb;
483	uint32_t aflags = ARC_NOWAIT;
484	arc_buf_t *pbuf;
485
486	ASSERT(!refcount_is_zero(&db->db_holds));
487	/* We need the struct_rwlock to prevent db_blkptr from changing. */
488	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
489	ASSERT(MUTEX_HELD(&db->db_mtx));
490	ASSERT(db->db_state == DB_UNCACHED);
491	ASSERT(db->db_buf == NULL);
492
493	if (db->db_blkid == DB_BONUS_BLKID) {
494		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
495
496		ASSERT3U(bonuslen, <=, db->db.db_size);
497		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
498		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
499		if (bonuslen < DN_MAX_BONUSLEN)
500			bzero(db->db.db_data, DN_MAX_BONUSLEN);
501		if (bonuslen)
502			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
503		dbuf_update_data(db);
504		db->db_state = DB_CACHED;
505		mutex_exit(&db->db_mtx);
506		return;
507	}
508
509	/*
510	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
511	 * processes the delete record and clears the bp while we are waiting
512	 * for the dn_mtx (resulting in a "no" from block_freed).
513	 */
514	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
515	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
516	    BP_IS_HOLE(db->db_blkptr)))) {
517		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
518
519		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
520		    db->db.db_size, db, type));
521		bzero(db->db.db_data, db->db.db_size);
522		db->db_state = DB_CACHED;
523		*flags |= DB_RF_CACHED;
524		mutex_exit(&db->db_mtx);
525		return;
526	}
527
528	db->db_state = DB_READ;
529	mutex_exit(&db->db_mtx);
530
531	if (DBUF_IS_L2CACHEABLE(db))
532		aflags |= ARC_L2CACHE;
533
534	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
535	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
536	    db->db.db_object, db->db_level, db->db_blkid);
537
538	dbuf_add_ref(db, NULL);
539	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
540
541	if (db->db_parent)
542		pbuf = db->db_parent->db_buf;
543	else
544		pbuf = db->db_objset->os_phys_buf;
545
546	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
547	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
548	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
549	    &aflags, &zb);
550	if (aflags & ARC_CACHED)
551		*flags |= DB_RF_CACHED;
552}
553
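/*
 * Read in a dbuf's contents.  If the caller supplies a parent zio, the
 * read is issued as a child of that zio and the caller must wait on (and
 * check) it; otherwise a root zio is created and waited for here.
 */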
554int
555dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
556{
557	int err = 0;
558	int havepzio = (zio != NULL);
559	int prefetch;
560
561	/*
562	 * We don't have to hold the mutex to check db_state because it
563	 * can't be freed while we have a hold on the buffer.
564	 */
565	ASSERT(!refcount_is_zero(&db->db_holds));
566
567	if (db->db_state == DB_NOFILL)
568		return (EIO);
569
570	if ((flags & DB_RF_HAVESTRUCT) == 0)
571		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
572
573	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
574	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
575	    DBUF_IS_CACHEABLE(db);
576
577	mutex_enter(&db->db_mtx);
578	if (db->db_state == DB_CACHED) {
579		mutex_exit(&db->db_mtx);
580		if (prefetch)
581			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
582			    db->db.db_size, TRUE);
583		if ((flags & DB_RF_HAVESTRUCT) == 0)
584			rw_exit(&db->db_dnode->dn_struct_rwlock);
585	} else if (db->db_state == DB_UNCACHED) {
586		if (zio == NULL) {
587			zio = zio_root(db->db_dnode->dn_objset->os_spa,
588			    NULL, NULL, ZIO_FLAG_CANFAIL);
589		}
590		dbuf_read_impl(db, zio, &flags);
591
592		/* dbuf_read_impl has dropped db_mtx for us */
593
594		if (prefetch)
595			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
596			    db->db.db_size, flags & DB_RF_CACHED);
597
598		if ((flags & DB_RF_HAVESTRUCT) == 0)
599			rw_exit(&db->db_dnode->dn_struct_rwlock);
600
601		if (!havepzio)
602			err = zio_wait(zio);
603	} else {
604		mutex_exit(&db->db_mtx);
605		if (prefetch)
606			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
607			    db->db.db_size, TRUE);
608		if ((flags & DB_RF_HAVESTRUCT) == 0)
609			rw_exit(&db->db_dnode->dn_struct_rwlock);
610
611		mutex_enter(&db->db_mtx);
612		if ((flags & DB_RF_NEVERWAIT) == 0) {
613			while (db->db_state == DB_READ ||
614			    db->db_state == DB_FILL) {
615				ASSERT(db->db_state == DB_READ ||
616				    (flags & DB_RF_HAVESTRUCT) == 0);
617				cv_wait(&db->db_changed, &db->db_mtx);
618			}
619			if (db->db_state == DB_UNCACHED)
620				err = EIO;
621		}
622		mutex_exit(&db->db_mtx);
623	}
624
625	ASSERT(err || havepzio || db->db_state == DB_CACHED);
626	return (err);
627}
628
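/*
 * Prepare a dbuf to be completely overwritten without reading in its
 * current contents: allocate a fresh buffer and move the dbuf to DB_FILL.
 */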
629static void
630dbuf_noread(dmu_buf_impl_t *db)
631{
632	ASSERT(!refcount_is_zero(&db->db_holds));
633	ASSERT(db->db_blkid != DB_BONUS_BLKID);
634	mutex_enter(&db->db_mtx);
635	while (db->db_state == DB_READ || db->db_state == DB_FILL)
636		cv_wait(&db->db_changed, &db->db_mtx);
637	if (db->db_state == DB_UNCACHED) {
638		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
639
640		ASSERT(db->db_buf == NULL);
641		ASSERT(db->db.db_data == NULL);
642		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
643		    db->db.db_size, db, type));
644		db->db_state = DB_FILL;
645	} else if (db->db_state == DB_NOFILL) {
646		dbuf_set_data(db, NULL);
647	} else {
648		ASSERT3U(db->db_state, ==, DB_CACHED);
649	}
650	mutex_exit(&db->db_mtx);
651}
652
653/*
654 * This is our just-in-time copy function.  It makes a copy of
 * buffers that have been modified in a previous transaction
656 * group, before we modify them in the current active group.
657 *
658 * This function is used in two places: when we are dirtying a
659 * buffer for the first time in a txg, and when we are freeing
660 * a range in a dnode that includes this buffer.
661 *
662 * Note that when we are called from dbuf_free_range() we do
663 * not put a hold on the buffer, we just traverse the active
664 * dbuf list for the dnode.
665 */
666static void
667dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
668{
669	dbuf_dirty_record_t *dr = db->db_last_dirty;
670
671	ASSERT(MUTEX_HELD(&db->db_mtx));
672	ASSERT(db->db.db_data != NULL);
673	ASSERT(db->db_level == 0);
674	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
675
676	if (dr == NULL ||
677	    (dr->dt.dl.dr_data !=
678	    ((db->db_blkid  == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
679		return;
680
681	/*
682	 * If the last dirty record for this dbuf has not yet synced
	 * and it is referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
686	 *	just null out the current db_data pointer.
687	 */
688	ASSERT(dr->dr_txg >= txg - 2);
689	if (db->db_blkid == DB_BONUS_BLKID) {
690		/* Note that the data bufs here are zio_bufs */
691		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
692		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
693		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
694	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
695		int size = db->db.db_size;
696		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
697		dr->dt.dl.dr_data = arc_buf_alloc(
698		    db->db_dnode->dn_objset->os_spa, size, db, type);
699		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
700	} else {
701		dbuf_set_data(db, NULL);
702	}
703}
704
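/*
 * Undo an override (e.g. from dmu_sync()) on a level-0 dirty record: free
 * the already-written block and leave the buffer in a normal dirty state.
 */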
705void
706dbuf_unoverride(dbuf_dirty_record_t *dr)
707{
708	dmu_buf_impl_t *db = dr->dr_dbuf;
709	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
710	uint64_t txg = dr->dr_txg;
711
712	ASSERT(MUTEX_HELD(&db->db_mtx));
713	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
714	ASSERT(db->db_level == 0);
715
716	if (db->db_blkid == DB_BONUS_BLKID ||
717	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
718		return;
719
720	ASSERT(db->db_data_pending != dr);
721
722	/* free this block */
723	if (!BP_IS_HOLE(bp))
724		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
725
726	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
727	/*
728	 * Release the already-written buffer, so we leave it in
729	 * a consistent dirty state.  Note that all callers are
730	 * modifying the buffer, so they will immediately do
731	 * another (redundant) arc_release().  Therefore, leave
732	 * the buf thawed to save the effort of freezing &
733	 * immediately re-thawing it.
734	 */
735	arc_release(dr->dt.dl.dr_data, db);
736}
737
738/*
 * Evict (if it is unreferenced) or clear (if it is referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
742 * range that have not already been marked dirty, mark them dirty so
743 * they stay in memory.
744 */
745void
746dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
747{
748	dmu_buf_impl_t *db, *db_next;
749	uint64_t txg = tx->tx_txg;
750	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
751	uint64_t first_l1 = start >> epbs;
752	uint64_t last_l1 = end >> epbs;
753
754	if (end > dn->dn_maxblkid) {
755		end = dn->dn_maxblkid;
756		last_l1 = end >> epbs;
757	}
758	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
759	mutex_enter(&dn->dn_dbufs_mtx);
760	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
761		db_next = list_next(&dn->dn_dbufs, db);
762		ASSERT(db->db_blkid != DB_BONUS_BLKID);
763
764		if (db->db_level == 1 &&
765		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
766			mutex_enter(&db->db_mtx);
767			if (db->db_last_dirty &&
768			    db->db_last_dirty->dr_txg < txg) {
769				dbuf_add_ref(db, FTAG);
770				mutex_exit(&db->db_mtx);
771				dbuf_will_dirty(db, tx);
772				dbuf_rele(db, FTAG);
773			} else {
774				mutex_exit(&db->db_mtx);
775			}
776		}
777
778		if (db->db_level != 0)
779			continue;
780		dprintf_dbuf(db, "found buf %s\n", "");
781		if (db->db_blkid < start || db->db_blkid > end)
782			continue;
783
784		/* found a level 0 buffer in the range */
785		if (dbuf_undirty(db, tx))
786			continue;
787
788		mutex_enter(&db->db_mtx);
789		if (db->db_state == DB_UNCACHED ||
790		    db->db_state == DB_NOFILL ||
791		    db->db_state == DB_EVICTING) {
792			ASSERT(db->db.db_data == NULL);
793			mutex_exit(&db->db_mtx);
794			continue;
795		}
796		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
797			/* will be handled in dbuf_read_done or dbuf_rele */
798			db->db_freed_in_flight = TRUE;
799			mutex_exit(&db->db_mtx);
800			continue;
801		}
802		if (refcount_count(&db->db_holds) == 0) {
803			ASSERT(db->db_buf);
804			dbuf_clear(db);
805			continue;
806		}
807		/* The dbuf is referenced */
808
809		if (db->db_last_dirty != NULL) {
810			dbuf_dirty_record_t *dr = db->db_last_dirty;
811
812			if (dr->dr_txg == txg) {
813				/*
814				 * This buffer is "in-use", re-adjust the file
815				 * size to reflect that this buffer may
816				 * contain new data when we sync.
817				 */
818				if (db->db_blkid > dn->dn_maxblkid)
819					dn->dn_maxblkid = db->db_blkid;
820				dbuf_unoverride(dr);
821			} else {
822				/*
823				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it is not referenced in
825				 * the open context) or reset its contents to
826				 * empty.
827				 */
828				dbuf_fix_old_data(db, txg);
829			}
830		}
		/* clear the contents if it is cached */
832		if (db->db_state == DB_CACHED) {
833			ASSERT(db->db.db_data != NULL);
834			arc_release(db->db_buf, db);
835			bzero(db->db.db_data, db->db.db_size);
836			arc_buf_freeze(db->db_buf);
837		}
838
839		mutex_exit(&db->db_mtx);
840	}
841	mutex_exit(&dn->dn_dbufs_mtx);
842}
843
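/*
 * Determine whether the block backing this dbuf could be freed, based on
 * the birth txg of its last dirty record or its block pointer.
 */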
844static int
845dbuf_block_freeable(dmu_buf_impl_t *db)
846{
847	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
848	uint64_t birth_txg = 0;
849
850	/*
851	 * We don't need any locking to protect db_blkptr:
852	 * If it's syncing, then db_last_dirty will be set
853	 * so we'll ignore db_blkptr.
854	 */
855	ASSERT(MUTEX_HELD(&db->db_mtx));
856	if (db->db_last_dirty)
857		birth_txg = db->db_last_dirty->dr_txg;
858	else if (db->db_blkptr)
859		birth_txg = db->db_blkptr->blk_birth;
860
861	/* If we don't exist or are in a snapshot, we can't be freed */
862	if (birth_txg)
863		return (ds == NULL ||
864		    dsl_dataset_block_freeable(ds, birth_txg));
865	else
866		return (FALSE);
867}
868
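/*
 * Change the size of a dbuf: dirty it, copy the old contents into a new
 * buffer of the requested size, and zero any newly added space.
 */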
869void
870dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
871{
872	arc_buf_t *buf, *obuf;
873	int osize = db->db.db_size;
874	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
875
876	ASSERT(db->db_blkid != DB_BONUS_BLKID);
877
878	/* XXX does *this* func really need the lock? */
879	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
880
881	/*
882	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
883	 * is OK, because there can be no other references to the db
884	 * when we are changing its size, so no concurrent DB_FILL can
885	 * be happening.
886	 */
887	/*
888	 * XXX we should be doing a dbuf_read, checking the return
889	 * value and returning that up to our callers
890	 */
891	dbuf_will_dirty(db, tx);
892
893	/* create the data buffer for the new block */
894	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
895
896	/* copy old block data to the new block */
897	obuf = db->db_buf;
898	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
899	/* zero the remainder */
900	if (size > osize)
901		bzero((uint8_t *)buf->b_data + osize, size - osize);
902
903	mutex_enter(&db->db_mtx);
904	dbuf_set_data(db, buf);
905	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
906	db->db.db_size = size;
907
908	if (db->db_level == 0) {
909		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
910		db->db_last_dirty->dt.dl.dr_data = buf;
911	}
912	mutex_exit(&db->db_mtx);
913
914	dnode_willuse_space(db->db_dnode, size-osize, tx);
915}
916
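/*
 * Mark a dbuf dirty in the given transaction, creating (or reusing) the
 * dirty record for this txg and recursively dirtying the parent so the
 * change is reachable from the dnode when the txg syncs.
 */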
917dbuf_dirty_record_t *
918dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
919{
920	dnode_t *dn = db->db_dnode;
921	objset_t *os = dn->dn_objset;
922	dbuf_dirty_record_t **drp, *dr;
923	int drop_struct_lock = FALSE;
924	boolean_t do_free_accounting = B_FALSE;
925	int txgoff = tx->tx_txg & TXG_MASK;
926
927	ASSERT(tx->tx_txg != 0);
928	ASSERT(!refcount_is_zero(&db->db_holds));
929	DMU_TX_DIRTY_BUF(tx, db);
930
931	/*
932	 * Shouldn't dirty a regular buffer in syncing context.  Private
933	 * objects may be dirtied in syncing context, but only if they
934	 * were already pre-dirtied in open context.
935	 */
936	ASSERT(!dmu_tx_is_syncing(tx) ||
937	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
938	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
939	    dn->dn_objset->os_dsl_dataset == NULL);
940	/*
941	 * We make this assert for private objects as well, but after we
942	 * check if we're already dirty.  They are allowed to re-dirty
943	 * in syncing context.
944	 */
945	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
946	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
947	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
948
949	mutex_enter(&db->db_mtx);
950	/*
951	 * XXX make this true for indirects too?  The problem is that
952	 * transactions created with dmu_tx_create_assigned() from
953	 * syncing context don't bother holding ahead.
954	 */
955	ASSERT(db->db_level != 0 ||
956	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
957	    db->db_state == DB_NOFILL);
958
959	mutex_enter(&dn->dn_mtx);
960	/*
961	 * Don't set dirtyctx to SYNC if we're just modifying this as we
962	 * initialize the objset.
963	 */
964	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
965	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
966		dn->dn_dirtyctx =
967		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
968		ASSERT(dn->dn_dirtyctx_firstset == NULL);
969		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
970	}
971	mutex_exit(&dn->dn_mtx);
972
973	/*
974	 * If this buffer is already dirty, we're done.
975	 */
976	drp = &db->db_last_dirty;
977	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
978	    db->db.db_object == DMU_META_DNODE_OBJECT);
979	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
980		drp = &dr->dr_next;
981	if (dr && dr->dr_txg == tx->tx_txg) {
982		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
983			/*
984			 * If this buffer has already been written out,
985			 * we now need to reset its state.
986			 */
987			dbuf_unoverride(dr);
988			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
989			    db->db_state != DB_NOFILL)
990				arc_buf_thaw(db->db_buf);
991		}
992		mutex_exit(&db->db_mtx);
993		return (dr);
994	}
995
996	/*
997	 * Only valid if not already dirty.
998	 */
999	ASSERT(dn->dn_object == 0 ||
1000	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1001	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1002
1003	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1004	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1005	    dn->dn_phys->dn_nlevels > db->db_level ||
1006	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1007	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1008	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1009
1010	/*
1011	 * We should only be dirtying in syncing context if it's the
1012	 * mos or we're initializing the os or it's a special object.
1013	 * However, we are allowed to dirty in syncing context provided
1014	 * we already dirtied it in open context.  Hence we must make
1015	 * this assertion only if we're not already dirty.
1016	 */
1017	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1018	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1019	ASSERT(db->db.db_size != 0);
1020
1021	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1022
1023	if (db->db_blkid != DB_BONUS_BLKID) {
1024		/*
1025		 * Update the accounting.
1026		 * Note: we delay "free accounting" until after we drop
1027		 * the db_mtx.  This keeps us from grabbing other locks
1028		 * (and possibly deadlocking) in bp_get_dsize() while
1029		 * also holding the db_mtx.
1030		 */
1031		dnode_willuse_space(dn, db->db.db_size, tx);
1032		do_free_accounting = dbuf_block_freeable(db);
1033	}
1034
1035	/*
1036	 * If this buffer is dirty in an old transaction group we need
1037	 * to make a copy of it so that the changes we make in this
1038	 * transaction group won't leak out when we sync the older txg.
1039	 */
1040	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1041	if (db->db_level == 0) {
1042		void *data_old = db->db_buf;
1043
1044		if (db->db_state != DB_NOFILL) {
1045			if (db->db_blkid == DB_BONUS_BLKID) {
1046				dbuf_fix_old_data(db, tx->tx_txg);
1047				data_old = db->db.db_data;
1048			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1049				/*
1050				 * Release the data buffer from the cache so
1051				 * that we can modify it without impacting
1052				 * possible other users of this cached data
1053				 * block.  Note that indirect blocks and
1054				 * private objects are not released until the
1055				 * syncing state (since they are only modified
1056				 * then).
1057				 */
1058				arc_release(db->db_buf, db);
1059				dbuf_fix_old_data(db, tx->tx_txg);
1060				data_old = db->db_buf;
1061			}
1062			ASSERT(data_old != NULL);
1063		}
1064		dr->dt.dl.dr_data = data_old;
1065	} else {
1066		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1067		list_create(&dr->dt.di.dr_children,
1068		    sizeof (dbuf_dirty_record_t),
1069		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1070	}
1071	dr->dr_dbuf = db;
1072	dr->dr_txg = tx->tx_txg;
1073	dr->dr_next = *drp;
1074	*drp = dr;
1075
1076	/*
1077	 * We could have been freed_in_flight between the dbuf_noread
1078	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1079	 * happened after the free.
1080	 */
1081	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
1082		mutex_enter(&dn->dn_mtx);
1083		dnode_clear_range(dn, db->db_blkid, 1, tx);
1084		mutex_exit(&dn->dn_mtx);
1085		db->db_freed_in_flight = FALSE;
1086	}
1087
1088	/*
1089	 * This buffer is now part of this txg
1090	 */
1091	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1092	db->db_dirtycnt += 1;
1093	ASSERT3U(db->db_dirtycnt, <=, 3);
1094
1095	mutex_exit(&db->db_mtx);
1096
1097	if (db->db_blkid == DB_BONUS_BLKID) {
1098		mutex_enter(&dn->dn_mtx);
1099		ASSERT(!list_link_active(&dr->dr_dirty_node));
1100		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1101		mutex_exit(&dn->dn_mtx);
1102		dnode_setdirty(dn, tx);
1103		return (dr);
1104	} else if (do_free_accounting) {
1105		blkptr_t *bp = db->db_blkptr;
1106		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1107		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1108		/*
1109		 * This is only a guess -- if the dbuf is dirty
1110		 * in a previous txg, we don't know how much
1111		 * space it will use on disk yet.  We should
1112		 * really have the struct_rwlock to access
1113		 * db_blkptr, but since this is just a guess,
1114		 * it's OK if we get an odd answer.
1115		 */
1116		dnode_willuse_space(dn, -willfree, tx);
1117	}
1118
1119	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1120		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1121		drop_struct_lock = TRUE;
1122	}
1123
1124	if (db->db_level == 0) {
1125		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1126		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1127	}
1128
1129	if (db->db_level+1 < dn->dn_nlevels) {
1130		dmu_buf_impl_t *parent = db->db_parent;
1131		dbuf_dirty_record_t *di;
1132		int parent_held = FALSE;
1133
1134		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1135			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1136
1137			parent = dbuf_hold_level(dn, db->db_level+1,
1138			    db->db_blkid >> epbs, FTAG);
1139			parent_held = TRUE;
1140		}
1141		if (drop_struct_lock)
1142			rw_exit(&dn->dn_struct_rwlock);
1143		ASSERT3U(db->db_level+1, ==, parent->db_level);
1144		di = dbuf_dirty(parent, tx);
1145		if (parent_held)
1146			dbuf_rele(parent, FTAG);
1147
1148		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
1150		if (db->db_last_dirty == dr ||
1151		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1152			mutex_enter(&di->dt.di.dr_mtx);
1153			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1154			ASSERT(!list_link_active(&dr->dr_dirty_node));
1155			list_insert_tail(&di->dt.di.dr_children, dr);
1156			mutex_exit(&di->dt.di.dr_mtx);
1157			dr->dr_parent = di;
1158		}
1159		mutex_exit(&db->db_mtx);
1160	} else {
1161		ASSERT(db->db_level+1 == dn->dn_nlevels);
1162		ASSERT(db->db_blkid < dn->dn_nblkptr);
1163		ASSERT(db->db_parent == NULL ||
1164		    db->db_parent == db->db_dnode->dn_dbuf);
1165		mutex_enter(&dn->dn_mtx);
1166		ASSERT(!list_link_active(&dr->dr_dirty_node));
1167		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1168		mutex_exit(&dn->dn_mtx);
1169		if (drop_struct_lock)
1170			rw_exit(&dn->dn_struct_rwlock);
1171	}
1172
1173	dnode_setdirty(dn, tx);
1174	return (dr);
1175}
1176
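/*
 * Undo a dirty operation for the given transaction, if possible.  Returns
 * nonzero if dropping the dirty hold caused the dbuf to be evicted.
 */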
1177static int
1178dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1179{
1180	dnode_t *dn = db->db_dnode;
1181	uint64_t txg = tx->tx_txg;
1182	dbuf_dirty_record_t *dr, **drp;
1183
1184	ASSERT(txg != 0);
1185	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1186
1187	mutex_enter(&db->db_mtx);
1188	/*
1189	 * If this buffer is not dirty, we're done.
1190	 */
1191	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1192		if (dr->dr_txg <= txg)
1193			break;
1194	if (dr == NULL || dr->dr_txg < txg) {
1195		mutex_exit(&db->db_mtx);
1196		return (0);
1197	}
1198	ASSERT(dr->dr_txg == txg);
1199	ASSERT(dr->dr_dbuf == db);
1200
1201	/*
1202	 * If this buffer is currently held, we cannot undirty
1203	 * it, since one of the current holders may be in the
1204	 * middle of an update.  Note that users of dbuf_undirty()
1205	 * should not place a hold on the dbuf before the call.
1206	 */
1207	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
1208		mutex_exit(&db->db_mtx);
1209		/* Make sure we don't toss this buffer at sync phase */
1210		mutex_enter(&dn->dn_mtx);
1211		dnode_clear_range(dn, db->db_blkid, 1, tx);
1212		mutex_exit(&dn->dn_mtx);
1213		return (0);
1214	}
1215
1216	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1217
1218	ASSERT(db->db.db_size != 0);
1219
1220	/* XXX would be nice to fix up dn_towrite_space[] */
1221
1222	*drp = dr->dr_next;
1223
1224	if (dr->dr_parent) {
1225		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1226		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1227		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1228	} else if (db->db_level+1 == dn->dn_nlevels) {
1229		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1230		mutex_enter(&dn->dn_mtx);
1231		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1232		mutex_exit(&dn->dn_mtx);
1233	}
1234
1235	if (db->db_level == 0) {
1236		if (db->db_state != DB_NOFILL) {
1237			dbuf_unoverride(dr);
1238
1239			ASSERT(db->db_buf != NULL);
1240			ASSERT(dr->dt.dl.dr_data != NULL);
1241			if (dr->dt.dl.dr_data != db->db_buf)
1242				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
1243				    db) == 1);
1244		}
1245	} else {
1246		ASSERT(db->db_buf != NULL);
1247		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1248		mutex_destroy(&dr->dt.di.dr_mtx);
1249		list_destroy(&dr->dt.di.dr_children);
1250	}
1251	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1252
1253	ASSERT(db->db_dirtycnt > 0);
1254	db->db_dirtycnt -= 1;
1255
1256	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1257		arc_buf_t *buf = db->db_buf;
1258
1259		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1260		dbuf_set_data(db, NULL);
1261		VERIFY(arc_buf_remove_ref(buf, db) == 1);
1262		dbuf_evict(db);
1263		return (1);
1264	}
1265
1266	mutex_exit(&db->db_mtx);
1267	return (0);
1268}
1269
1270#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1271void
1272dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1273{
1274	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1275
1276	ASSERT(tx->tx_txg != 0);
1277	ASSERT(!refcount_is_zero(&db->db_holds));
1278
1279	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
1280		rf |= DB_RF_HAVESTRUCT;
1281	(void) dbuf_read(db, NULL, rf);
1282	(void) dbuf_dirty(db, tx);
1283}
1284
1285void
1286dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1287{
1288	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1289
1290	db->db_state = DB_NOFILL;
1291
1292	dmu_buf_will_fill(db_fake, tx);
1293}
1294
1295void
1296dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1297{
1298	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1299
1300	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1301	ASSERT(tx->tx_txg != 0);
1302	ASSERT(db->db_level == 0);
1303	ASSERT(!refcount_is_zero(&db->db_holds));
1304
1305	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1306	    dmu_tx_private_ok(tx));
1307
1308	dbuf_noread(db);
1309	(void) dbuf_dirty(db, tx);
1310}
1311
1312#pragma weak dmu_buf_fill_done = dbuf_fill_done
1313/* ARGSUSED */
1314void
1315dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1316{
1317	mutex_enter(&db->db_mtx);
1318	DBUF_VERIFY(db);
1319
1320	if (db->db_state == DB_FILL) {
1321		if (db->db_level == 0 && db->db_freed_in_flight) {
1322			ASSERT(db->db_blkid != DB_BONUS_BLKID);
1323			/* we were freed while filling */
1324			/* XXX dbuf_undirty? */
1325			bzero(db->db.db_data, db->db.db_size);
1326			db->db_freed_in_flight = FALSE;
1327		}
1328		db->db_state = DB_CACHED;
1329		cv_broadcast(&db->db_changed);
1330	}
1331	mutex_exit(&db->db_mtx);
1332}
1333
1334/*
1335 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1336 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1337 */
1338void
1339dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1340{
1341	ASSERT(!refcount_is_zero(&db->db_holds));
1342	ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
1343	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1344	ASSERT(db->db_level == 0);
1345	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1346	ASSERT(buf != NULL);
1347	ASSERT(arc_buf_size(buf) == db->db.db_size);
1348	ASSERT(tx->tx_txg != 0);
1349
1350	arc_return_buf(buf, db);
1351	ASSERT(arc_released(buf));
1352
1353	mutex_enter(&db->db_mtx);
1354
1355	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1356		cv_wait(&db->db_changed, &db->db_mtx);
1357
1358	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1359
1360	if (db->db_state == DB_CACHED &&
1361	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1362		mutex_exit(&db->db_mtx);
1363		(void) dbuf_dirty(db, tx);
1364		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1365		VERIFY(arc_buf_remove_ref(buf, db) == 1);
1366		xuio_stat_wbuf_copied();
1367		return;
1368	}
1369
1370	xuio_stat_wbuf_nocopy();
1371	if (db->db_state == DB_CACHED) {
1372		dbuf_dirty_record_t *dr = db->db_last_dirty;
1373
1374		ASSERT(db->db_buf != NULL);
1375		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1376			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1377			if (!arc_released(db->db_buf)) {
1378				ASSERT(dr->dt.dl.dr_override_state ==
1379				    DR_OVERRIDDEN);
1380				arc_release(db->db_buf, db);
1381			}
1382			dr->dt.dl.dr_data = buf;
1383			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
1384		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1385			arc_release(db->db_buf, db);
1386			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
1387		}
1388		db->db_buf = NULL;
1389	}
1390	ASSERT(db->db_buf == NULL);
1391	dbuf_set_data(db, buf);
1392	db->db_state = DB_FILL;
1393	mutex_exit(&db->db_mtx);
1394	(void) dbuf_dirty(db, tx);
1395	dbuf_fill_done(db, tx);
1396}
1397
1398/*
1399 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
1401 * when we are not holding the dn_dbufs_mtx, we can't clear the
1402 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1403 * in this case.  For callers from the DMU we will usually see:
1404 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1405 * For the arc callback, we will usually see:
1406 * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1407 * Sometimes, though, we will get a mix of these two:
1408 *	DMU: dbuf_clear()->arc_buf_evict()
1409 *	ARC: dbuf_do_evict()->dbuf_destroy()
1410 */
1411void
1412dbuf_clear(dmu_buf_impl_t *db)
1413{
1414	dnode_t *dn = db->db_dnode;
1415	dmu_buf_impl_t *parent = db->db_parent;
1416	dmu_buf_impl_t *dndb = dn->dn_dbuf;
1417	int dbuf_gone = FALSE;
1418
1419	ASSERT(MUTEX_HELD(&db->db_mtx));
1420	ASSERT(refcount_is_zero(&db->db_holds));
1421
1422	dbuf_evict_user(db);
1423
1424	if (db->db_state == DB_CACHED) {
1425		ASSERT(db->db.db_data != NULL);
1426		if (db->db_blkid == DB_BONUS_BLKID) {
1427			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1428			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1429		}
1430		db->db.db_data = NULL;
1431		db->db_state = DB_UNCACHED;
1432	}
1433
1434	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1435	ASSERT(db->db_data_pending == NULL);
1436
1437	db->db_state = DB_EVICTING;
1438	db->db_blkptr = NULL;
1439
1440	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1441		list_remove(&dn->dn_dbufs, db);
1442		dnode_rele(dn, db);
1443		db->db_dnode = NULL;
1444	}
1445
1446	if (db->db_buf)
1447		dbuf_gone = arc_buf_evict(db->db_buf);
1448
1449	if (!dbuf_gone)
1450		mutex_exit(&db->db_mtx);
1451
1452	/*
	 * If this dbuf is referenced from an indirect dbuf,
1454	 * decrement the ref count on the indirect dbuf.
1455	 */
1456	if (parent && parent != dndb)
1457		dbuf_rele(parent, db);
1458}
1459
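/*
 * Find the parent dbuf and block pointer that reference the given block.
 * Returns ENOENT if the block does not yet have a parent.
 */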
1460static int
1461dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1462    dmu_buf_impl_t **parentp, blkptr_t **bpp)
1463{
1464	int nlevels, epbs;
1465
1466	*parentp = NULL;
1467	*bpp = NULL;
1468
1469	ASSERT(blkid != DB_BONUS_BLKID);
1470
1471	if (dn->dn_phys->dn_nlevels == 0)
1472		nlevels = 1;
1473	else
1474		nlevels = dn->dn_phys->dn_nlevels;
1475
1476	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1477
1478	ASSERT3U(level * epbs, <, 64);
1479	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1480	if (level >= nlevels ||
1481	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1482		/* the buffer has no parent yet */
1483		return (ENOENT);
1484	} else if (level < nlevels-1) {
1485		/* this block is referenced from an indirect block */
1486		int err = dbuf_hold_impl(dn, level+1,
1487		    blkid >> epbs, fail_sparse, NULL, parentp);
1488		if (err)
1489			return (err);
1490		err = dbuf_read(*parentp, NULL,
1491		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1492		if (err) {
1493			dbuf_rele(*parentp, NULL);
1494			*parentp = NULL;
1495			return (err);
1496		}
1497		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1498		    (blkid & ((1ULL << epbs) - 1));
1499		return (0);
1500	} else {
1501		/* the block is referenced from the dnode */
1502		ASSERT3U(level, ==, nlevels-1);
1503		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1504		    blkid < dn->dn_phys->dn_nblkptr);
1505		if (dn->dn_dbuf) {
1506			dbuf_add_ref(dn->dn_dbuf, NULL);
1507			*parentp = dn->dn_dbuf;
1508		}
1509		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1510		return (0);
1511	}
1512}
1513
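/*
 * Allocate and initialize a dbuf for the given block, adding it to the
 * hash table and the dnode's dbuf list (the bonus dbuf is not hashed).
 * If another thread creates the same dbuf first, the existing dbuf is
 * returned instead.
 */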
1514static dmu_buf_impl_t *
1515dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1516    dmu_buf_impl_t *parent, blkptr_t *blkptr)
1517{
1518	objset_t *os = dn->dn_objset;
1519	dmu_buf_impl_t *db, *odb;
1520
1521	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1522	ASSERT(dn->dn_type != DMU_OT_NONE);
1523
1524	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1525
1526	db->db_objset = os;
1527	db->db.db_object = dn->dn_object;
1528	db->db_level = level;
1529	db->db_blkid = blkid;
1530	db->db_last_dirty = NULL;
1531	db->db_dirtycnt = 0;
1532	db->db_dnode = dn;
1533	db->db_parent = parent;
1534	db->db_blkptr = blkptr;
1535
1536	db->db_user_ptr = NULL;
1537	db->db_user_data_ptr_ptr = NULL;
1538	db->db_evict_func = NULL;
1539	db->db_immediate_evict = 0;
1540	db->db_freed_in_flight = 0;
1541
1542	if (blkid == DB_BONUS_BLKID) {
1543		ASSERT3P(parent, ==, dn->dn_dbuf);
1544		db->db.db_size = DN_MAX_BONUSLEN -
1545		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1546		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1547		db->db.db_offset = DB_BONUS_BLKID;
1548		db->db_state = DB_UNCACHED;
1549		/* the bonus dbuf is not placed in the hash table */
1550		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1551		return (db);
1552	} else {
1553		int blocksize =
1554		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
1555		db->db.db_size = blocksize;
1556		db->db.db_offset = db->db_blkid * blocksize;
1557	}
1558
1559	/*
1560	 * Hold the dn_dbufs_mtx while we get the new dbuf
1561	 * in the hash table *and* added to the dbufs list.
1562	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it is added to the
1564	 * dn_dbufs list.
1565	 */
1566	mutex_enter(&dn->dn_dbufs_mtx);
1567	db->db_state = DB_EVICTING;
1568	if ((odb = dbuf_hash_insert(db)) != NULL) {
1569		/* someone else inserted it first */
1570		kmem_cache_free(dbuf_cache, db);
1571		mutex_exit(&dn->dn_dbufs_mtx);
1572		return (odb);
1573	}
1574	list_insert_head(&dn->dn_dbufs, db);
1575	db->db_state = DB_UNCACHED;
1576	mutex_exit(&dn->dn_dbufs_mtx);
1577	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1578
1579	if (parent && parent != dn->dn_dbuf)
1580		dbuf_add_ref(parent, db);
1581
1582	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1583	    refcount_count(&dn->dn_holds) > 0);
1584	(void) refcount_add(&dn->dn_holds, db);
1585
1586	dprintf_dbuf(db, "db=%p\n", db);
1587
1588	return (db);
1589}
1590
1591static int
1592dbuf_do_evict(void *private)
1593{
1594	arc_buf_t *buf = private;
1595	dmu_buf_impl_t *db = buf->b_private;
1596
1597	if (!MUTEX_HELD(&db->db_mtx))
1598		mutex_enter(&db->db_mtx);
1599
1600	ASSERT(refcount_is_zero(&db->db_holds));
1601
1602	if (db->db_state != DB_EVICTING) {
1603		ASSERT(db->db_state == DB_CACHED);
1604		DBUF_VERIFY(db);
1605		db->db_buf = NULL;
1606		dbuf_evict(db);
1607	} else {
1608		mutex_exit(&db->db_mtx);
1609		dbuf_destroy(db);
1610	}
1611	return (0);
1612}
1613
1614static void
1615dbuf_destroy(dmu_buf_impl_t *db)
1616{
1617	ASSERT(refcount_is_zero(&db->db_holds));
1618
1619	if (db->db_blkid != DB_BONUS_BLKID) {
1620		/*
1621		 * If this dbuf is still on the dn_dbufs list,
1622		 * remove it from that list.
1623		 */
1624		if (db->db_dnode) {
1625			dnode_t *dn = db->db_dnode;
1626
1627			mutex_enter(&dn->dn_dbufs_mtx);
1628			list_remove(&dn->dn_dbufs, db);
1629			mutex_exit(&dn->dn_dbufs_mtx);
1630
1631			dnode_rele(dn, db);
1632			db->db_dnode = NULL;
1633		}
1634		dbuf_hash_remove(db);
1635	}
1636	db->db_parent = NULL;
1637	db->db_buf = NULL;
1638
1639	ASSERT(!list_link_active(&db->db_link));
1640	ASSERT(db->db.db_data == NULL);
1641	ASSERT(db->db_hash_next == NULL);
1642	ASSERT(db->db_blkptr == NULL);
1643	ASSERT(db->db_data_pending == NULL);
1644
1645	kmem_cache_free(dbuf_cache, db);
1646	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1647}
1648
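/*
 * Issue a speculative (prefetch) read for the given block if it is not
 * already cached, freed, or actively held.
 */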
1649void
1650dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1651{
1652	dmu_buf_impl_t *db = NULL;
1653	blkptr_t *bp = NULL;
1654
1655	ASSERT(blkid != DB_BONUS_BLKID);
1656	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1657
1658	if (dnode_block_freed(dn, blkid))
1659		return;
1660
1661	/* dbuf_find() returns with db_mtx held */
	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1663		if (refcount_count(&db->db_holds) > 0) {
1664			/*
1665			 * This dbuf is active.  We assume that it is
1666			 * already CACHED, or else about to be either
1667			 * read or filled.
1668			 */
1669			mutex_exit(&db->db_mtx);
1670			return;
1671		}
1672		mutex_exit(&db->db_mtx);
1673		db = NULL;
1674	}
1675
1676	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1677		if (bp && !BP_IS_HOLE(bp)) {
1678			arc_buf_t *pbuf;
1679			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1680			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1681			zbookmark_t zb;
1682
1683			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1684			    dn->dn_object, 0, blkid);
1685
1686			if (db)
1687				pbuf = db->db_buf;
1688			else
1689				pbuf = dn->dn_objset->os_phys_buf;
1690
1691			(void) arc_read(NULL, dn->dn_objset->os_spa,
1692			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
1693			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1694			    &aflags, &zb);
1695		}
1696		if (db)
1697			dbuf_rele(db, NULL);
1698	}
1699}
1700
1701/*
1702 * Returns with db_holds incremented, and db_mtx not held.
1703 * Note: dn_struct_rwlock must be held.
1704 */
1705int
1706dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1707    void *tag, dmu_buf_impl_t **dbp)
1708{
1709	dmu_buf_impl_t *db, *parent = NULL;
1710
1711	ASSERT(blkid != DB_BONUS_BLKID);
1712	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1713	ASSERT3U(dn->dn_nlevels, >, level);
1714
1715	*dbp = NULL;
1716top:
1717	/* dbuf_find() returns with db_mtx held */
1718	db = dbuf_find(dn, level, blkid);
1719
1720	if (db == NULL) {
1721		blkptr_t *bp = NULL;
1722		int err;
1723
1724		ASSERT3P(parent, ==, NULL);
1725		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1726		if (fail_sparse) {
1727			if (err == 0 && bp && BP_IS_HOLE(bp))
1728				err = ENOENT;
1729			if (err) {
1730				if (parent)
1731					dbuf_rele(parent, NULL);
1732				return (err);
1733			}
1734		}
1735		if (err && err != ENOENT)
1736			return (err);
1737		db = dbuf_create(dn, level, blkid, parent, bp);
1738	}
1739
1740	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1741		arc_buf_add_ref(db->db_buf, db);
1742		if (db->db_buf->b_data == NULL) {
1743			dbuf_clear(db);
1744			if (parent) {
1745				dbuf_rele(parent, NULL);
1746				parent = NULL;
1747			}
1748			goto top;
1749		}
1750		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1751	}
1752
1753	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1754
1755	/*
	 * If this buffer is currently syncing out, and we are
1757	 * still referencing it from db_data, we need to make a copy
1758	 * of it in case we decide we want to dirty it again in this txg.
1759	 */
1760	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
1761	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1762	    db->db_state == DB_CACHED && db->db_data_pending) {
1763		dbuf_dirty_record_t *dr = db->db_data_pending;
1764
1765		if (dr->dt.dl.dr_data == db->db_buf) {
1766			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1767
1768			dbuf_set_data(db,
1769			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
1770			    db->db.db_size, db, type));
1771			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1772			    db->db.db_size);
1773		}
1774	}
1775
1776	(void) refcount_add(&db->db_holds, tag);
1777	dbuf_update_data(db);
1778	DBUF_VERIFY(db);
1779	mutex_exit(&db->db_mtx);
1780
1781	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1782	if (parent)
1783		dbuf_rele(parent, NULL);
1784
1785	ASSERT3P(db->db_dnode, ==, dn);
1786	ASSERT3U(db->db_blkid, ==, blkid);
1787	ASSERT3U(db->db_level, ==, level);
1788	*dbp = db;
1789
1790	return (0);
1791}
1792
1793dmu_buf_impl_t *
1794dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1795{
1796	dmu_buf_impl_t *db;
1797	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1798	return (err ? NULL : db);
1799}
1800
1801dmu_buf_impl_t *
1802dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1803{
1804	dmu_buf_impl_t *db;
1805	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1806	return (err ? NULL : db);
1807}
1808
1809void
1810dbuf_create_bonus(dnode_t *dn)
1811{
1812	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1813
1814	ASSERT(dn->dn_bonus == NULL);
1815	dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
1816}
1817
1818#pragma weak dmu_buf_add_ref = dbuf_add_ref
1819void
1820dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1821{
1822	int64_t holds = refcount_add(&db->db_holds, tag);
1823	ASSERT(holds > 1);
1824}
1825
1826#pragma weak dmu_buf_rele = dbuf_rele
1827void
1828dbuf_rele(dmu_buf_impl_t *db, void *tag)
1829{
1830	mutex_enter(&db->db_mtx);
1831	dbuf_rele_and_unlock(db, tag);
1832}
1833
1834/*
1835 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
1836 * db_dirtycnt and db_holds to be updated atomically.
1837 */
1838void
1839dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
1840{
1841	int64_t holds;
1842
1843	ASSERT(MUTEX_HELD(&db->db_mtx));
1844	DBUF_VERIFY(db);
1845
1846	holds = refcount_remove(&db->db_holds, tag);
1847	ASSERT(holds >= 0);
1848
1849	/*
1850	 * We can't freeze indirects if there is a possibility that they
1851	 * may be modified in the current syncing context.
1852	 */
1853	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
1854		arc_buf_freeze(db->db_buf);
1855
1856	if (holds == db->db_dirtycnt &&
1857	    db->db_level == 0 && db->db_immediate_evict)
1858		dbuf_evict_user(db);
1859
1860	if (holds == 0) {
1861		if (db->db_blkid == DB_BONUS_BLKID) {
1862			mutex_exit(&db->db_mtx);
1863			dnode_rele(db->db_dnode, db);
1864		} else if (db->db_buf == NULL) {
1865			/*
1866			 * This is a special case: we never associated this
1867			 * dbuf with any data allocated from the ARC.
1868			 */
1869			ASSERT(db->db_state == DB_UNCACHED ||
1870			    db->db_state == DB_NOFILL);
1871			dbuf_evict(db);
1872		} else if (arc_released(db->db_buf)) {
1873			arc_buf_t *buf = db->db_buf;
1874			/*
1875			 * This dbuf has anonymous data associated with it.
1876			 */
1877			dbuf_set_data(db, NULL);
1878			VERIFY(arc_buf_remove_ref(buf, db) == 1);
1879			dbuf_evict(db);
1880		} else {
1881			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
1882			if (!DBUF_IS_CACHEABLE(db))
1883				dbuf_clear(db);
1884			else
1885				mutex_exit(&db->db_mtx);
1886		}
1887	} else {
1888		mutex_exit(&db->db_mtx);
1889	}
1890}
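
/*
 * Worked example (editorial): consider a level-0 dbuf that is dirty in two
 * not-yet-synced txgs, so db_dirtycnt == 2 and two of its holds belong to
 * those dirty records.  While a reader also holds it there are three holds;
 * when the reader releases, refcount_remove() returns 2, which equals
 * db_dirtycnt, so the ARC buffer may be frozen and, if db_immediate_evict
 * is set, the user eviction callback fires.  Only after both dirty records
 * are written out and drop their holds does the count reach 0 and the
 * eviction logic above run.
 */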
1891
1892#pragma weak dmu_buf_refcount = dbuf_refcount
1893uint64_t
1894dbuf_refcount(dmu_buf_impl_t *db)
1895{
1896	return (refcount_count(&db->db_holds));
1897}
1898
1899void *
1900dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1901    dmu_buf_evict_func_t *evict_func)
1902{
1903	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1904	    user_data_ptr_ptr, evict_func));
1905}
1906
1907void *
1908dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1909    dmu_buf_evict_func_t *evict_func)
1910{
1911	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1912
1913	db->db_immediate_evict = TRUE;
1914	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1915	    user_data_ptr_ptr, evict_func));
1916}
1917
1918void *
1919dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
1920    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
1921{
1922	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1923	ASSERT(db->db_level == 0);
1924
1925	ASSERT((user_ptr == NULL) == (evict_func == NULL));
1926
1927	mutex_enter(&db->db_mtx);
1928
1929	if (db->db_user_ptr == old_user_ptr) {
1930		db->db_user_ptr = user_ptr;
1931		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
1932		db->db_evict_func = evict_func;
1933
1934		dbuf_update_data(db);
1935	} else {
1936		old_user_ptr = db->db_user_ptr;
1937	}
1938
1939	mutex_exit(&db->db_mtx);
1940	return (old_user_ptr);
1941}
1942
1943void *
1944dmu_buf_get_user(dmu_buf_t *db_fake)
1945{
1946	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1947	ASSERT(!refcount_is_zero(&db->db_holds));
1948
1949	return (db->db_user_ptr);
1950}
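
/*
 * Usage sketch for the user-data hooks (editorial; my_data_t, my_evict and
 * mydata below are illustrative names, not part of this file):
 *
 *	static void
 *	my_evict(dmu_buf_t *db, void *user_ptr)
 *	{
 *		kmem_free(user_ptr, sizeof (my_data_t));
 *	}
 *
 *	my_data_t *mydata = kmem_zalloc(sizeof (my_data_t), KM_SLEEP);
 *	if (dmu_buf_set_user(db, mydata, NULL, my_evict) != NULL) {
 *		... lost the race: a user pointer was already attached ...
 *		kmem_free(mydata, sizeof (my_data_t));
 *	}
 *
 * dmu_buf_set_user() returns NULL on success and the currently attached
 * pointer otherwise, mirroring dmu_buf_update_user() with
 * old_user_ptr == NULL.
 */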
1951
1952boolean_t
1953dmu_buf_freeable(dmu_buf_t *dbuf)
1954{
1955	boolean_t res = B_FALSE;
1956	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1957
1958	if (db->db_blkptr)
1959		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
1960		    db->db_blkptr->blk_birth);
1961
1962	return (res);
1963}
1964
1965static void
1966dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
1967{
1968	/* ASSERT(dmu_tx_is_syncing(tx)) */
1969	ASSERT(MUTEX_HELD(&db->db_mtx));
1970
1971	if (db->db_blkptr != NULL)
1972		return;
1973
1974	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
1975		/*
1976		 * This buffer was allocated at a time when there were
1977		 * no blkptrs available from the dnode, or it was
1978		 * inappropriate to hook it in (i.e., nlevels mismatch).
1979		 */
1980		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
1981		ASSERT(db->db_parent == NULL);
1982		db->db_parent = dn->dn_dbuf;
1983		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
1984		DBUF_VERIFY(db);
1985	} else {
1986		dmu_buf_impl_t *parent = db->db_parent;
1987		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1988
1989		ASSERT(dn->dn_phys->dn_nlevels > 1);
1990		if (parent == NULL) {
1991			mutex_exit(&db->db_mtx);
1992			rw_enter(&dn->dn_struct_rwlock, RW_READER);
1993			(void) dbuf_hold_impl(dn, db->db_level+1,
1994			    db->db_blkid >> epbs, FALSE, db, &parent);
1995			rw_exit(&dn->dn_struct_rwlock);
1996			mutex_enter(&db->db_mtx);
1997			db->db_parent = parent;
1998		}
1999		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2000		    (db->db_blkid & ((1ULL << epbs) - 1));
2001		DBUF_VERIFY(db);
2002	}
2003}
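
/*
 * Worked example for the arithmetic above (editorial): with a typical
 * dn_indblkshift of 14 (16K indirect blocks) and SPA_BLKPTRSHIFT of 7
 * (128-byte block pointers), epbs == 7, so each indirect block holds
 * 1 << 7 == 128 block pointers.  A level-0 dbuf with db_blkid == 300 then
 * hangs off the level-1 dbuf with blkid 300 >> 7 == 2, and its blkptr
 * lives in slot 300 & 127 == 44 of that parent's db_data array.
 */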
2004
2005static void
2006dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2007{
2008	dmu_buf_impl_t *db = dr->dr_dbuf;
2009	dnode_t *dn = db->db_dnode;
2010	zio_t *zio;
2011
2012	ASSERT(dmu_tx_is_syncing(tx));
2013
2014	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2015
2016	mutex_enter(&db->db_mtx);
2017
2018	ASSERT(db->db_level > 0);
2019	DBUF_VERIFY(db);
2020
2021	if (db->db_buf == NULL) {
2022		mutex_exit(&db->db_mtx);
2023		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2024		mutex_enter(&db->db_mtx);
2025	}
2026	ASSERT3U(db->db_state, ==, DB_CACHED);
2027	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2028	ASSERT(db->db_buf != NULL);
2029
2030	dbuf_check_blkptr(dn, db);
2031
2032	db->db_data_pending = dr;
2033
2034	mutex_exit(&db->db_mtx);
2035	dbuf_write(dr, db->db_buf, tx);
2036
2037	zio = dr->dr_zio;
2038	mutex_enter(&dr->dt.di.dr_mtx);
2039	dbuf_sync_list(&dr->dt.di.dr_children, tx);
2040	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2041	mutex_exit(&dr->dt.di.dr_mtx);
2042	zio_nowait(zio);
2043}
2044
2045static void
2046dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2047{
2048	arc_buf_t **datap = &dr->dt.dl.dr_data;
2049	dmu_buf_impl_t *db = dr->dr_dbuf;
2050	dnode_t *dn = db->db_dnode;
2051	objset_t *os = dn->dn_objset;
2052	uint64_t txg = tx->tx_txg;
2053
2054	ASSERT(dmu_tx_is_syncing(tx));
2055
2056	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2057
2058	mutex_enter(&db->db_mtx);
2059	/*
2060	 * To be synced, we must be dirtied.  But we
2061	 * might have been freed after the dirty.
2062	 */
2063	if (db->db_state == DB_UNCACHED) {
2064		/* This buffer has been freed since it was dirtied */
2065		ASSERT(db->db.db_data == NULL);
2066	} else if (db->db_state == DB_FILL) {
2067		/* This buffer was freed and is now being re-filled */
2068		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2069	} else {
2070		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2071	}
2072	DBUF_VERIFY(db);
2073
2074	/*
2075	 * If this is a bonus buffer, simply copy the bonus data into the
2076	 * dnode.  It will be written out when the dnode is synced (and it
2077	 * will be synced, since it must have been dirty for dbuf_sync to
2078	 * be called).
2079	 */
2080	if (db->db_blkid == DB_BONUS_BLKID) {
2081		dbuf_dirty_record_t **drp;
2082
2083		ASSERT(*datap != NULL);
2084		ASSERT3U(db->db_level, ==, 0);
2085		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2086		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2087		if (*datap != db->db.db_data) {
2088			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2089			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2090		}
2091		db->db_data_pending = NULL;
2092		drp = &db->db_last_dirty;
2093		while (*drp != dr)
2094			drp = &(*drp)->dr_next;
2095		ASSERT(dr->dr_next == NULL);
2096		ASSERT(dr->dr_dbuf == db);
2097		*drp = dr->dr_next;
2098		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2099		ASSERT(db->db_dirtycnt > 0);
2100		db->db_dirtycnt -= 1;
2101		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2102		return;
2103	}
2104
2105	/*
2106	 * The dbuf_check_blkptr() call below may drop and re-acquire the
2107	 * db_mtx lock, allowing a dmu_sync operation to sneak in.  As a
2108	 * result, we must not check dr_override_state until we have
2109	 * returned from dbuf_check_blkptr().
2110	 */
2111	dbuf_check_blkptr(dn, db);
2112
2113	/*
2114	 * If this buffer is in the middle of an immediate write,
2115	 * wait for the synchronous IO to complete.
2116	 */
2117	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2118		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2119		cv_wait(&db->db_changed, &db->db_mtx);
2120		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2121	}
2122
2123	if (db->db_state != DB_NOFILL &&
2124	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2125	    refcount_count(&db->db_holds) > 1 &&
2126	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2127	    *datap == db->db_buf) {
2128		/*
2129		 * If this buffer is currently "in use" (i.e., there
2130		 * are active holds and db_data still references it),
2131		 * then make a copy before we start the write so that
2132		 * any modifications from the open txg will not leak
2133		 * into this write.
2134		 *
2135		 * NOTE: this copy does not need to be made for
2136		 * objects only modified in the syncing context (e.g.
2137		 * DMU_OT_DNODE blocks).
2138		 */
2139		int blksz = arc_buf_size(*datap);
2140		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2141		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2142		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2143	}
2144	db->db_data_pending = dr;
2145
2146	mutex_exit(&db->db_mtx);
2147
2148	dbuf_write(dr, *datap, tx);
2149
2150	ASSERT(!list_link_active(&dr->dr_dirty_node));
2151	if (dn->dn_object == DMU_META_DNODE_OBJECT)
2152		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2153	else
2154		zio_nowait(dr->dr_zio);
2155}
2156
2157void
2158dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2159{
2160	dbuf_dirty_record_t *dr;
2161
2162	while ((dr = list_head(list)) != NULL) {
2163		if (dr->dr_zio != NULL) {
2164			/*
2165			 * If we find an already initialized zio then we
2166			 * are processing the meta-dnode, and we have finished.
2167			 * The dbufs for all dnodes are put back on the list
2168			 * during processing, so that we can zio_wait()
2169			 * these IOs after initiating all child IOs.
2170			 */
2171			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2172			    DMU_META_DNODE_OBJECT);
2173			break;
2174		}
2175		list_remove(list, dr);
2176		if (dr->dr_dbuf->db_level > 0)
2177			dbuf_sync_indirect(dr, tx);
2178		else
2179			dbuf_sync_leaf(dr, tx);
2180	}
2181}
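
/*
 * Caller sketch (editorial, hedged): dnode_sync() is expected to drive this
 * over the dnode's dirty records for the syncing txg, roughly:
 *
 *	dbuf_sync_list(&dn->dn_dirty_records[tx->tx_txg & TXG_MASK], tx);
 *
 * For the meta-dnode, records whose zio was already created are left on
 * the list (the break above) so the caller can wait for those I/Os after
 * all child I/Os have been initiated.
 */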
2182
2183/* ARGSUSED */
2184static void
2185dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2186{
2187	dmu_buf_impl_t *db = vdb;
2188	blkptr_t *bp = zio->io_bp;
2189	blkptr_t *bp_orig = &zio->io_bp_orig;
2190	dnode_t *dn = db->db_dnode;
2191	spa_t *spa = zio->io_spa;
2192	int64_t delta;
2193	uint64_t fill = 0;
2194	int i;
2195
2196	ASSERT(db->db_blkptr == bp);
2197
2198	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2199	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2200	zio->io_prev_space_delta = delta;
2201
2202	if (BP_IS_HOLE(bp)) {
2203		ASSERT(bp->blk_fill == 0);
2204		return;
2205	}
2206
2207	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
2208	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2209
2210	mutex_enter(&db->db_mtx);
2211
2212	if (db->db_level == 0) {
2213		mutex_enter(&dn->dn_mtx);
2214		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
2215			dn->dn_phys->dn_maxblkid = db->db_blkid;
2216		mutex_exit(&dn->dn_mtx);
2217
2218		if (dn->dn_type == DMU_OT_DNODE) {
2219			dnode_phys_t *dnp = db->db.db_data;
2220			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2221			    i--, dnp++) {
2222				if (dnp->dn_type != DMU_OT_NONE)
2223					fill++;
2224			}
2225		} else {
2226			fill = 1;
2227		}
2228	} else {
2229		blkptr_t *ibp = db->db.db_data;
2230		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2231		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2232			if (BP_IS_HOLE(ibp))
2233				continue;
2234			fill += ibp->blk_fill;
2235		}
2236	}
2237
2238	bp->blk_fill = fill;
2239
2240	mutex_exit(&db->db_mtx);
2241}
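
/*
 * Worked example for the fill count above (editorial): a level-0 block of a
 * DMU_OT_DNODE object gets blk_fill equal to the number of allocated dnodes
 * it contains; a 16K dnode block has 16384 >> DNODE_SHIFT == 32
 * dnode_phys_t slots, so fill ranges from 0 to 32.  Any other level-0 block
 * counts as 1, and an indirect block's fill is the sum of its non-hole
 * children's blk_fill, i.e. the number of data blocks reachable below it.
 */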
2242
2243/* ARGSUSED */
2244static void
2245dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2246{
2247	dmu_buf_impl_t *db = vdb;
2248	blkptr_t *bp = zio->io_bp;
2249	blkptr_t *bp_orig = &zio->io_bp_orig;
2250	dnode_t *dn = db->db_dnode;
2251	objset_t *os = dn->dn_objset;
2252	uint64_t txg = zio->io_txg;
2253	dbuf_dirty_record_t **drp, *dr;
2254
2255	ASSERT3U(zio->io_error, ==, 0);
2256	ASSERT(db->db_blkptr == bp);
2257
2258	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
2259		ASSERT(BP_EQUAL(bp, bp_orig));
2260	} else {
2261		dsl_dataset_t *ds = os->os_dsl_dataset;
2262		dmu_tx_t *tx = os->os_synctx;
2263
2264		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2265		dsl_dataset_block_born(ds, bp, tx);
2266	}
2267
2268	mutex_enter(&db->db_mtx);
2269
2270	DBUF_VERIFY(db);
2271
2272	drp = &db->db_last_dirty;
2273	while ((dr = *drp) != db->db_data_pending)
2274		drp = &dr->dr_next;
2275	ASSERT(!list_link_active(&dr->dr_dirty_node));
2276	ASSERT(dr->dr_txg == txg);
2277	ASSERT(dr->dr_dbuf == db);
2278	ASSERT(dr->dr_next == NULL);
2279	*drp = dr->dr_next;
2280
2281	if (db->db_level == 0) {
2282		ASSERT(db->db_blkid != DB_BONUS_BLKID);
2283		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2284		if (db->db_state != DB_NOFILL) {
2285			if (dr->dt.dl.dr_data != db->db_buf)
2286				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2287				    db) == 1);
2288			else if (!arc_released(db->db_buf))
2289				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2290		}
2291	} else {
2292		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2293		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2294		if (!BP_IS_HOLE(db->db_blkptr)) {
2295			int epbs =
2296			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2297			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2298			    db->db.db_size);
2299			ASSERT3U(dn->dn_phys->dn_maxblkid
2300			    >> (db->db_level * epbs), >=, db->db_blkid);
2301			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2302		}
2303		mutex_destroy(&dr->dt.di.dr_mtx);
2304		list_destroy(&dr->dt.di.dr_children);
2305	}
2306	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2307
2308	cv_broadcast(&db->db_changed);
2309	ASSERT(db->db_dirtycnt > 0);
2310	db->db_dirtycnt -= 1;
2311	db->db_data_pending = NULL;
2312	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2313}
2314
2315static void
2316dbuf_write_nofill_ready(zio_t *zio)
2317{
2318	dbuf_write_ready(zio, NULL, zio->io_private);
2319}
2320
2321static void
2322dbuf_write_nofill_done(zio_t *zio)
2323{
2324	dbuf_write_done(zio, NULL, zio->io_private);
2325}
2326
2327static void
2328dbuf_write_override_ready(zio_t *zio)
2329{
2330	dbuf_dirty_record_t *dr = zio->io_private;
2331	dmu_buf_impl_t *db = dr->dr_dbuf;
2332
2333	dbuf_write_ready(zio, NULL, db);
2334}
2335
2336static void
2337dbuf_write_override_done(zio_t *zio)
2338{
2339	dbuf_dirty_record_t *dr = zio->io_private;
2340	dmu_buf_impl_t *db = dr->dr_dbuf;
2341	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2342
2343	mutex_enter(&db->db_mtx);
2344	if (!BP_EQUAL(zio->io_bp, obp)) {
2345		if (!BP_IS_HOLE(obp))
2346			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2347		arc_release(dr->dt.dl.dr_data, db);
2348	}
2349	mutex_exit(&db->db_mtx);
2350
2351	dbuf_write_done(zio, NULL, db);
2352}
2353
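/*
 * Overview (editorial): dbuf_write() stages a dirty record for I/O using
 * one of three paths: an "override" write, when dmu_sync() has already
 * written the data and recorded it in dr_overridden_by (zio_write() plus
 * zio_write_override()); a NOFILL write, which updates block pointers
 * without writing data (ZIO_FLAG_NODATA); or the common case, an
 * arc_write() of the dbuf's ARC buffer.  In all three cases dr_zio is
 * created as a child of either the parent dbuf's pending zio or the
 * dnode's dn_zio, so a parent's I/O completes only after its children's.
 */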
2354static void
2355dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2356{
2357	dmu_buf_impl_t *db = dr->dr_dbuf;
2358	dnode_t *dn = db->db_dnode;
2359	objset_t *os = dn->dn_objset;
2360	dmu_buf_impl_t *parent = db->db_parent;
2361	uint64_t txg = tx->tx_txg;
2362	zbookmark_t zb;
2363	zio_prop_t zp;
2364	zio_t *zio;
2365
2366	if (db->db_state != DB_NOFILL) {
2367		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2368			/*
2369			 * Private object buffers are released here rather
2370			 * than in dbuf_dirty() since they are only modified
2371			 * in the syncing context and we don't want the
2372			 * overhead of making multiple copies of the data.
2373			 */
2374			if (BP_IS_HOLE(db->db_blkptr)) {
2375				arc_buf_thaw(data);
2376			} else {
2377				arc_release(data, db);
2378			}
2379		}
2380	}
2381
2382	if (parent != dn->dn_dbuf) {
2383		ASSERT(parent && parent->db_data_pending);
2384		ASSERT(db->db_level == parent->db_level-1);
2385		ASSERT(arc_released(parent->db_buf));
2386		zio = parent->db_data_pending->dr_zio;
2387	} else {
2388		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
2389		ASSERT3P(db->db_blkptr, ==,
2390		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2391		zio = dn->dn_zio;
2392	}
2393
2394	ASSERT(db->db_level == 0 || data == db->db_buf);
2395	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2396	ASSERT(zio);
2397
2398	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2399	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2400	    db->db.db_object, db->db_level, db->db_blkid);
2401
2402	dmu_write_policy(os, dn, db->db_level,
2403	    db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
2404
2405	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2406		ASSERT(db->db_state != DB_NOFILL);
2407		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2408		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2409		    dbuf_write_override_ready, dbuf_write_override_done, dr,
2410		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2411		mutex_enter(&db->db_mtx);
2412		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2413		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2414		    dr->dt.dl.dr_copies);
2415		mutex_exit(&db->db_mtx);
2416	} else if (db->db_state == DB_NOFILL) {
2417		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2418		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2419		    db->db_blkptr, NULL, db->db.db_size, &zp,
2420		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2421		    ZIO_PRIORITY_ASYNC_WRITE,
2422		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2423	} else {
2424		ASSERT(arc_released(data));
2425		dr->dr_zio = arc_write(zio, os->os_spa, txg,
2426		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
2427		    dbuf_write_ready, dbuf_write_done, db,
2428		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2429	}
2430}
2431