dbuf.c revision 168696
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22168404Spjd * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23168404Spjd * Use is subject to license terms.
24168404Spjd */
25168404Spjd
26168404Spjd#pragma ident	"%Z%%M%	%I%	%E% SMI"
27168404Spjd
28168404Spjd#include <sys/zfs_context.h>
29168404Spjd#include <sys/dmu.h>
30168404Spjd#include <sys/dmu_impl.h>
31168404Spjd#include <sys/dbuf.h>
32168404Spjd#include <sys/dmu_objset.h>
33168404Spjd#include <sys/dsl_dataset.h>
34168404Spjd#include <sys/dsl_dir.h>
35168404Spjd#include <sys/dmu_tx.h>
36168404Spjd#include <sys/spa.h>
37168404Spjd#include <sys/zio.h>
38168404Spjd#include <sys/dmu_zfetch.h>
39168404Spjd
40168404Spjdstatic void dbuf_destroy(dmu_buf_impl_t *db);
41168404Spjdstatic int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
42168404Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
43168404Spjd    int compress, dmu_tx_t *tx);
44168404Spjdstatic arc_done_func_t dbuf_write_ready;
45168404Spjdstatic arc_done_func_t dbuf_write_done;
46168404Spjd
47168404Spjdint zfs_mdcomp_disable = 0;
48168404SpjdSYSCTL_DECL(_vfs_zfs);
49168404SpjdTUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
50168404SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
51168404Spjd    &zfs_mdcomp_disable, 0, "Disable metadata compression");
52168404Spjd
53168404Spjd/*
54168404Spjd * Global data structures and functions for the dbuf cache.
55168404Spjd */
56168404Spjdstatic kmem_cache_t *dbuf_cache;
57168404Spjd
58168404Spjd/* ARGSUSED */
59168404Spjdstatic int
60168404Spjddbuf_cons(void *vdb, void *unused, int kmflag)
61168404Spjd{
62168404Spjd	dmu_buf_impl_t *db = vdb;
63168404Spjd	bzero(db, sizeof (dmu_buf_impl_t));
64168404Spjd
65168404Spjd	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
66168404Spjd	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
67168404Spjd	refcount_create(&db->db_holds);
68168404Spjd	return (0);
69168404Spjd}
70168404Spjd
71168404Spjd/* ARGSUSED */
72168404Spjdstatic void
73168404Spjddbuf_dest(void *vdb, void *unused)
74168404Spjd{
75168404Spjd	dmu_buf_impl_t *db = vdb;
76168404Spjd	mutex_destroy(&db->db_mtx);
77168404Spjd	cv_destroy(&db->db_changed);
78168404Spjd	refcount_destroy(&db->db_holds);
79168404Spjd}
80168404Spjd
81168404Spjd/*
82168404Spjd * dbuf hash table routines
83168404Spjd */
84168404Spjdstatic dbuf_hash_table_t dbuf_hash_table;
85168404Spjd
86168404Spjdstatic uint64_t dbuf_hash_count;
87168404Spjd
88168404Spjdstatic uint64_t
89168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
90168404Spjd{
91168404Spjd	uintptr_t osv = (uintptr_t)os;
92168404Spjd	uint64_t crc = -1ULL;
93168404Spjd
94168404Spjd	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
95168404Spjd	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
96168404Spjd	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
97168404Spjd	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
98168404Spjd	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
99168404Spjd	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
100168404Spjd	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
101168404Spjd
102168404Spjd	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
103168404Spjd
104168404Spjd	return (crc);
105168404Spjd}
106168404Spjd
107168404Spjd#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
108168404Spjd
109168404Spjd#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
110168404Spjd	((dbuf)->db.db_object == (obj) &&		\
111168404Spjd	(dbuf)->db_objset == (os) &&			\
112168404Spjd	(dbuf)->db_level == (level) &&			\
113168404Spjd	(dbuf)->db_blkid == (blkid))
114168404Spjd
115168404Spjddmu_buf_impl_t *
116168404Spjddbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
117168404Spjd{
118168404Spjd	dbuf_hash_table_t *h = &dbuf_hash_table;
119168404Spjd	objset_impl_t *os = dn->dn_objset;
120168404Spjd	uint64_t obj = dn->dn_object;
121168404Spjd	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
122168404Spjd	uint64_t idx = hv & h->hash_table_mask;
123168404Spjd	dmu_buf_impl_t *db;
124168404Spjd
125168404Spjd	mutex_enter(DBUF_HASH_MUTEX(h, idx));
126168404Spjd	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
127168404Spjd		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
128168404Spjd			mutex_enter(&db->db_mtx);
129168404Spjd			if (db->db_state != DB_EVICTING) {
130168404Spjd				mutex_exit(DBUF_HASH_MUTEX(h, idx));
131168404Spjd				return (db);
132168404Spjd			}
133168404Spjd			mutex_exit(&db->db_mtx);
134168404Spjd		}
135168404Spjd	}
136168404Spjd	mutex_exit(DBUF_HASH_MUTEX(h, idx));
137168404Spjd	return (NULL);
138168404Spjd}
139168404Spjd
140168404Spjd/*
141168404Spjd * Insert an entry into the hash table.  If there is already an element
142168404Spjd * equal to elem in the hash table, then the already existing element
143168404Spjd * will be returned and the new element will not be inserted.
144168404Spjd * Otherwise returns NULL.
145168404Spjd */
146168404Spjdstatic dmu_buf_impl_t *
147168404Spjddbuf_hash_insert(dmu_buf_impl_t *db)
148168404Spjd{
149168404Spjd	dbuf_hash_table_t *h = &dbuf_hash_table;
150168404Spjd	objset_impl_t *os = db->db_objset;
151168404Spjd	uint64_t obj = db->db.db_object;
152168404Spjd	int level = db->db_level;
153168404Spjd	uint64_t blkid = db->db_blkid;
154168404Spjd	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
155168404Spjd	uint64_t idx = hv & h->hash_table_mask;
156168404Spjd	dmu_buf_impl_t *dbf;
157168404Spjd
158168404Spjd	mutex_enter(DBUF_HASH_MUTEX(h, idx));
159168404Spjd	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
160168404Spjd		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
161168404Spjd			mutex_enter(&dbf->db_mtx);
162168404Spjd			if (dbf->db_state != DB_EVICTING) {
163168404Spjd				mutex_exit(DBUF_HASH_MUTEX(h, idx));
164168404Spjd				return (dbf);
165168404Spjd			}
166168404Spjd			mutex_exit(&dbf->db_mtx);
167168404Spjd		}
168168404Spjd	}
169168404Spjd
170168404Spjd	mutex_enter(&db->db_mtx);
171168404Spjd	db->db_hash_next = h->hash_table[idx];
172168404Spjd	h->hash_table[idx] = db;
173168404Spjd	mutex_exit(DBUF_HASH_MUTEX(h, idx));
174168404Spjd	atomic_add_64(&dbuf_hash_count, 1);
175168404Spjd
176168404Spjd	return (NULL);
177168404Spjd}
178168404Spjd
179168404Spjd/*
180168404Spjd * Remove an entry from the hash table.  This operation will
181168404Spjd * fail if there are any existing holds on the db.
182168404Spjd */
183168404Spjdstatic void
184168404Spjddbuf_hash_remove(dmu_buf_impl_t *db)
185168404Spjd{
186168404Spjd	dbuf_hash_table_t *h = &dbuf_hash_table;
187168404Spjd	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
188168404Spjd	    db->db_level, db->db_blkid);
189168404Spjd	uint64_t idx = hv & h->hash_table_mask;
190168404Spjd	dmu_buf_impl_t *dbf, **dbp;
191168404Spjd
192168404Spjd	/*
193168404Spjd	 * We musn't hold db_mtx to maintin lock ordering:
194168404Spjd	 * DBUF_HASH_MUTEX > db_mtx.
195168404Spjd	 */
196168404Spjd	ASSERT(refcount_is_zero(&db->db_holds));
197168404Spjd	ASSERT(db->db_state == DB_EVICTING);
198168404Spjd	ASSERT(!MUTEX_HELD(&db->db_mtx));
199168404Spjd
200168404Spjd	mutex_enter(DBUF_HASH_MUTEX(h, idx));
201168404Spjd	dbp = &h->hash_table[idx];
202168404Spjd	while ((dbf = *dbp) != db) {
203168404Spjd		dbp = &dbf->db_hash_next;
204168404Spjd		ASSERT(dbf != NULL);
205168404Spjd	}
206168404Spjd	*dbp = db->db_hash_next;
207168404Spjd	db->db_hash_next = NULL;
208168404Spjd	mutex_exit(DBUF_HASH_MUTEX(h, idx));
209168404Spjd	atomic_add_64(&dbuf_hash_count, -1);
210168404Spjd}
211168404Spjd
212168404Spjdstatic arc_evict_func_t dbuf_do_evict;
213168404Spjd
214168404Spjdstatic void
215168404Spjddbuf_evict_user(dmu_buf_impl_t *db)
216168404Spjd{
217168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
218168404Spjd
219168404Spjd	if (db->db_level != 0 || db->db_evict_func == NULL)
220168404Spjd		return;
221168404Spjd
222168404Spjd	if (db->db_user_data_ptr_ptr)
223168404Spjd		*db->db_user_data_ptr_ptr = db->db.db_data;
224168404Spjd	db->db_evict_func(&db->db, db->db_user_ptr);
225168404Spjd	db->db_user_ptr = NULL;
226168404Spjd	db->db_user_data_ptr_ptr = NULL;
227168404Spjd	db->db_evict_func = NULL;
228168404Spjd}
229168404Spjd
230168404Spjdvoid
231168404Spjddbuf_evict(dmu_buf_impl_t *db)
232168404Spjd{
233168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
234168404Spjd	ASSERT(db->db_buf == NULL);
235168404Spjd	ASSERT(db->db_data_pending == NULL);
236168404Spjd
237168404Spjd	dbuf_clear(db);
238168404Spjd	dbuf_destroy(db);
239168404Spjd}
240168404Spjd
241168404Spjdvoid
242168404Spjddbuf_init(void)
243168404Spjd{
244168404Spjd	uint64_t hsize = 1ULL << 16;
245168404Spjd	dbuf_hash_table_t *h = &dbuf_hash_table;
246168404Spjd	int i;
247168404Spjd
248168404Spjd	/*
249168404Spjd	 * The hash table is big enough to fill all of physical memory
250168404Spjd	 * with an average 4K block size.  The table will take up
251168404Spjd	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
252168404Spjd	 */
253168696Spjd	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
254168404Spjd		hsize <<= 1;
255168404Spjd
256168404Spjdretry:
257168404Spjd	h->hash_table_mask = hsize - 1;
258168404Spjd	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
259168404Spjd	if (h->hash_table == NULL) {
260168404Spjd		/* XXX - we should really return an error instead of assert */
261168404Spjd		ASSERT(hsize > (1ULL << 10));
262168404Spjd		hsize >>= 1;
263168404Spjd		goto retry;
264168404Spjd	}
265168404Spjd
266168404Spjd	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
267168404Spjd	    sizeof (dmu_buf_impl_t),
268168404Spjd	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
269168404Spjd
270168404Spjd	for (i = 0; i < DBUF_MUTEXES; i++)
271168404Spjd		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
272168404Spjd}
273168404Spjd
274168404Spjdvoid
275168404Spjddbuf_fini(void)
276168404Spjd{
277168404Spjd	dbuf_hash_table_t *h = &dbuf_hash_table;
278168404Spjd	int i;
279168404Spjd
280168404Spjd	for (i = 0; i < DBUF_MUTEXES; i++)
281168404Spjd		mutex_destroy(&h->hash_mutexes[i]);
282168404Spjd	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
283168404Spjd	kmem_cache_destroy(dbuf_cache);
284168404Spjd}
285168404Spjd
286168404Spjd/*
287168404Spjd * Other stuff.
288168404Spjd */
289168404Spjd
290168404Spjd#ifdef ZFS_DEBUG
291168404Spjdstatic void
292168404Spjddbuf_verify(dmu_buf_impl_t *db)
293168404Spjd{
294168404Spjd	dnode_t *dn = db->db_dnode;
295168404Spjd
296168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
297168404Spjd
298168404Spjd	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
299168404Spjd		return;
300168404Spjd
301168404Spjd	ASSERT(db->db_objset != NULL);
302168404Spjd	if (dn == NULL) {
303168404Spjd		ASSERT(db->db_parent == NULL);
304168404Spjd		ASSERT(db->db_blkptr == NULL);
305168404Spjd	} else {
306168404Spjd		ASSERT3U(db->db.db_object, ==, dn->dn_object);
307168404Spjd		ASSERT3P(db->db_objset, ==, dn->dn_objset);
308168404Spjd		ASSERT3U(db->db_level, <, dn->dn_nlevels);
309168404Spjd		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
310168404Spjd		    list_head(&dn->dn_dbufs));
311168404Spjd	}
312168404Spjd	if (db->db_blkid == DB_BONUS_BLKID) {
313168404Spjd		ASSERT(dn != NULL);
314168404Spjd		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
315168404Spjd		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
316168404Spjd	} else {
317168404Spjd		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
318168404Spjd	}
319168404Spjd
320168404Spjd	if (db->db_level == 0) {
321168404Spjd		/* we can be momentarily larger in dnode_set_blksz() */
322168404Spjd		if (db->db_blkid != DB_BONUS_BLKID && dn) {
323168404Spjd			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
324168404Spjd		}
325168404Spjd		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
326168404Spjd			dbuf_dirty_record_t *dr = db->db_data_pending;
327168404Spjd			/*
328168404Spjd			 * it should only be modified in syncing
329168404Spjd			 * context, so make sure we only have
330168404Spjd			 * one copy of the data.
331168404Spjd			 */
332168404Spjd			ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
333168404Spjd		}
334168404Spjd	}
335168404Spjd
336168404Spjd	/* verify db->db_blkptr */
337168404Spjd	if (db->db_blkptr) {
338168404Spjd		if (db->db_parent == dn->dn_dbuf) {
339168404Spjd			/* db is pointed to by the dnode */
340168404Spjd			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
341168404Spjd			if (db->db.db_object == DMU_META_DNODE_OBJECT)
342168404Spjd				ASSERT(db->db_parent == NULL);
343168404Spjd			else
344168404Spjd				ASSERT(db->db_parent != NULL);
345168404Spjd			ASSERT3P(db->db_blkptr, ==,
346168404Spjd			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
347168404Spjd		} else {
348168404Spjd			/* db is pointed to by an indirect block */
349168404Spjd			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
350168404Spjd			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
351168404Spjd			ASSERT3U(db->db_parent->db.db_object, ==,
352168404Spjd			    db->db.db_object);
353168404Spjd			/*
354168404Spjd			 * dnode_grow_indblksz() can make this fail if we don't
355168404Spjd			 * have the struct_rwlock.  XXX indblksz no longer
356168404Spjd			 * grows.  safe to do this now?
357168404Spjd			 */
358168404Spjd			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
359168404Spjd				ASSERT3P(db->db_blkptr, ==,
360168404Spjd				    ((blkptr_t *)db->db_parent->db.db_data +
361168404Spjd				    db->db_blkid % epb));
362168404Spjd			}
363168404Spjd		}
364168404Spjd	}
365168404Spjd	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
366168404Spjd	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
367168404Spjd	    db->db_state != DB_FILL && !dn->dn_free_txg) {
368168404Spjd		/*
369168404Spjd		 * If the blkptr isn't set but they have nonzero data,
370168404Spjd		 * it had better be dirty, otherwise we'll lose that
371168404Spjd		 * data when we evict this buffer.
372168404Spjd		 */
373168404Spjd		if (db->db_dirtycnt == 0) {
374168404Spjd			uint64_t *buf = db->db.db_data;
375168404Spjd			int i;
376168404Spjd
377168404Spjd			for (i = 0; i < db->db.db_size >> 3; i++) {
378168404Spjd				ASSERT(buf[i] == 0);
379168404Spjd			}
380168404Spjd		}
381168404Spjd	}
382168404Spjd}
383168404Spjd#endif
384168404Spjd
385168404Spjdstatic void
386168404Spjddbuf_update_data(dmu_buf_impl_t *db)
387168404Spjd{
388168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
389168404Spjd	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
390168404Spjd		ASSERT(!refcount_is_zero(&db->db_holds));
391168404Spjd		*db->db_user_data_ptr_ptr = db->db.db_data;
392168404Spjd	}
393168404Spjd}
394168404Spjd
395168404Spjdstatic void
396168404Spjddbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
397168404Spjd{
398168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
399168404Spjd	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
400168404Spjd	db->db_buf = buf;
401168404Spjd	if (buf != NULL) {
402168404Spjd		ASSERT(buf->b_data != NULL);
403168404Spjd		db->db.db_data = buf->b_data;
404168404Spjd		if (!arc_released(buf))
405168404Spjd			arc_set_callback(buf, dbuf_do_evict, db);
406168404Spjd		dbuf_update_data(db);
407168404Spjd	} else {
408168404Spjd		dbuf_evict_user(db);
409168404Spjd		db->db.db_data = NULL;
410168404Spjd		db->db_state = DB_UNCACHED;
411168404Spjd	}
412168404Spjd}
413168404Spjd
414168404Spjduint64_t
415168404Spjddbuf_whichblock(dnode_t *dn, uint64_t offset)
416168404Spjd{
417168404Spjd	if (dn->dn_datablkshift) {
418168404Spjd		return (offset >> dn->dn_datablkshift);
419168404Spjd	} else {
420168404Spjd		ASSERT3U(offset, <, dn->dn_datablksz);
421168404Spjd		return (0);
422168404Spjd	}
423168404Spjd}
424168404Spjd
425168404Spjdstatic void
426168404Spjddbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
427168404Spjd{
428168404Spjd	dmu_buf_impl_t *db = vdb;
429168404Spjd
430168404Spjd	mutex_enter(&db->db_mtx);
431168404Spjd	ASSERT3U(db->db_state, ==, DB_READ);
432168404Spjd	/*
433168404Spjd	 * All reads are synchronous, so we must have a hold on the dbuf
434168404Spjd	 */
435168404Spjd	ASSERT(refcount_count(&db->db_holds) > 0);
436168404Spjd	ASSERT(db->db_buf == NULL);
437168404Spjd	ASSERT(db->db.db_data == NULL);
438168404Spjd	if (db->db_level == 0 && db->db_freed_in_flight) {
439168404Spjd		/* we were freed in flight; disregard any error */
440168404Spjd		arc_release(buf, db);
441168404Spjd		bzero(buf->b_data, db->db.db_size);
442168404Spjd		arc_buf_freeze(buf);
443168404Spjd		db->db_freed_in_flight = FALSE;
444168404Spjd		dbuf_set_data(db, buf);
445168404Spjd		db->db_state = DB_CACHED;
446168404Spjd	} else if (zio == NULL || zio->io_error == 0) {
447168404Spjd		dbuf_set_data(db, buf);
448168404Spjd		db->db_state = DB_CACHED;
449168404Spjd	} else {
450168404Spjd		ASSERT(db->db_blkid != DB_BONUS_BLKID);
451168404Spjd		ASSERT3P(db->db_buf, ==, NULL);
452168404Spjd		VERIFY(arc_buf_remove_ref(buf, db) == 1);
453168404Spjd		db->db_state = DB_UNCACHED;
454168404Spjd	}
455168404Spjd	cv_broadcast(&db->db_changed);
456168404Spjd	mutex_exit(&db->db_mtx);
457168404Spjd	dbuf_rele(db, NULL);
458168404Spjd}
459168404Spjd
460168404Spjdstatic void
461168404Spjddbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
462168404Spjd{
463168404Spjd	blkptr_t *bp;
464168404Spjd	zbookmark_t zb;
465168404Spjd	uint32_t aflags = ARC_NOWAIT;
466168404Spjd
467168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
468168404Spjd	/* We need the struct_rwlock to prevent db_blkptr from changing. */
469168404Spjd	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
470168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
471168404Spjd	ASSERT(db->db_state == DB_UNCACHED);
472168404Spjd	ASSERT(db->db_buf == NULL);
473168404Spjd
474168404Spjd	if (db->db_blkid == DB_BONUS_BLKID) {
475168404Spjd		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
476168404Spjd		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
477168404Spjd		if (db->db.db_size < DN_MAX_BONUSLEN)
478168404Spjd			bzero(db->db.db_data, DN_MAX_BONUSLEN);
479168404Spjd		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
480168404Spjd		    db->db.db_size);
481168404Spjd		dbuf_update_data(db);
482168404Spjd		db->db_state = DB_CACHED;
483168404Spjd		mutex_exit(&db->db_mtx);
484168404Spjd		return;
485168404Spjd	}
486168404Spjd
487168404Spjd	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
488168404Spjd		bp = NULL;
489168404Spjd	else
490168404Spjd		bp = db->db_blkptr;
491168404Spjd
492168404Spjd	if (bp == NULL)
493168404Spjd		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
494168404Spjd	else
495168404Spjd		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
496168404Spjd
497168404Spjd	if (bp == NULL || BP_IS_HOLE(bp)) {
498168404Spjd		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
499168404Spjd
500168404Spjd		ASSERT(bp == NULL || BP_IS_HOLE(bp));
501168404Spjd		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
502168404Spjd		    db->db.db_size, db, type));
503168404Spjd		bzero(db->db.db_data, db->db.db_size);
504168404Spjd		db->db_state = DB_CACHED;
505168404Spjd		*flags |= DB_RF_CACHED;
506168404Spjd		mutex_exit(&db->db_mtx);
507168404Spjd		return;
508168404Spjd	}
509168404Spjd
510168404Spjd	db->db_state = DB_READ;
511168404Spjd	mutex_exit(&db->db_mtx);
512168404Spjd
513168404Spjd	zb.zb_objset = db->db_objset->os_dsl_dataset ?
514168404Spjd	    db->db_objset->os_dsl_dataset->ds_object : 0;
515168404Spjd	zb.zb_object = db->db.db_object;
516168404Spjd	zb.zb_level = db->db_level;
517168404Spjd	zb.zb_blkid = db->db_blkid;
518168404Spjd
519168404Spjd	dbuf_add_ref(db, NULL);
520168404Spjd	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
521168404Spjd	ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
522168404Spjd	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
523168404Spjd	    db->db_level > 0 ? byteswap_uint64_array :
524168404Spjd	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
525168404Spjd	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
526168404Spjd	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
527168404Spjd	    &aflags, &zb);
528168404Spjd	if (aflags & ARC_CACHED)
529168404Spjd		*flags |= DB_RF_CACHED;
530168404Spjd}
531168404Spjd
532168404Spjdint
533168404Spjddbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
534168404Spjd{
535168404Spjd	int err = 0;
536168404Spjd	int havepzio = (zio != NULL);
537168404Spjd	int prefetch;
538168404Spjd
539168404Spjd	/*
540168404Spjd	 * We don't have to hold the mutex to check db_state because it
541168404Spjd	 * can't be freed while we have a hold on the buffer.
542168404Spjd	 */
543168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
544168404Spjd
545168404Spjd	if ((flags & DB_RF_HAVESTRUCT) == 0)
546168404Spjd		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
547168404Spjd
548168404Spjd	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
549168404Spjd	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
550168404Spjd
551168404Spjd	mutex_enter(&db->db_mtx);
552168404Spjd	if (db->db_state == DB_CACHED) {
553168404Spjd		mutex_exit(&db->db_mtx);
554168404Spjd		if (prefetch)
555168404Spjd			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
556168404Spjd			    db->db.db_size, TRUE);
557168404Spjd		if ((flags & DB_RF_HAVESTRUCT) == 0)
558168404Spjd			rw_exit(&db->db_dnode->dn_struct_rwlock);
559168404Spjd	} else if (db->db_state == DB_UNCACHED) {
560168404Spjd		if (zio == NULL) {
561168404Spjd			zio = zio_root(db->db_dnode->dn_objset->os_spa,
562168404Spjd			    NULL, NULL, ZIO_FLAG_CANFAIL);
563168404Spjd		}
564168404Spjd		dbuf_read_impl(db, zio, &flags);
565168404Spjd
566168404Spjd		/* dbuf_read_impl has dropped db_mtx for us */
567168404Spjd
568168404Spjd		if (prefetch)
569168404Spjd			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
570168404Spjd			    db->db.db_size, flags & DB_RF_CACHED);
571168404Spjd
572168404Spjd		if ((flags & DB_RF_HAVESTRUCT) == 0)
573168404Spjd			rw_exit(&db->db_dnode->dn_struct_rwlock);
574168404Spjd
575168404Spjd		if (!havepzio)
576168404Spjd			err = zio_wait(zio);
577168404Spjd	} else {
578168404Spjd		mutex_exit(&db->db_mtx);
579168404Spjd		if (prefetch)
580168404Spjd			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
581168404Spjd			    db->db.db_size, TRUE);
582168404Spjd		if ((flags & DB_RF_HAVESTRUCT) == 0)
583168404Spjd			rw_exit(&db->db_dnode->dn_struct_rwlock);
584168404Spjd
585168404Spjd		mutex_enter(&db->db_mtx);
586168404Spjd		if ((flags & DB_RF_NEVERWAIT) == 0) {
587168404Spjd			while (db->db_state == DB_READ ||
588168404Spjd			    db->db_state == DB_FILL) {
589168404Spjd				ASSERT(db->db_state == DB_READ ||
590168404Spjd				    (flags & DB_RF_HAVESTRUCT) == 0);
591168404Spjd				cv_wait(&db->db_changed, &db->db_mtx);
592168404Spjd			}
593168404Spjd			if (db->db_state == DB_UNCACHED)
594168404Spjd				err = EIO;
595168404Spjd		}
596168404Spjd		mutex_exit(&db->db_mtx);
597168404Spjd	}
598168404Spjd
599168404Spjd	ASSERT(err || havepzio || db->db_state == DB_CACHED);
600168404Spjd	return (err);
601168404Spjd}
602168404Spjd
603168404Spjdstatic void
604168404Spjddbuf_noread(dmu_buf_impl_t *db)
605168404Spjd{
606168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
607168404Spjd	ASSERT(db->db_blkid != DB_BONUS_BLKID);
608168404Spjd	mutex_enter(&db->db_mtx);
609168404Spjd	while (db->db_state == DB_READ || db->db_state == DB_FILL)
610168404Spjd		cv_wait(&db->db_changed, &db->db_mtx);
611168404Spjd	if (db->db_state == DB_UNCACHED) {
612168404Spjd		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
613168404Spjd
614168404Spjd		ASSERT(db->db_buf == NULL);
615168404Spjd		ASSERT(db->db.db_data == NULL);
616168404Spjd		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
617168404Spjd		    db->db.db_size, db, type));
618168404Spjd		db->db_state = DB_FILL;
619168404Spjd	} else {
620168404Spjd		ASSERT3U(db->db_state, ==, DB_CACHED);
621168404Spjd	}
622168404Spjd	mutex_exit(&db->db_mtx);
623168404Spjd}
624168404Spjd
625168404Spjd/*
626168404Spjd * This is our just-in-time copy function.  It makes a copy of
627168404Spjd * buffers, that have been modified in a previous transaction
628168404Spjd * group, before we modify them in the current active group.
629168404Spjd *
630168404Spjd * This function is used in two places: when we are dirtying a
631168404Spjd * buffer for the first time in a txg, and when we are freeing
632168404Spjd * a range in a dnode that includes this buffer.
633168404Spjd *
634168404Spjd * Note that when we are called from dbuf_free_range() we do
635168404Spjd * not put a hold on the buffer, we just traverse the active
636168404Spjd * dbuf list for the dnode.
637168404Spjd */
638168404Spjdstatic void
639168404Spjddbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
640168404Spjd{
641168404Spjd	dbuf_dirty_record_t *dr = db->db_last_dirty;
642168404Spjd
643168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
644168404Spjd	ASSERT(db->db.db_data != NULL);
645168404Spjd	ASSERT(db->db_level == 0);
646168404Spjd	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
647168404Spjd
648168404Spjd	if (dr == NULL ||
649168404Spjd	    (dr->dt.dl.dr_data !=
650168404Spjd	    ((db->db_blkid  == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
651168404Spjd		return;
652168404Spjd
653168404Spjd	/*
654168404Spjd	 * If the last dirty record for this dbuf has not yet synced
655168404Spjd	 * and its referencing the dbuf data, either:
656168404Spjd	 * 	reset the reference to point to a new copy,
657168404Spjd	 * or (if there a no active holders)
658168404Spjd	 *	just null out the current db_data pointer.
659168404Spjd	 */
660168404Spjd	ASSERT(dr->dr_txg >= txg - 2);
661168404Spjd	if (db->db_blkid == DB_BONUS_BLKID) {
662168404Spjd		/* Note that the data bufs here are zio_bufs */
663168404Spjd		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
664168404Spjd		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
665168404Spjd	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
666168404Spjd		int size = db->db.db_size;
667168404Spjd		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
668168404Spjd		dr->dt.dl.dr_data = arc_buf_alloc(
669168404Spjd		    db->db_dnode->dn_objset->os_spa, size, db, type);
670168404Spjd		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
671168404Spjd	} else {
672168404Spjd		dbuf_set_data(db, NULL);
673168404Spjd	}
674168404Spjd}
675168404Spjd
676168404Spjdvoid
677168404Spjddbuf_unoverride(dbuf_dirty_record_t *dr)
678168404Spjd{
679168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
680168404Spjd	uint64_t txg = dr->dr_txg;
681168404Spjd
682168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
683168404Spjd	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
684168404Spjd	ASSERT(db->db_level == 0);
685168404Spjd
686168404Spjd	if (db->db_blkid == DB_BONUS_BLKID ||
687168404Spjd	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
688168404Spjd		return;
689168404Spjd
690168404Spjd	/* free this block */
691168404Spjd	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
692168404Spjd		/* XXX can get silent EIO here */
693168404Spjd		(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
694168404Spjd		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
695168404Spjd	}
696168404Spjd	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
697168404Spjd	/*
698168404Spjd	 * Release the already-written buffer, so we leave it in
699168404Spjd	 * a consistent dirty state.  Note that all callers are
700168404Spjd	 * modifying the buffer, so they will immediately do
701168404Spjd	 * another (redundant) arc_release().  Therefore, leave
702168404Spjd	 * the buf thawed to save the effort of freezing &
703168404Spjd	 * immediately re-thawing it.
704168404Spjd	 */
705168404Spjd	arc_release(dr->dt.dl.dr_data, db);
706168404Spjd}
707168404Spjd
708168404Spjdvoid
709168404Spjddbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
710168404Spjd{
711168404Spjd	dmu_buf_impl_t *db, *db_next;
712168404Spjd	uint64_t txg = tx->tx_txg;
713168404Spjd
714168404Spjd	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
715168404Spjd	mutex_enter(&dn->dn_dbufs_mtx);
716168404Spjd	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
717168404Spjd		db_next = list_next(&dn->dn_dbufs, db);
718168404Spjd		ASSERT(db->db_blkid != DB_BONUS_BLKID);
719168404Spjd		if (db->db_level != 0)
720168404Spjd			continue;
721168404Spjd		dprintf_dbuf(db, "found buf %s\n", "");
722168404Spjd		if (db->db_blkid < blkid ||
723168404Spjd		    db->db_blkid >= blkid+nblks)
724168404Spjd			continue;
725168404Spjd
726168404Spjd		/* found a level 0 buffer in the range */
727168404Spjd		if (dbuf_undirty(db, tx))
728168404Spjd			continue;
729168404Spjd
730168404Spjd		mutex_enter(&db->db_mtx);
731168404Spjd		if (db->db_state == DB_UNCACHED ||
732168404Spjd		    db->db_state == DB_EVICTING) {
733168404Spjd			ASSERT(db->db.db_data == NULL);
734168404Spjd			mutex_exit(&db->db_mtx);
735168404Spjd			continue;
736168404Spjd		}
737168404Spjd		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
738168404Spjd			/* will be handled in dbuf_read_done or dbuf_rele */
739168404Spjd			db->db_freed_in_flight = TRUE;
740168404Spjd			mutex_exit(&db->db_mtx);
741168404Spjd			continue;
742168404Spjd		}
743168404Spjd		if (refcount_count(&db->db_holds) == 0) {
744168404Spjd			ASSERT(db->db_buf);
745168404Spjd			dbuf_clear(db);
746168404Spjd			continue;
747168404Spjd		}
748168404Spjd		/* The dbuf is referenced */
749168404Spjd
750168404Spjd		if (db->db_last_dirty != NULL) {
751168404Spjd			dbuf_dirty_record_t *dr = db->db_last_dirty;
752168404Spjd
753168404Spjd			if (dr->dr_txg == txg) {
754168404Spjd				/*
755168404Spjd				 * This buffer is "in-use", re-adjust the file
756168404Spjd				 * size to reflect that this buffer may
757168404Spjd				 * contain new data when we sync.
758168404Spjd				 */
759168404Spjd				if (db->db_blkid > dn->dn_maxblkid)
760168404Spjd					dn->dn_maxblkid = db->db_blkid;
761168404Spjd				dbuf_unoverride(dr);
762168404Spjd			} else {
763168404Spjd				/*
764168404Spjd				 * This dbuf is not dirty in the open context.
765168404Spjd				 * Either uncache it (if its not referenced in
766168404Spjd				 * the open context) or reset its contents to
767168404Spjd				 * empty.
768168404Spjd				 */
769168404Spjd				dbuf_fix_old_data(db, txg);
770168404Spjd			}
771168404Spjd		}
772168404Spjd		/* clear the contents if its cached */
773168404Spjd		if (db->db_state == DB_CACHED) {
774168404Spjd			ASSERT(db->db.db_data != NULL);
775168404Spjd			arc_release(db->db_buf, db);
776168404Spjd			bzero(db->db.db_data, db->db.db_size);
777168404Spjd			arc_buf_freeze(db->db_buf);
778168404Spjd		}
779168404Spjd
780168404Spjd		mutex_exit(&db->db_mtx);
781168404Spjd	}
782168404Spjd	mutex_exit(&dn->dn_dbufs_mtx);
783168404Spjd}
784168404Spjd
785168404Spjdstatic int
786168404Spjddbuf_new_block(dmu_buf_impl_t *db)
787168404Spjd{
788168404Spjd	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
789168404Spjd	uint64_t birth_txg = 0;
790168404Spjd
791168404Spjd	/* Don't count meta-objects */
792168404Spjd	if (ds == NULL)
793168404Spjd		return (FALSE);
794168404Spjd
795168404Spjd	/*
796168404Spjd	 * We don't need any locking to protect db_blkptr:
797168404Spjd	 * If it's syncing, then db_last_dirty will be set
798168404Spjd	 * so we'll ignore db_blkptr.
799168404Spjd	 */
800168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
801168404Spjd	/* If we have been dirtied since the last snapshot, its not new */
802168404Spjd	if (db->db_last_dirty)
803168404Spjd		birth_txg = db->db_last_dirty->dr_txg;
804168404Spjd	else if (db->db_blkptr)
805168404Spjd		birth_txg = db->db_blkptr->blk_birth;
806168404Spjd
807168404Spjd	if (birth_txg)
808168404Spjd		return (!dsl_dataset_block_freeable(ds, birth_txg));
809168404Spjd	else
810168404Spjd		return (TRUE);
811168404Spjd}
812168404Spjd
813168404Spjdvoid
814168404Spjddbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
815168404Spjd{
816168404Spjd	arc_buf_t *buf, *obuf;
817168404Spjd	int osize = db->db.db_size;
818168404Spjd	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
819168404Spjd
820168404Spjd	ASSERT(db->db_blkid != DB_BONUS_BLKID);
821168404Spjd
822168404Spjd	/* XXX does *this* func really need the lock? */
823168404Spjd	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
824168404Spjd
825168404Spjd	/*
826168404Spjd	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
827168404Spjd	 * is OK, because there can be no other references to the db
828168404Spjd	 * when we are changing its size, so no concurrent DB_FILL can
829168404Spjd	 * be happening.
830168404Spjd	 */
831168404Spjd	/*
832168404Spjd	 * XXX we should be doing a dbuf_read, checking the return
833168404Spjd	 * value and returning that up to our callers
834168404Spjd	 */
835168404Spjd	dbuf_will_dirty(db, tx);
836168404Spjd
837168404Spjd	/* create the data buffer for the new block */
838168404Spjd	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
839168404Spjd
840168404Spjd	/* copy old block data to the new block */
841168404Spjd	obuf = db->db_buf;
842168404Spjd	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
843168404Spjd	/* zero the remainder */
844168404Spjd	if (size > osize)
845168404Spjd		bzero((uint8_t *)buf->b_data + osize, size - osize);
846168404Spjd
847168404Spjd	mutex_enter(&db->db_mtx);
848168404Spjd	dbuf_set_data(db, buf);
849168404Spjd	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
850168404Spjd	db->db.db_size = size;
851168404Spjd
852168404Spjd	if (db->db_level == 0) {
853168404Spjd		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
854168404Spjd		db->db_last_dirty->dt.dl.dr_data = buf;
855168404Spjd	}
856168404Spjd	mutex_exit(&db->db_mtx);
857168404Spjd
858168404Spjd	dnode_willuse_space(db->db_dnode, size-osize, tx);
859168404Spjd}
860168404Spjd
861168404Spjddbuf_dirty_record_t *
862168404Spjddbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
863168404Spjd{
864168404Spjd	dnode_t *dn = db->db_dnode;
865168404Spjd	objset_impl_t *os = dn->dn_objset;
866168404Spjd	dbuf_dirty_record_t **drp, *dr;
867168404Spjd	int drop_struct_lock = FALSE;
868168404Spjd	int txgoff = tx->tx_txg & TXG_MASK;
869168404Spjd
870168404Spjd	ASSERT(tx->tx_txg != 0);
871168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
872168404Spjd	DMU_TX_DIRTY_BUF(tx, db);
873168404Spjd
874168404Spjd	/*
875168404Spjd	 * Shouldn't dirty a regular buffer in syncing context.  Private
876168404Spjd	 * objects may be dirtied in syncing context, but only if they
877168404Spjd	 * were already pre-dirtied in open context.
878168404Spjd	 * XXX We may want to prohibit dirtying in syncing context even
879168404Spjd	 * if they did pre-dirty.
880168404Spjd	 */
881168404Spjd	ASSERT(!dmu_tx_is_syncing(tx) ||
882168404Spjd	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
883168404Spjd	    dn->dn_object == DMU_META_DNODE_OBJECT ||
884168404Spjd	    dn->dn_objset->os_dsl_dataset == NULL ||
885168404Spjd	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
886168404Spjd
887168404Spjd	/*
888168404Spjd	 * We make this assert for private objects as well, but after we
889168404Spjd	 * check if we're already dirty.  They are allowed to re-dirty
890168404Spjd	 * in syncing context.
891168404Spjd	 */
892168404Spjd	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
893168404Spjd	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
894168404Spjd	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
895168404Spjd
896168404Spjd	mutex_enter(&db->db_mtx);
897168404Spjd	/*
898168404Spjd	 * XXX make this true for indirects too?  The problem is that
899168404Spjd	 * transactions created with dmu_tx_create_assigned() from
900168404Spjd	 * syncing context don't bother holding ahead.
901168404Spjd	 */
902168404Spjd	ASSERT(db->db_level != 0 ||
903168404Spjd	    db->db_state == DB_CACHED || db->db_state == DB_FILL);
904168404Spjd
905168404Spjd	mutex_enter(&dn->dn_mtx);
906168404Spjd	/*
907168404Spjd	 * Don't set dirtyctx to SYNC if we're just modifying this as we
908168404Spjd	 * initialize the objset.
909168404Spjd	 */
910168404Spjd	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
911168404Spjd	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
912168404Spjd		dn->dn_dirtyctx =
913168404Spjd		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
914168404Spjd		ASSERT(dn->dn_dirtyctx_firstset == NULL);
915168404Spjd		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
916168404Spjd	}
917168404Spjd	mutex_exit(&dn->dn_mtx);
918168404Spjd
919168404Spjd	/*
920168404Spjd	 * If this buffer is already dirty, we're done.
921168404Spjd	 */
922168404Spjd	drp = &db->db_last_dirty;
923168404Spjd	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
924168404Spjd	    db->db.db_object == DMU_META_DNODE_OBJECT);
925168404Spjd	while (*drp && (*drp)->dr_txg > tx->tx_txg)
926168404Spjd		drp = &(*drp)->dr_next;
927168404Spjd	if (*drp && (*drp)->dr_txg == tx->tx_txg) {
928168404Spjd		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
929168404Spjd			/*
930168404Spjd			 * If this buffer has already been written out,
931168404Spjd			 * we now need to reset its state.
932168404Spjd			 */
933168404Spjd			dbuf_unoverride(*drp);
934168404Spjd			if (db->db.db_object != DMU_META_DNODE_OBJECT)
935168404Spjd				arc_buf_thaw(db->db_buf);
936168404Spjd		}
937168404Spjd		mutex_exit(&db->db_mtx);
938168404Spjd		return (*drp);
939168404Spjd	}
940168404Spjd
941168404Spjd	/*
942168404Spjd	 * Only valid if not already dirty.
943168404Spjd	 */
944168404Spjd	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
945168404Spjd	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
946168404Spjd
947168404Spjd	ASSERT3U(dn->dn_nlevels, >, db->db_level);
948168404Spjd	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
949168404Spjd	    dn->dn_phys->dn_nlevels > db->db_level ||
950168404Spjd	    dn->dn_next_nlevels[txgoff] > db->db_level ||
951168404Spjd	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
952168404Spjd	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
953168404Spjd
954168404Spjd	/*
955168404Spjd	 * We should only be dirtying in syncing context if it's the
956168404Spjd	 * mos, a spa os, or we're initializing the os.  However, we are
957168404Spjd	 * allowed to dirty in syncing context provided we already
958168404Spjd	 * dirtied it in open context.  Hence we must make this
959168404Spjd	 * assertion only if we're not already dirty.
960168404Spjd	 */
961168404Spjd	ASSERT(!dmu_tx_is_syncing(tx) ||
962168404Spjd	    os->os_dsl_dataset == NULL ||
963168404Spjd	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
964168404Spjd	    !BP_IS_HOLE(os->os_rootbp));
965168404Spjd	ASSERT(db->db.db_size != 0);
966168404Spjd
967168404Spjd	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
968168404Spjd
969168404Spjd	/*
970168404Spjd	 * If this buffer is dirty in an old transaction group we need
971168404Spjd	 * to make a copy of it so that the changes we make in this
972168404Spjd	 * transaction group won't leak out when we sync the older txg.
973168404Spjd	 */
974168404Spjd	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
975168404Spjd	if (db->db_level == 0) {
976168404Spjd		void *data_old = db->db_buf;
977168404Spjd
978168404Spjd		if (db->db_blkid == DB_BONUS_BLKID) {
979168404Spjd			dbuf_fix_old_data(db, tx->tx_txg);
980168404Spjd			data_old = db->db.db_data;
981168404Spjd		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
982168404Spjd			/*
983168404Spjd			 * Release the data buffer from the cache so that we
984168404Spjd			 * can modify it without impacting possible other users
985168404Spjd			 * of this cached data block.  Note that indirect
986168404Spjd			 * blocks and private objects are not released until the
987168404Spjd			 * syncing state (since they are only modified then).
988168404Spjd			 */
989168404Spjd			arc_release(db->db_buf, db);
990168404Spjd			dbuf_fix_old_data(db, tx->tx_txg);
991168404Spjd			data_old = db->db_buf;
992168404Spjd		}
993168404Spjd		ASSERT(data_old != NULL);
994168404Spjd		dr->dt.dl.dr_data = data_old;
995168404Spjd	} else {
996168404Spjd		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
997168404Spjd		list_create(&dr->dt.di.dr_children,
998168404Spjd		    sizeof (dbuf_dirty_record_t),
999168404Spjd		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1000168404Spjd	}
1001168404Spjd	dr->dr_dbuf = db;
1002168404Spjd	dr->dr_txg = tx->tx_txg;
1003168404Spjd	dr->dr_next = *drp;
1004168404Spjd	*drp = dr;
1005168404Spjd
1006168404Spjd	/*
1007168404Spjd	 * We could have been freed_in_flight between the dbuf_noread
1008168404Spjd	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1009168404Spjd	 * happened after the free.
1010168404Spjd	 */
1011168404Spjd	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
1012168404Spjd		mutex_enter(&dn->dn_mtx);
1013168404Spjd		dnode_clear_range(dn, db->db_blkid, 1, tx);
1014168404Spjd		mutex_exit(&dn->dn_mtx);
1015168404Spjd		db->db_freed_in_flight = FALSE;
1016168404Spjd	}
1017168404Spjd
1018168404Spjd	if (db->db_blkid != DB_BONUS_BLKID) {
1019168404Spjd		/*
1020168404Spjd		 * Update the accounting.
1021168404Spjd		 */
1022168404Spjd		if (!dbuf_new_block(db) && db->db_blkptr) {
1023168404Spjd			/*
1024168404Spjd			 * This is only a guess -- if the dbuf is dirty
1025168404Spjd			 * in a previous txg, we don't know how much
1026168404Spjd			 * space it will use on disk yet.  We should
1027168404Spjd			 * really have the struct_rwlock to access
1028168404Spjd			 * db_blkptr, but since this is just a guess,
1029168404Spjd			 * it's OK if we get an odd answer.
1030168404Spjd			 */
1031168404Spjd			dnode_willuse_space(dn,
1032168404Spjd			    -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
1033168404Spjd		}
1034168404Spjd		dnode_willuse_space(dn, db->db.db_size, tx);
1035168404Spjd	}
1036168404Spjd
1037168404Spjd	/*
1038168404Spjd	 * This buffer is now part of this txg
1039168404Spjd	 */
1040168404Spjd	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1041168404Spjd	db->db_dirtycnt += 1;
1042168404Spjd	ASSERT3U(db->db_dirtycnt, <=, 3);
1043168404Spjd
1044168404Spjd	mutex_exit(&db->db_mtx);
1045168404Spjd
1046168404Spjd	if (db->db_blkid == DB_BONUS_BLKID) {
1047168404Spjd		mutex_enter(&dn->dn_mtx);
1048168404Spjd		ASSERT(!list_link_active(&dr->dr_dirty_node));
1049168404Spjd		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1050168404Spjd		mutex_exit(&dn->dn_mtx);
1051168404Spjd		dnode_setdirty(dn, tx);
1052168404Spjd		return (dr);
1053168404Spjd	}
1054168404Spjd
1055168404Spjd	if (db->db_level == 0) {
1056168404Spjd		dnode_new_blkid(dn, db->db_blkid, tx);
1057168404Spjd		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1058168404Spjd	}
1059168404Spjd
1060168404Spjd	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1061168404Spjd		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1062168404Spjd		drop_struct_lock = TRUE;
1063168404Spjd	}
1064168404Spjd
1065168404Spjd	if (db->db_level+1 < dn->dn_nlevels) {
1066168404Spjd		dmu_buf_impl_t *parent = db->db_parent;
1067168404Spjd		dbuf_dirty_record_t *di;
1068168404Spjd		int parent_held = FALSE;
1069168404Spjd
1070168404Spjd		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1071168404Spjd			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1072168404Spjd
1073168404Spjd			parent = dbuf_hold_level(dn, db->db_level+1,
1074168404Spjd			    db->db_blkid >> epbs, FTAG);
1075168404Spjd			parent_held = TRUE;
1076168404Spjd		}
1077168404Spjd		if (drop_struct_lock)
1078168404Spjd			rw_exit(&dn->dn_struct_rwlock);
1079168404Spjd		ASSERT3U(db->db_level+1, ==, parent->db_level);
1080168404Spjd		di = dbuf_dirty(parent, tx);
1081168404Spjd		if (parent_held)
1082168404Spjd			dbuf_rele(parent, FTAG);
1083168404Spjd
1084168404Spjd		mutex_enter(&db->db_mtx);
1085168404Spjd		/*  possible race with dbuf_undirty() */
1086168404Spjd		if (db->db_last_dirty == dr ||
1087168404Spjd		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1088168404Spjd			mutex_enter(&di->dt.di.dr_mtx);
1089168404Spjd			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1090168404Spjd			ASSERT(!list_link_active(&dr->dr_dirty_node));
1091168404Spjd			list_insert_tail(&di->dt.di.dr_children, dr);
1092168404Spjd			mutex_exit(&di->dt.di.dr_mtx);
1093168404Spjd			dr->dr_parent = di;
1094168404Spjd		}
1095168404Spjd		mutex_exit(&db->db_mtx);
1096168404Spjd	} else {
1097168404Spjd		ASSERT(db->db_level+1 == dn->dn_nlevels);
1098168404Spjd		ASSERT(db->db_blkid < dn->dn_nblkptr);
1099168404Spjd		ASSERT(db->db_parent == NULL ||
1100168404Spjd		    db->db_parent == db->db_dnode->dn_dbuf);
1101168404Spjd		mutex_enter(&dn->dn_mtx);
1102168404Spjd		ASSERT(!list_link_active(&dr->dr_dirty_node));
1103168404Spjd		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1104168404Spjd		mutex_exit(&dn->dn_mtx);
1105168404Spjd		if (drop_struct_lock)
1106168404Spjd			rw_exit(&dn->dn_struct_rwlock);
1107168404Spjd	}
1108168404Spjd
1109168404Spjd	dnode_setdirty(dn, tx);
1110168404Spjd	return (dr);
1111168404Spjd}
1112168404Spjd
1113168404Spjdstatic int
1114168404Spjddbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1115168404Spjd{
1116168404Spjd	dnode_t *dn = db->db_dnode;
1117168404Spjd	uint64_t txg = tx->tx_txg;
1118168404Spjd	dbuf_dirty_record_t *dr;
1119168404Spjd
1120168404Spjd	ASSERT(txg != 0);
1121168404Spjd	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1122168404Spjd
1123168404Spjd	mutex_enter(&db->db_mtx);
1124168404Spjd
1125168404Spjd	/*
1126168404Spjd	 * If this buffer is not dirty, we're done.
1127168404Spjd	 */
1128168404Spjd	for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
1129168404Spjd		if (dr->dr_txg <= txg)
1130168404Spjd			break;
1131168404Spjd	if (dr == NULL || dr->dr_txg < txg) {
1132168404Spjd		mutex_exit(&db->db_mtx);
1133168404Spjd		return (0);
1134168404Spjd	}
1135168404Spjd	ASSERT(dr->dr_txg == txg);
1136168404Spjd
1137168404Spjd	/*
1138168404Spjd	 * If this buffer is currently held, we cannot undirty
1139168404Spjd	 * it, since one of the current holders may be in the
1140168404Spjd	 * middle of an update.  Note that users of dbuf_undirty()
1141168404Spjd	 * should not place a hold on the dbuf before the call.
1142168404Spjd	 */
1143168404Spjd	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
1144168404Spjd		mutex_exit(&db->db_mtx);
1145168404Spjd		/* Make sure we don't toss this buffer at sync phase */
1146168404Spjd		mutex_enter(&dn->dn_mtx);
1147168404Spjd		dnode_clear_range(dn, db->db_blkid, 1, tx);
1148168404Spjd		mutex_exit(&dn->dn_mtx);
1149168404Spjd		return (0);
1150168404Spjd	}
1151168404Spjd
1152168404Spjd	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1153168404Spjd
1154168404Spjd	ASSERT(db->db.db_size != 0);
1155168404Spjd
1156168404Spjd	/* XXX would be nice to fix up dn_towrite_space[] */
1157168404Spjd
1158168404Spjd	db->db_last_dirty = dr->dr_next;
1159168404Spjd
1160168404Spjd	if (dr->dr_parent) {
1161168404Spjd		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1162168404Spjd		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1163168404Spjd		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1164168404Spjd	} else if (db->db_level+1 == dn->dn_nlevels) {
1165168404Spjd		ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
1166168404Spjd		mutex_enter(&dn->dn_mtx);
1167168404Spjd		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1168168404Spjd		mutex_exit(&dn->dn_mtx);
1169168404Spjd	}
1170168404Spjd
1171168404Spjd	if (db->db_level == 0) {
1172168404Spjd		dbuf_unoverride(dr);
1173168404Spjd
1174168404Spjd		ASSERT(db->db_buf != NULL);
1175168404Spjd		ASSERT(dr->dt.dl.dr_data != NULL);
1176168404Spjd		if (dr->dt.dl.dr_data != db->db_buf)
1177168404Spjd			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
1178168404Spjd	} else {
1179168404Spjd		ASSERT(db->db_buf != NULL);
1180168404Spjd		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1181168404Spjd		/* XXX - mutex and list destroy? */
1182168404Spjd	}
1183168404Spjd	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1184168404Spjd
1185168404Spjd	ASSERT(db->db_dirtycnt > 0);
1186168404Spjd	db->db_dirtycnt -= 1;
1187168404Spjd
1188168404Spjd	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1189168404Spjd		arc_buf_t *buf = db->db_buf;
1190168404Spjd
1191168404Spjd		ASSERT(arc_released(buf));
1192168404Spjd		dbuf_set_data(db, NULL);
1193168404Spjd		VERIFY(arc_buf_remove_ref(buf, db) == 1);
1194168404Spjd		dbuf_evict(db);
1195168404Spjd		return (1);
1196168404Spjd	}
1197168404Spjd
1198168404Spjd	mutex_exit(&db->db_mtx);
1199168404Spjd	return (0);
1200168404Spjd}
1201168404Spjd
1202168404Spjd#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1203168404Spjdvoid
1204168404Spjddbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1205168404Spjd{
1206168404Spjd	int rf = DB_RF_MUST_SUCCEED;
1207168404Spjd
1208168404Spjd	ASSERT(tx->tx_txg != 0);
1209168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
1210168404Spjd
1211168404Spjd	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
1212168404Spjd		rf |= DB_RF_HAVESTRUCT;
1213168404Spjd	(void) dbuf_read(db, NULL, rf);
1214168404Spjd	(void) dbuf_dirty(db, tx);
1215168404Spjd}
1216168404Spjd
1217168404Spjdvoid
1218168404Spjddmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1219168404Spjd{
1220168404Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1221168404Spjd
1222168404Spjd	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1223168404Spjd	ASSERT(tx->tx_txg != 0);
1224168404Spjd	ASSERT(db->db_level == 0);
1225168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
1226168404Spjd
1227168404Spjd	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1228168404Spjd	    dmu_tx_private_ok(tx));
1229168404Spjd
1230168404Spjd	dbuf_noread(db);
1231168404Spjd	(void) dbuf_dirty(db, tx);
1232168404Spjd}
1233168404Spjd
1234168404Spjd#pragma weak dmu_buf_fill_done = dbuf_fill_done
1235168404Spjd/* ARGSUSED */
1236168404Spjdvoid
1237168404Spjddbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1238168404Spjd{
1239168404Spjd	mutex_enter(&db->db_mtx);
1240168404Spjd	DBUF_VERIFY(db);
1241168404Spjd
1242168404Spjd	if (db->db_state == DB_FILL) {
1243168404Spjd		if (db->db_level == 0 && db->db_freed_in_flight) {
1244168404Spjd			ASSERT(db->db_blkid != DB_BONUS_BLKID);
1245168404Spjd			/* we were freed while filling */
1246168404Spjd			/* XXX dbuf_undirty? */
1247168404Spjd			bzero(db->db.db_data, db->db.db_size);
1248168404Spjd			db->db_freed_in_flight = FALSE;
1249168404Spjd		}
1250168404Spjd		db->db_state = DB_CACHED;
1251168404Spjd		cv_broadcast(&db->db_changed);
1252168404Spjd	}
1253168404Spjd	mutex_exit(&db->db_mtx);
1254168404Spjd}
1255168404Spjd
1256168404Spjd/*
1257168404Spjd * "Clear" the contents of this dbuf.  This will mark the dbuf
1258168404Spjd * EVICTING and clear *most* of its references.  Unfortunetely,
1259168404Spjd * when we are not holding the dn_dbufs_mtx, we can't clear the
1260168404Spjd * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1261168404Spjd * in this case.  For callers from the DMU we will usually see:
1262168404Spjd *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1263168404Spjd * For the arc callback, we will usually see:
1264168404Spjd * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1265168404Spjd * Sometimes, though, we will get a mix of these two:
1266168404Spjd *	DMU: dbuf_clear()->arc_buf_evict()
1267168404Spjd *	ARC: dbuf_do_evict()->dbuf_destroy()
1268168404Spjd */
1269168404Spjdvoid
1270168404Spjddbuf_clear(dmu_buf_impl_t *db)
1271168404Spjd{
1272168404Spjd	dnode_t *dn = db->db_dnode;
1273168404Spjd	dmu_buf_impl_t *parent = db->db_parent;
1274168404Spjd	dmu_buf_impl_t *dndb = dn->dn_dbuf;
1275168404Spjd	int dbuf_gone = FALSE;
1276168404Spjd
1277168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
1278168404Spjd	ASSERT(refcount_is_zero(&db->db_holds));
1279168404Spjd
1280168404Spjd	dbuf_evict_user(db);
1281168404Spjd
1282168404Spjd	if (db->db_state == DB_CACHED) {
1283168404Spjd		ASSERT(db->db.db_data != NULL);
1284168404Spjd		if (db->db_blkid == DB_BONUS_BLKID)
1285168404Spjd			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1286168404Spjd		db->db.db_data = NULL;
1287168404Spjd		db->db_state = DB_UNCACHED;
1288168404Spjd	}
1289168404Spjd
1290168404Spjd	ASSERT3U(db->db_state, ==, DB_UNCACHED);
1291168404Spjd	ASSERT(db->db_data_pending == NULL);
1292168404Spjd
1293168404Spjd	db->db_state = DB_EVICTING;
1294168404Spjd	db->db_blkptr = NULL;
1295168404Spjd
1296168404Spjd	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1297168404Spjd		list_remove(&dn->dn_dbufs, db);
1298168404Spjd		dnode_rele(dn, db);
1299168404Spjd	}
1300168404Spjd
1301168404Spjd	if (db->db_buf)
1302168404Spjd		dbuf_gone = arc_buf_evict(db->db_buf);
1303168404Spjd
1304168404Spjd	if (!dbuf_gone)
1305168404Spjd		mutex_exit(&db->db_mtx);
1306168404Spjd
1307168404Spjd	/*
1308168404Spjd	 * If this dbuf is referened from an indirect dbuf,
1309168404Spjd	 * decrement the ref count on the indirect dbuf.
1310168404Spjd	 */
1311168404Spjd	if (parent && parent != dndb)
1312168404Spjd		dbuf_rele(parent, db);
1313168404Spjd}
1314168404Spjd
1315168404Spjdstatic int
1316168404Spjddbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1317168404Spjd    dmu_buf_impl_t **parentp, blkptr_t **bpp)
1318168404Spjd{
1319168404Spjd	int nlevels, epbs;
1320168404Spjd
1321168404Spjd	*parentp = NULL;
1322168404Spjd	*bpp = NULL;
1323168404Spjd
1324168404Spjd	ASSERT(blkid != DB_BONUS_BLKID);
1325168404Spjd
1326168404Spjd	if (dn->dn_phys->dn_nlevels == 0)
1327168404Spjd		nlevels = 1;
1328168404Spjd	else
1329168404Spjd		nlevels = dn->dn_phys->dn_nlevels;
1330168404Spjd
1331168404Spjd	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1332168404Spjd
1333168404Spjd	ASSERT3U(level * epbs, <, 64);
1334168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1335168404Spjd	if (level >= nlevels ||
1336168404Spjd	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1337168404Spjd		/* the buffer has no parent yet */
1338168404Spjd		return (ENOENT);
1339168404Spjd	} else if (level < nlevels-1) {
1340168404Spjd		/* this block is referenced from an indirect block */
1341168404Spjd		int err = dbuf_hold_impl(dn, level+1,
1342168404Spjd		    blkid >> epbs, fail_sparse, NULL, parentp);
1343168404Spjd		if (err)
1344168404Spjd			return (err);
1345168404Spjd		err = dbuf_read(*parentp, NULL,
1346168404Spjd		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1347168404Spjd		if (err) {
1348168404Spjd			dbuf_rele(*parentp, NULL);
1349168404Spjd			*parentp = NULL;
1350168404Spjd			return (err);
1351168404Spjd		}
1352168404Spjd		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1353168404Spjd		    (blkid & ((1ULL << epbs) - 1));
1354168404Spjd		return (0);
1355168404Spjd	} else {
1356168404Spjd		/* the block is referenced from the dnode */
1357168404Spjd		ASSERT3U(level, ==, nlevels-1);
1358168404Spjd		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1359168404Spjd		    blkid < dn->dn_phys->dn_nblkptr);
1360168404Spjd		if (dn->dn_dbuf) {
1361168404Spjd			dbuf_add_ref(dn->dn_dbuf, NULL);
1362168404Spjd			*parentp = dn->dn_dbuf;
1363168404Spjd		}
1364168404Spjd		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1365168404Spjd		return (0);
1366168404Spjd	}
1367168404Spjd}
1368168404Spjd
1369168404Spjdstatic dmu_buf_impl_t *
1370168404Spjddbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1371168404Spjd    dmu_buf_impl_t *parent, blkptr_t *blkptr)
1372168404Spjd{
1373168404Spjd	objset_impl_t *os = dn->dn_objset;
1374168404Spjd	dmu_buf_impl_t *db, *odb;
1375168404Spjd
1376168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1377168404Spjd	ASSERT(dn->dn_type != DMU_OT_NONE);
1378168404Spjd
1379168404Spjd	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1380168404Spjd
1381168404Spjd	db->db_objset = os;
1382168404Spjd	db->db.db_object = dn->dn_object;
1383168404Spjd	db->db_level = level;
1384168404Spjd	db->db_blkid = blkid;
1385168404Spjd	db->db_last_dirty = NULL;
1386168404Spjd	db->db_dirtycnt = 0;
1387168404Spjd	db->db_dnode = dn;
1388168404Spjd	db->db_parent = parent;
1389168404Spjd	db->db_blkptr = blkptr;
1390168404Spjd
1391168404Spjd	db->db_user_ptr = NULL;
1392168404Spjd	db->db_user_data_ptr_ptr = NULL;
1393168404Spjd	db->db_evict_func = NULL;
1394168404Spjd	db->db_immediate_evict = 0;
1395168404Spjd	db->db_freed_in_flight = 0;
1396168404Spjd
1397168404Spjd	if (blkid == DB_BONUS_BLKID) {
1398168404Spjd		ASSERT3P(parent, ==, dn->dn_dbuf);
1399168404Spjd		db->db.db_size = dn->dn_bonuslen;
1400168404Spjd		db->db.db_offset = DB_BONUS_BLKID;
1401168404Spjd		db->db_state = DB_UNCACHED;
1402168404Spjd		/* the bonus dbuf is not placed in the hash table */
1403168404Spjd		return (db);
1404168404Spjd	} else {
1405168404Spjd		int blocksize =
1406168404Spjd		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
1407168404Spjd		db->db.db_size = blocksize;
1408168404Spjd		db->db.db_offset = db->db_blkid * blocksize;
1409168404Spjd	}
1410168404Spjd
1411168404Spjd	/*
1412168404Spjd	 * Hold the dn_dbufs_mtx while we insert the new dbuf
1413168404Spjd	 * into the hash table *and* add it to the dn_dbufs list.
1414168404Spjd	 * This prevents a possible deadlock with someone
1415168404Spjd	 * trying to look up this dbuf before it's added to the
1416168404Spjd	 * dn_dbufs list.
1417168404Spjd	 */
1418168404Spjd	mutex_enter(&dn->dn_dbufs_mtx);
1419168404Spjd	db->db_state = DB_EVICTING;
1420168404Spjd	if ((odb = dbuf_hash_insert(db)) != NULL) {
1421168404Spjd		/* someone else inserted it first */
1422168404Spjd		kmem_cache_free(dbuf_cache, db);
1423168404Spjd		mutex_exit(&dn->dn_dbufs_mtx);
1424168404Spjd		return (odb);
1425168404Spjd	}
1426168404Spjd	list_insert_head(&dn->dn_dbufs, db);
1427168404Spjd	db->db_state = DB_UNCACHED;
1428168404Spjd	mutex_exit(&dn->dn_dbufs_mtx);
1429168404Spjd
1430168404Spjd	if (parent && parent != dn->dn_dbuf)
1431168404Spjd		dbuf_add_ref(parent, db);
1432168404Spjd
1433168404Spjd	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1434168404Spjd	    refcount_count(&dn->dn_holds) > 0);
1435168404Spjd	(void) refcount_add(&dn->dn_holds, db);
1436168404Spjd
1437168404Spjd	dprintf_dbuf(db, "db=%p\n", db);
1438168404Spjd
1439168404Spjd	return (db);
1440168404Spjd}
1441168404Spjd
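/*
 * Eviction callback handed to the ARC via arc_set_callback(): invoked
 * when the ARC discards the buffer backing this dbuf.  If the dbuf is
 * still cached it is evicted here; if it is already in DB_EVICTING it
 * is simply destroyed.
 */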
1442168404Spjdstatic int
1443168404Spjddbuf_do_evict(void *private)
1444168404Spjd{
1445168404Spjd	arc_buf_t *buf = private;
1446168404Spjd	dmu_buf_impl_t *db = buf->b_private;
1447168404Spjd
1448168404Spjd	if (!MUTEX_HELD(&db->db_mtx))
1449168404Spjd		mutex_enter(&db->db_mtx);
1450168404Spjd
1451168404Spjd	ASSERT(refcount_is_zero(&db->db_holds));
1452168404Spjd
1453168404Spjd	if (db->db_state != DB_EVICTING) {
1454168404Spjd		ASSERT(db->db_state == DB_CACHED);
1455168404Spjd		DBUF_VERIFY(db);
1456168404Spjd		db->db_buf = NULL;
1457168404Spjd		dbuf_evict(db);
1458168404Spjd	} else {
1459168404Spjd		mutex_exit(&db->db_mtx);
1460168404Spjd		dbuf_destroy(db);
1461168404Spjd	}
1462168404Spjd	return (0);
1463168404Spjd}
1464168404Spjd
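/*
 * Free a dbuf that has no remaining holds.  A non-bonus dbuf is removed
 * from the dnode's dn_dbufs list (dropping its dnode hold) and from the
 * dbuf hash table before being returned to the dbuf kmem cache.
 */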
1465168404Spjdstatic void
1466168404Spjddbuf_destroy(dmu_buf_impl_t *db)
1467168404Spjd{
1468168404Spjd	ASSERT(refcount_is_zero(&db->db_holds));
1469168404Spjd
1470168404Spjd	if (db->db_blkid != DB_BONUS_BLKID) {
1471168404Spjd		dnode_t *dn = db->db_dnode;
1472168404Spjd
1473168404Spjd		/*
1474168404Spjd		 * If this dbuf is still on the dn_dbufs list,
1475168404Spjd		 * remove it from that list.
1476168404Spjd		 */
1477168404Spjd		if (list_link_active(&db->db_link)) {
1478168404Spjd			mutex_enter(&dn->dn_dbufs_mtx);
1479168404Spjd			list_remove(&dn->dn_dbufs, db);
1480168404Spjd			mutex_exit(&dn->dn_dbufs_mtx);
1481168404Spjd
1482168404Spjd			dnode_rele(dn, db);
1483168404Spjd		}
1484168404Spjd		dbuf_hash_remove(db);
1485168404Spjd	}
1486168404Spjd	db->db_parent = NULL;
1487168404Spjd	db->db_dnode = NULL;
1488168404Spjd	db->db_buf = NULL;
1489168404Spjd
1490168404Spjd	ASSERT(db->db.db_data == NULL);
1491168404Spjd	ASSERT(db->db_hash_next == NULL);
1492168404Spjd	ASSERT(db->db_blkptr == NULL);
1493168404Spjd	ASSERT(db->db_data_pending == NULL);
1494168404Spjd
1495168404Spjd	kmem_cache_free(dbuf_cache, db);
1496168404Spjd}
1497168404Spjd
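/*
 * Start an asynchronous, speculative read of a level-0 block into the
 * ARC.  The prefetch is skipped if dnode_block_freed() reports the block
 * freed, or if an actively held dbuf already covers it.
 */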
1498168404Spjdvoid
1499168404Spjddbuf_prefetch(dnode_t *dn, uint64_t blkid)
1500168404Spjd{
1501168404Spjd	dmu_buf_impl_t *db = NULL;
1502168404Spjd	blkptr_t *bp = NULL;
1503168404Spjd
1504168404Spjd	ASSERT(blkid != DB_BONUS_BLKID);
1505168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1506168404Spjd
1507168404Spjd	if (dnode_block_freed(dn, blkid))
1508168404Spjd		return;
1509168404Spjd
1510168404Spjd	/* dbuf_find() returns with db_mtx held */
1511168404Spjd	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1512168404Spjd		if (refcount_count(&db->db_holds) > 0) {
1513168404Spjd			/*
1514168404Spjd			 * This dbuf is active.  We assume that it is
1515168404Spjd			 * already CACHED, or else about to be either
1516168404Spjd			 * read or filled.
1517168404Spjd			 */
1518168404Spjd			mutex_exit(&db->db_mtx);
1519168404Spjd			return;
1520168404Spjd		}
1521168404Spjd		mutex_exit(&db->db_mtx);
1522168404Spjd		db = NULL;
1523168404Spjd	}
1524168404Spjd
1525168404Spjd	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1526168404Spjd		if (bp && !BP_IS_HOLE(bp)) {
1527168404Spjd			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1528168404Spjd			zbookmark_t zb;
1529168404Spjd			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
1530168404Spjd			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
1531168404Spjd			zb.zb_object = dn->dn_object;
1532168404Spjd			zb.zb_level = 0;
1533168404Spjd			zb.zb_blkid = blkid;
1534168404Spjd
1535168404Spjd			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
1536168404Spjd			    dmu_ot[dn->dn_type].ot_byteswap,
1537168404Spjd			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
1538168404Spjd			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1539168404Spjd			    &aflags, &zb);
1540168404Spjd		}
1541168404Spjd		if (db)
1542168404Spjd			dbuf_rele(db, NULL);
1543168404Spjd	}
1544168404Spjd}
1545168404Spjd
1546168404Spjd/*
1547168404Spjd * Returns with db_holds incremented, and db_mtx not held.
1548168404Spjd * Note: dn_struct_rwlock must be held.
1549168404Spjd */
1550168404Spjdint
1551168404Spjddbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1552168404Spjd    void *tag, dmu_buf_impl_t **dbp)
1553168404Spjd{
1554168404Spjd	dmu_buf_impl_t *db, *parent = NULL;
1555168404Spjd
1556168404Spjd	ASSERT(blkid != DB_BONUS_BLKID);
1557168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1558168404Spjd	ASSERT3U(dn->dn_nlevels, >, level);
1559168404Spjd
1560168404Spjd	*dbp = NULL;
1561168404Spjdtop:
1562168404Spjd	/* dbuf_find() returns with db_mtx held */
1563168404Spjd	db = dbuf_find(dn, level, blkid);
1564168404Spjd
1565168404Spjd	if (db == NULL) {
1566168404Spjd		blkptr_t *bp = NULL;
1567168404Spjd		int err;
1568168404Spjd
1569168404Spjd		ASSERT3P(parent, ==, NULL);
1570168404Spjd		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1571168404Spjd		if (fail_sparse) {
1572168404Spjd			if (err == 0 && bp && BP_IS_HOLE(bp))
1573168404Spjd				err = ENOENT;
1574168404Spjd			if (err) {
1575168404Spjd				if (parent)
1576168404Spjd					dbuf_rele(parent, NULL);
1577168404Spjd				return (err);
1578168404Spjd			}
1579168404Spjd		}
1580168404Spjd		if (err && err != ENOENT)
1581168404Spjd			return (err);
1582168404Spjd		db = dbuf_create(dn, level, blkid, parent, bp);
1583168404Spjd	}
1584168404Spjd
1585168404Spjd	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1586168404Spjd		arc_buf_add_ref(db->db_buf, db);
1587168404Spjd		if (db->db_buf->b_data == NULL) {
1588168404Spjd			dbuf_clear(db);
1589168404Spjd			if (parent) {
1590168404Spjd				dbuf_rele(parent, NULL);
1591168404Spjd				parent = NULL;
1592168404Spjd			}
1593168404Spjd			goto top;
1594168404Spjd		}
1595168404Spjd		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1596168404Spjd	}
1597168404Spjd
1598168404Spjd	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1599168404Spjd
1600168404Spjd	/*
1601168404Spjd	 * If this buffer is currently syncing out, and we are
1602168404Spjd	 * still referencing it from db_data, we need to make a copy
1603168404Spjd	 * of it in case we decide we want to dirty it again in this txg.
1604168404Spjd	 */
1605168404Spjd	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
1606168404Spjd	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1607168404Spjd	    db->db_state == DB_CACHED && db->db_data_pending) {
1608168404Spjd		dbuf_dirty_record_t *dr = db->db_data_pending;
1609168404Spjd
1610168404Spjd		if (dr->dt.dl.dr_data == db->db_buf) {
1611168404Spjd			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1612168404Spjd
1613168404Spjd			dbuf_set_data(db,
1614168404Spjd			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
1615168404Spjd			    db->db.db_size, db, type));
1616168404Spjd			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1617168404Spjd			    db->db.db_size);
1618168404Spjd		}
1619168404Spjd	}
1620168404Spjd
1621168404Spjd	(void) refcount_add(&db->db_holds, tag);
1622168404Spjd	dbuf_update_data(db);
1623168404Spjd	DBUF_VERIFY(db);
1624168404Spjd	mutex_exit(&db->db_mtx);
1625168404Spjd
1626168404Spjd	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1627168404Spjd	if (parent)
1628168404Spjd		dbuf_rele(parent, NULL);
1629168404Spjd
1630168404Spjd	ASSERT3P(db->db_dnode, ==, dn);
1631168404Spjd	ASSERT3U(db->db_blkid, ==, blkid);
1632168404Spjd	ASSERT3U(db->db_level, ==, level);
1633168404Spjd	*dbp = db;
1634168404Spjd
1635168404Spjd	return (0);
1636168404Spjd}
1637168404Spjd
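/*
 * Convenience wrappers around dbuf_hold_impl() for level-0 and
 * arbitrary-level dbufs; both return NULL on error instead of an errno.
 *
 * Illustrative usage (a sketch, not taken from this file; it assumes the
 * caller already holds dn_struct_rwlock and uses FTAG as its hold tag):
 *
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	if (db != NULL) {
 *		... use db ...
 *		dbuf_rele(db, FTAG);
 *	}
 */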
1638168404Spjddmu_buf_impl_t *
1639168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1640168404Spjd{
1641168404Spjd	dmu_buf_impl_t *db;
1642168404Spjd	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1643168404Spjd	return (err ? NULL : db);
1644168404Spjd}
1645168404Spjd
1646168404Spjddmu_buf_impl_t *
1647168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1648168404Spjd{
1649168404Spjd	dmu_buf_impl_t *db;
1650168404Spjd	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1651168404Spjd	return (err ? NULL : db);
1652168404Spjd}
1653168404Spjd
1654168404Spjddmu_buf_impl_t *
1655168404Spjddbuf_create_bonus(dnode_t *dn)
1656168404Spjd{
1657168404Spjd	dmu_buf_impl_t *db;
1658168404Spjd
1659168404Spjd	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1660168404Spjd
1661168404Spjd	ASSERT(dn->dn_bonus == NULL);
1662168404Spjd	db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
1663168404Spjd	return (db);
1664168404Spjd}
1665168404Spjd
1666168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref
1667168404Spjdvoid
1668168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1669168404Spjd{
1670168404Spjd	int64_t holds = refcount_add(&db->db_holds, tag);
1671168404Spjd	ASSERT(holds > 1);
1672168404Spjd}
1673168404Spjd
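/*
 * Drop a hold on this dbuf.  When the last hold goes away the dbuf is
 * disposed of according to its state: a bonus dbuf just releases its
 * dnode hold, a dbuf with no ARC data or with anonymous (released) data
 * is evicted, and an ordinary cached dbuf drops its ARC reference and
 * leaves eventual eviction to the ARC callback.
 */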
1674168404Spjd#pragma weak dmu_buf_rele = dbuf_rele
1675168404Spjdvoid
1676168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag)
1677168404Spjd{
1678168404Spjd	int64_t holds;
1679168404Spjd
1680168404Spjd	mutex_enter(&db->db_mtx);
1681168404Spjd	DBUF_VERIFY(db);
1682168404Spjd
1683168404Spjd	holds = refcount_remove(&db->db_holds, tag);
1684168404Spjd	ASSERT(holds >= 0);
1685168404Spjd
1686168404Spjd	/*
1687168404Spjd	 * We can't freeze indirects if there is a possibility that they
1688168404Spjd	 * may be modified in the current syncing context.
1689168404Spjd	 */
1690168404Spjd	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
1691168404Spjd		arc_buf_freeze(db->db_buf);
1692168404Spjd
1693168404Spjd	if (holds == db->db_dirtycnt &&
1694168404Spjd	    db->db_level == 0 && db->db_immediate_evict)
1695168404Spjd		dbuf_evict_user(db);
1696168404Spjd
1697168404Spjd	if (holds == 0) {
1698168404Spjd		if (db->db_blkid == DB_BONUS_BLKID) {
1699168404Spjd			mutex_exit(&db->db_mtx);
1700168404Spjd			dnode_rele(db->db_dnode, db);
1701168404Spjd		} else if (db->db_buf == NULL) {
1702168404Spjd			/*
1703168404Spjd			 * This is a special case: we never associated this
1704168404Spjd			 * dbuf with any data allocated from the ARC.
1705168404Spjd			 */
1706168404Spjd			ASSERT3U(db->db_state, ==, DB_UNCACHED);
1707168404Spjd			dbuf_evict(db);
1708168404Spjd		} else if (arc_released(db->db_buf)) {
1709168404Spjd			arc_buf_t *buf = db->db_buf;
1710168404Spjd			/*
1711168404Spjd			 * This dbuf has anonymous data associated with it.
1712168404Spjd			 */
1713168404Spjd			dbuf_set_data(db, NULL);
1714168404Spjd			VERIFY(arc_buf_remove_ref(buf, db) == 1);
1715168404Spjd			dbuf_evict(db);
1716168404Spjd		} else {
1717168404Spjd			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
1718168404Spjd			mutex_exit(&db->db_mtx);
1719168404Spjd		}
1720168404Spjd	} else {
1721168404Spjd		mutex_exit(&db->db_mtx);
1722168404Spjd	}
1723168404Spjd}
1724168404Spjd
1725168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount
1726168404Spjduint64_t
1727168404Spjddbuf_refcount(dmu_buf_impl_t *db)
1728168404Spjd{
1729168404Spjd	return (refcount_count(&db->db_holds));
1730168404Spjd}
1731168404Spjd
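/*
 * The dmu_buf_set_user() family attaches consumer-private state and an
 * eviction callback to a level-0 dbuf.  dmu_buf_update_user() installs
 * the new user data only if the current user pointer still matches
 * old_user_ptr; otherwise it returns the current user pointer unchanged.
 * The *_ie variant additionally sets db_immediate_evict so the user
 * eviction callback can fire earlier (see dbuf_rele()).
 */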
1732168404Spjdvoid *
1733168404Spjddmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1734168404Spjd    dmu_buf_evict_func_t *evict_func)
1735168404Spjd{
1736168404Spjd	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1737168404Spjd	    user_data_ptr_ptr, evict_func));
1738168404Spjd}
1739168404Spjd
1740168404Spjdvoid *
1741168404Spjddmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1742168404Spjd    dmu_buf_evict_func_t *evict_func)
1743168404Spjd{
1744168404Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1745168404Spjd
1746168404Spjd	db->db_immediate_evict = TRUE;
1747168404Spjd	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1748168404Spjd	    user_data_ptr_ptr, evict_func));
1749168404Spjd}
1750168404Spjd
1751168404Spjdvoid *
1752168404Spjddmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
1753168404Spjd    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
1754168404Spjd{
1755168404Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1756168404Spjd	ASSERT(db->db_level == 0);
1757168404Spjd
1758168404Spjd	ASSERT((user_ptr == NULL) == (evict_func == NULL));
1759168404Spjd
1760168404Spjd	mutex_enter(&db->db_mtx);
1761168404Spjd
1762168404Spjd	if (db->db_user_ptr == old_user_ptr) {
1763168404Spjd		db->db_user_ptr = user_ptr;
1764168404Spjd		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
1765168404Spjd		db->db_evict_func = evict_func;
1766168404Spjd
1767168404Spjd		dbuf_update_data(db);
1768168404Spjd	} else {
1769168404Spjd		old_user_ptr = db->db_user_ptr;
1770168404Spjd	}
1771168404Spjd
1772168404Spjd	mutex_exit(&db->db_mtx);
1773168404Spjd	return (old_user_ptr);
1774168404Spjd}
1775168404Spjd
1776168404Spjdvoid *
1777168404Spjddmu_buf_get_user(dmu_buf_t *db_fake)
1778168404Spjd{
1779168404Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1780168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
1781168404Spjd
1782168404Spjd	return (db->db_user_ptr);
1783168404Spjd}
1784168404Spjd
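/*
 * Make sure db_parent and db_blkptr are set up before syncing.  A dbuf
 * created before its parent existed gets hooked up here, either directly
 * to the dnode's blkptr array or to the matching slot of its parent
 * indirect block (holding that parent if necessary).
 */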
1785168404Spjdstatic void
1786168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
1787168404Spjd{
1788168404Spjd	/* ASSERT(dmu_tx_is_syncing(tx)) */
1789168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
1790168404Spjd
1791168404Spjd	if (db->db_blkptr != NULL)
1792168404Spjd		return;
1793168404Spjd
1794168404Spjd	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
1795168404Spjd		/*
1796168404Spjd		 * This buffer was allocated at a time when there were
1797168404Spjd		 * no blkptrs available in the dnode, or it was
1798168404Spjd		 * inappropriate to hook it in (i.e., nlevels mismatch).
1799168404Spjd		 */
1800168404Spjd		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
1801168404Spjd		ASSERT(db->db_parent == NULL);
1802168404Spjd		db->db_parent = dn->dn_dbuf;
1803168404Spjd		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
1804168404Spjd		DBUF_VERIFY(db);
1805168404Spjd	} else {
1806168404Spjd		dmu_buf_impl_t *parent = db->db_parent;
1807168404Spjd		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1808168404Spjd
1809168404Spjd		ASSERT(dn->dn_phys->dn_nlevels > 1);
1810168404Spjd		if (parent == NULL) {
1811168404Spjd			mutex_exit(&db->db_mtx);
1812168404Spjd			rw_enter(&dn->dn_struct_rwlock, RW_READER);
1813168404Spjd			(void) dbuf_hold_impl(dn, db->db_level+1,
1814168404Spjd			    db->db_blkid >> epbs, FALSE, db, &parent);
1815168404Spjd			rw_exit(&dn->dn_struct_rwlock);
1816168404Spjd			mutex_enter(&db->db_mtx);
1817168404Spjd			db->db_parent = parent;
1818168404Spjd		}
1819168404Spjd		db->db_blkptr = (blkptr_t *)parent->db.db_data +
1820168404Spjd		    (db->db_blkid & ((1ULL << epbs) - 1));
1821168404Spjd		DBUF_VERIFY(db);
1822168404Spjd	}
1823168404Spjd}
1824168404Spjd
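/*
 * Sync a dirty indirect dbuf: make sure its contents are cached and its
 * block pointer is wired up, issue its write via dbuf_write(), then sync
 * the dirty children so their writes are issued as children of this zio.
 */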
1825168404Spjdstatic void
1826168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
1827168404Spjd{
1828168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
1829168404Spjd	dnode_t *dn = db->db_dnode;
1830168404Spjd	zio_t *zio;
1831168404Spjd
1832168404Spjd	ASSERT(dmu_tx_is_syncing(tx));
1833168404Spjd
1834168404Spjd	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1835168404Spjd
1836168404Spjd	mutex_enter(&db->db_mtx);
1837168404Spjd
1838168404Spjd	ASSERT(db->db_level > 0);
1839168404Spjd	DBUF_VERIFY(db);
1840168404Spjd
1841168404Spjd	if (db->db_buf == NULL) {
1842168404Spjd		mutex_exit(&db->db_mtx);
1843168404Spjd		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
1844168404Spjd		mutex_enter(&db->db_mtx);
1845168404Spjd	}
1846168404Spjd	ASSERT3U(db->db_state, ==, DB_CACHED);
1847168404Spjd	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
1848168404Spjd	ASSERT(db->db_buf != NULL);
1849168404Spjd
1850168404Spjd	dbuf_check_blkptr(dn, db);
1851168404Spjd
1852168404Spjd	db->db_data_pending = dr;
1853168404Spjd
1854168404Spjd	arc_release(db->db_buf, db);
1855168404Spjd	mutex_exit(&db->db_mtx);
1856168404Spjd
1857168404Spjd	/*
1858168404Spjd	 * XXX -- we should design a compression algorithm
1859168404Spjd	 * that specializes in arrays of bps.
1860168404Spjd	 */
1861168404Spjd	dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
1862168404Spjd	    zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
1863168404Spjd
1864168404Spjd	zio = dr->dr_zio;
1865168404Spjd	mutex_enter(&dr->dt.di.dr_mtx);
1866168404Spjd	dbuf_sync_list(&dr->dt.di.dr_children, tx);
1867168404Spjd	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1868168404Spjd	mutex_exit(&dr->dt.di.dr_mtx);
1869168404Spjd	zio_nowait(zio);
1870168404Spjd}
1871168404Spjd
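/*
 * Sync a dirty level-0 dbuf.  Bonus buffers are simply copied into the
 * dnode phys.  Dirty records already written out via an immediate write
 * (DR_OVERRIDDEN) just have the new block pointer patched in and the
 * write-completion callbacks run by hand.  Everything else is handed to
 * dbuf_write(), after copying the data if the open-txg buffer is still
 * in use.
 */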
1872168404Spjdstatic void
1873168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
1874168404Spjd{
1875168404Spjd	arc_buf_t **datap = &dr->dt.dl.dr_data;
1876168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
1877168404Spjd	dnode_t *dn = db->db_dnode;
1878168404Spjd	objset_impl_t *os = dn->dn_objset;
1879168404Spjd	uint64_t txg = tx->tx_txg;
1880168404Spjd	int checksum, compress;
1881168404Spjd	int blksz;
1882168404Spjd
1883168404Spjd	ASSERT(dmu_tx_is_syncing(tx));
1884168404Spjd
1885168404Spjd	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1886168404Spjd
1887168404Spjd	mutex_enter(&db->db_mtx);
1888168404Spjd	/*
1889168404Spjd	 * To be synced, the dbuf must have been dirtied.  However,
1890168404Spjd	 * it might have been freed after it was dirtied.
1891168404Spjd	 */
1892168404Spjd	if (db->db_state == DB_UNCACHED) {
1893168404Spjd		/* This buffer has been freed since it was dirtied */
1894168404Spjd		ASSERT(db->db.db_data == NULL);
1895168404Spjd	} else if (db->db_state == DB_FILL) {
1896168404Spjd		/* This buffer was freed and is now being re-filled */
1897168404Spjd		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
1898168404Spjd	} else {
1899168404Spjd		ASSERT3U(db->db_state, ==, DB_CACHED);
1900168404Spjd	}
1901168404Spjd	DBUF_VERIFY(db);
1902168404Spjd
1903168404Spjd	/*
1904168404Spjd	 * If this is a bonus buffer, simply copy the bonus data into the
1905168404Spjd	 * dnode.  It will be written out when the dnode is synced (and it
1906168404Spjd	 * will be synced, since it must have been dirty for dbuf_sync to
1907168404Spjd	 * be called).
1908168404Spjd	 */
1909168404Spjd	if (db->db_blkid == DB_BONUS_BLKID) {
1910168404Spjd		dbuf_dirty_record_t **drp;
1911168404Spjd		/*
1912168404Spjd		 * Use dn_phys->dn_bonuslen since db.db_size is the length
1913168404Spjd		 * of the bonus buffer in the open transaction rather than
1914168404Spjd		 * the syncing transaction.
1915168404Spjd		 */
1916168404Spjd		ASSERT(*datap != NULL);
1917168404Spjd		ASSERT3U(db->db_level, ==, 0);
1918168404Spjd		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
1919168404Spjd		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
1920168404Spjd		if (*datap != db->db.db_data)
1921168404Spjd			zio_buf_free(*datap, DN_MAX_BONUSLEN);
1922168404Spjd		db->db_data_pending = NULL;
1923168404Spjd		drp = &db->db_last_dirty;
1924168404Spjd		while (*drp != dr)
1925168404Spjd			drp = &(*drp)->dr_next;
1926168404Spjd		ASSERT((*drp)->dr_next == NULL);
1927168404Spjd		*drp = NULL;
1928168404Spjd		kmem_free(dr, sizeof (dbuf_dirty_record_t));
1929168404Spjd		ASSERT(db->db_dirtycnt > 0);
1930168404Spjd		db->db_dirtycnt -= 1;
1931168404Spjd		mutex_exit(&db->db_mtx);
1932168404Spjd		dbuf_rele(db, (void *)(uintptr_t)txg);
1933168404Spjd		return;
1934168404Spjd	}
1935168404Spjd
1936168404Spjd	/*
1937168404Spjd	 * If this buffer is in the middle of an immediate write,
1938168404Spjd	 * wait for the synchronous IO to complete.
1939168404Spjd	 */
1940168404Spjd	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
1941168404Spjd		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1942168404Spjd		cv_wait(&db->db_changed, &db->db_mtx);
1943168404Spjd		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
1944168404Spjd	}
1945168404Spjd
1946168404Spjd	dbuf_check_blkptr(dn, db);
1947168404Spjd
1948168404Spjd	/*
1949168404Spjd	 * If this dbuf has already been written out via an immediate write,
1950168404Spjd	 * just complete the write by copying over the new block pointer and
1951168404Spjd	 * updating the accounting via the write-completion functions.
1952168404Spjd	 */
1953168404Spjd	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1954168404Spjd		zio_t zio_fake;
1955168404Spjd
1956168404Spjd		zio_fake.io_private = &db;
1957168404Spjd		zio_fake.io_error = 0;
1958168404Spjd		zio_fake.io_bp = db->db_blkptr;
1959168404Spjd		zio_fake.io_bp_orig = *db->db_blkptr;
1960168404Spjd		zio_fake.io_txg = txg;
1961168404Spjd
1962168404Spjd		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
1963168404Spjd		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1964168404Spjd		db->db_data_pending = dr;
1965168404Spjd		dr->dr_zio = &zio_fake;
1966168404Spjd		mutex_exit(&db->db_mtx);
1967168404Spjd
1968168404Spjd		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
1969168404Spjd			dsl_dataset_block_kill(os->os_dsl_dataset,
1970168404Spjd			    &zio_fake.io_bp_orig, dn->dn_zio, tx);
1971168404Spjd
1972168404Spjd		dbuf_write_ready(&zio_fake, db->db_buf, db);
1973168404Spjd		dbuf_write_done(&zio_fake, db->db_buf, db);
1974168404Spjd
1975168404Spjd		return;
1976168404Spjd	}
1977168404Spjd
1978168404Spjd	blksz = arc_buf_size(*datap);
1979168404Spjd
1980168404Spjd	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
1981168404Spjd		/*
1982168404Spjd		 * If this buffer is currently "in use" (i.e., there are
1983168404Spjd		 * active holds and db_data still references it), then make
1984168404Spjd		 * a copy before we start the write so that any modifications
1985168404Spjd		 * from the open txg will not leak into this write.
1986168404Spjd		 *
1987168404Spjd		 * NOTE: this copy does not need to be made for objects only
1988168404Spjd		 * modified in the syncing context (e.g. DMU_META_DNODE_OBJECT blocks).
1989168404Spjd		 */
1990168404Spjd		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
1991168404Spjd			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1992168404Spjd			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
1993168404Spjd			bcopy(db->db.db_data, (*datap)->b_data, blksz);
1994168404Spjd		}
1995168404Spjd	} else {
1996168404Spjd		/*
1997168404Spjd		 * Private object buffers are released here rather
1998168404Spjd		 * than in dbuf_dirty() since they are only modified
1999168404Spjd		 * in the syncing context and we don't want the
2000168404Spjd		 * overhead of making multiple copies of the data.
2001168404Spjd		 */
2002168404Spjd		arc_release(db->db_buf, db);
2003168404Spjd	}
2004168404Spjd
2005168404Spjd	ASSERT(*datap != NULL);
2006168404Spjd	db->db_data_pending = dr;
2007168404Spjd
2008168404Spjd	mutex_exit(&db->db_mtx);
2009168404Spjd
2010168404Spjd	/*
2011168404Spjd	 * Allow dnode settings to override objset settings,
2012168404Spjd	 * except for metadata checksums.
2013168404Spjd	 */
2014168404Spjd	if (dmu_ot[dn->dn_type].ot_metadata) {
2015168404Spjd		checksum = os->os_md_checksum;
2016168404Spjd		compress = zio_compress_select(dn->dn_compress,
2017168404Spjd		    os->os_md_compress);
2018168404Spjd	} else {
2019168404Spjd		checksum = zio_checksum_select(dn->dn_checksum,
2020168404Spjd		    os->os_checksum);
2021168404Spjd		compress = zio_compress_select(dn->dn_compress,
2022168404Spjd		    os->os_compress);
2023168404Spjd	}
2024168404Spjd
2025168404Spjd	dbuf_write(dr, *datap, checksum, compress, tx);
2026168404Spjd
2027168404Spjd	ASSERT(!list_link_active(&dr->dr_dirty_node));
2028168404Spjd	if (dn->dn_object == DMU_META_DNODE_OBJECT)
2029168404Spjd		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2030168404Spjd	else
2031168404Spjd		zio_nowait(dr->dr_zio);
2032168404Spjd}
2033168404Spjd
2034168404Spjdvoid
2035168404Spjddbuf_sync_list(list_t *list, dmu_tx_t *tx)
2036168404Spjd{
2037168404Spjd	dbuf_dirty_record_t *dr;
2038168404Spjd
2039168404Spjd	while ((dr = list_head(list)) != NULL) {
2040168404Spjd		if (dr->dr_zio != NULL) {
2041168404Spjd			/*
2042168404Spjd			 * If we find an already initialized zio then we
2043168404Spjd			 * are processing the meta-dnode, and we have finished.
2044168404Spjd			 * The dbufs for all dnodes are put back on the list
2045168404Spjd			 * during processing, so that we can zio_wait()
2046168404Spjd			 * these IOs after initiating all child IOs.
2047168404Spjd			 */
2048168404Spjd			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2049168404Spjd			    DMU_META_DNODE_OBJECT);
2050168404Spjd			break;
2051168404Spjd		}
2052168404Spjd		list_remove(list, dr);
2053168404Spjd		if (dr->dr_dbuf->db_level > 0)
2054168404Spjd			dbuf_sync_indirect(dr, tx);
2055168404Spjd		else
2056168404Spjd			dbuf_sync_leaf(dr, tx);
2057168404Spjd	}
2058168404Spjd}
2059168404Spjd
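/*
 * Issue the arc_write() for a dirty record.  The write is created as a
 * child of the parent dbuf's zio (or of the dnode's zio when the parent
 * is the dnode itself); the new block pointer is written through
 * db_blkptr, which points into the parent's buffer or the dnode's blkptr
 * array.  If the existing block pointer is older than this txg, the old
 * block is killed first via dsl_dataset_block_kill().
 */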
2060168404Spjdstatic void
2061168404Spjddbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
2062168404Spjd    int compress, dmu_tx_t *tx)
2063168404Spjd{
2064168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
2065168404Spjd	dnode_t *dn = db->db_dnode;
2066168404Spjd	objset_impl_t *os = dn->dn_objset;
2067168404Spjd	dmu_buf_impl_t *parent = db->db_parent;
2068168404Spjd	uint64_t txg = tx->tx_txg;
2069168404Spjd	zbookmark_t zb;
2070168404Spjd	zio_t *zio;
2071168404Spjd	int zio_flags;
2072168404Spjd
2073168404Spjd	if (parent != dn->dn_dbuf) {
2074168404Spjd		ASSERT(parent && parent->db_data_pending);
2075168404Spjd		ASSERT(db->db_level == parent->db_level-1);
2076168404Spjd		ASSERT(arc_released(parent->db_buf));
2077168404Spjd		zio = parent->db_data_pending->dr_zio;
2078168404Spjd	} else {
2079168404Spjd		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
2080168404Spjd		ASSERT3P(db->db_blkptr, ==,
2081168404Spjd		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2082168404Spjd		zio = dn->dn_zio;
2083168404Spjd	}
2084168404Spjd
2085168404Spjd	ASSERT(db->db_level == 0 || data == db->db_buf);
2086168404Spjd	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2087168404Spjd	ASSERT(zio);
2088168404Spjd
2089168404Spjd	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
2090168404Spjd	zb.zb_object = db->db.db_object;
2091168404Spjd	zb.zb_level = db->db_level;
2092168404Spjd	zb.zb_blkid = db->db_blkid;
2093168404Spjd
2094168404Spjd	zio_flags = ZIO_FLAG_MUSTSUCCEED;
2095168404Spjd	if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
2096168404Spjd		zio_flags |= ZIO_FLAG_METADATA;
2097168404Spjd	if (BP_IS_OLDER(db->db_blkptr, txg))
2098168404Spjd		dsl_dataset_block_kill(
2099168404Spjd		    os->os_dsl_dataset, db->db_blkptr, zio, tx);
2100168404Spjd
2101168404Spjd	dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
2102168404Spjd	    dmu_get_replication_level(os, &zb, dn->dn_type), txg,
2103168404Spjd	    db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
2104168404Spjd	    ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
2105168404Spjd}
2106168404Spjd
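/*
 * arc_write() "ready" callback: runs once the new block pointer for this
 * dbuf is known.  It adjusts the dnode's space accounting, computes the
 * bp's blk_fill (the number of non-empty dnodes for a dnode block, 1 for
 * other level-0 blocks, or the sum of the children's fill counts for an
 * indirect), sets the bp's type and level, and updates the dataset's
 * block birth/kill accounting when the block moved.
 */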
2107168404Spjd/* ARGSUSED */
2108168404Spjdstatic void
2109168404Spjddbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2110168404Spjd{
2111168404Spjd	dmu_buf_impl_t *db = vdb;
2112168404Spjd	dnode_t *dn = db->db_dnode;
2113168404Spjd	objset_impl_t *os = dn->dn_objset;
2114168404Spjd	blkptr_t *bp_orig = &zio->io_bp_orig;
2115168404Spjd	uint64_t fill = 0;
2116168404Spjd	int old_size, new_size, i;
2117168404Spjd
2118168404Spjd	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
2119168404Spjd
2120168404Spjd	old_size = bp_get_dasize(os->os_spa, bp_orig);
2121168404Spjd	new_size = bp_get_dasize(os->os_spa, zio->io_bp);
2122168404Spjd
2123168404Spjd	dnode_diduse_space(dn, new_size-old_size);
2124168404Spjd
2125168404Spjd	if (BP_IS_HOLE(zio->io_bp)) {
2126168404Spjd		dsl_dataset_t *ds = os->os_dsl_dataset;
2127168404Spjd		dmu_tx_t *tx = os->os_synctx;
2128168404Spjd
2129168404Spjd		if (bp_orig->blk_birth == tx->tx_txg)
2130168404Spjd			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
2131168404Spjd		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
2132168404Spjd		return;
2133168404Spjd	}
2134168404Spjd
2135168404Spjd	mutex_enter(&db->db_mtx);
2136168404Spjd
2137168404Spjd	if (db->db_level == 0) {
2138168404Spjd		mutex_enter(&dn->dn_mtx);
2139168404Spjd		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
2140168404Spjd			dn->dn_phys->dn_maxblkid = db->db_blkid;
2141168404Spjd		mutex_exit(&dn->dn_mtx);
2142168404Spjd
2143168404Spjd		if (dn->dn_type == DMU_OT_DNODE) {
2144168404Spjd			dnode_phys_t *dnp = db->db.db_data;
2145168404Spjd			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2146168404Spjd			    i--, dnp++) {
2147168404Spjd				if (dnp->dn_type != DMU_OT_NONE)
2148168404Spjd					fill++;
2149168404Spjd			}
2150168404Spjd		} else {
2151168404Spjd			fill = 1;
2152168404Spjd		}
2153168404Spjd	} else {
2154168404Spjd		blkptr_t *bp = db->db.db_data;
2155168404Spjd		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2156168404Spjd		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
2157168404Spjd			if (BP_IS_HOLE(bp))
2158168404Spjd				continue;
2159168404Spjd			ASSERT3U(BP_GET_LSIZE(bp), ==,
2160168404Spjd			    db->db_level == 1 ? dn->dn_datablksz :
2161168404Spjd			    (1<<dn->dn_phys->dn_indblkshift));
2162168404Spjd			fill += bp->blk_fill;
2163168404Spjd		}
2164168404Spjd	}
2165168404Spjd
2166168404Spjd	db->db_blkptr->blk_fill = fill;
2167168404Spjd	BP_SET_TYPE(db->db_blkptr, dn->dn_type);
2168168404Spjd	BP_SET_LEVEL(db->db_blkptr, db->db_level);
2169168404Spjd
2170168404Spjd	mutex_exit(&db->db_mtx);
2171168404Spjd
2172168404Spjd	/* We must do this after we've set the bp's type and level */
2173168404Spjd	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
2174168404Spjd		dsl_dataset_t *ds = os->os_dsl_dataset;
2175168404Spjd		dmu_tx_t *tx = os->os_synctx;
2176168404Spjd
2177168404Spjd		if (bp_orig->blk_birth == tx->tx_txg)
2178168404Spjd			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
2179168404Spjd		dsl_dataset_block_born(ds, zio->io_bp, tx);
2180168404Spjd	}
2181168404Spjd}
2182168404Spjd
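/*
 * arc_write() "done" callback: runs when the write has completed.  It
 * unlinks and frees the dirty record, installs dbuf_do_evict() as the
 * ARC callback for buffers that now have an on-disk identity, wakes any
 * threads waiting on db_changed, and drops the per-txg hold on the dbuf.
 */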
2183168404Spjd/* ARGSUSED */
2184168404Spjdstatic void
2185168404Spjddbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2186168404Spjd{
2187168404Spjd	dmu_buf_impl_t *db = vdb;
2188168404Spjd	uint64_t txg = zio->io_txg;
2189168404Spjd	dbuf_dirty_record_t **drp, *dr;
2190168404Spjd
2191168404Spjd	ASSERT3U(zio->io_error, ==, 0);
2192168404Spjd
2193168404Spjd	mutex_enter(&db->db_mtx);
2194168404Spjd
2195168404Spjd	drp = &db->db_last_dirty;
2196168404Spjd	while (*drp != db->db_data_pending)
2197168404Spjd		drp = &(*drp)->dr_next;
2198168404Spjd	ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
2199168404Spjd	ASSERT((*drp)->dr_txg == txg);
2200168404Spjd	ASSERT((*drp)->dr_next == NULL);
2201168404Spjd	dr = *drp;
2202168404Spjd	*drp = NULL;
2203168404Spjd
2204168404Spjd	if (db->db_level == 0) {
2205168404Spjd		ASSERT(db->db_blkid != DB_BONUS_BLKID);
2206168404Spjd		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2207168404Spjd
2208168404Spjd		if (dr->dt.dl.dr_data != db->db_buf)
2209168404Spjd			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
2210168404Spjd		else if (!BP_IS_HOLE(db->db_blkptr))
2211168404Spjd			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2212168404Spjd		else
2213168404Spjd			ASSERT(arc_released(db->db_buf));
2214168404Spjd	} else {
2215168404Spjd		dnode_t *dn = db->db_dnode;
2216168404Spjd
2217168404Spjd		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2218168404Spjd		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2219168404Spjd		if (!BP_IS_HOLE(db->db_blkptr)) {
2220168404Spjd			int epbs =
2221168404Spjd			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2222168404Spjd			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2223168404Spjd			    db->db.db_size);
2224168404Spjd			ASSERT3U(dn->dn_phys->dn_maxblkid
2225168404Spjd			    >> (db->db_level * epbs), >=, db->db_blkid);
2226168404Spjd			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2227168404Spjd		}
2228168404Spjd	}
2229168404Spjd	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2230168404Spjd
2231168404Spjd	cv_broadcast(&db->db_changed);
2232168404Spjd	ASSERT(db->db_dirtycnt > 0);
2233168404Spjd	db->db_dirtycnt -= 1;
2234168404Spjd	db->db_data_pending = NULL;
2235168404Spjd	mutex_exit(&db->db_mtx);
2236168404Spjd
2237168404Spjd	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
2238168404Spjd
2239168404Spjd	dbuf_rele(db, (void *)(uintptr_t)txg);
2240168404Spjd}