dbuf.c revision 249195
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/dmu.h>
29#include <sys/dmu_impl.h>
30#include <sys/dbuf.h>
31#include <sys/dmu_objset.h>
32#include <sys/dsl_dataset.h>
33#include <sys/dsl_dir.h>
34#include <sys/dmu_tx.h>
35#include <sys/spa.h>
36#include <sys/zio.h>
37#include <sys/dmu_zfetch.h>
38#include <sys/sa.h>
39#include <sys/sa_impl.h>
40
41static void dbuf_destroy(dmu_buf_impl_t *db);
42static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
43static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
44
45/*
46 * Global data structures and functions for the dbuf cache.
47 */
48static kmem_cache_t *dbuf_cache;
49
50/* ARGSUSED */
51static int
52dbuf_cons(void *vdb, void *unused, int kmflag)
53{
54	dmu_buf_impl_t *db = vdb;
55	bzero(db, sizeof (dmu_buf_impl_t));
56
57	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
58	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
59	refcount_create(&db->db_holds);
60	return (0);
61}
62
63/* ARGSUSED */
64static void
65dbuf_dest(void *vdb, void *unused)
66{
67	dmu_buf_impl_t *db = vdb;
68	mutex_destroy(&db->db_mtx);
69	cv_destroy(&db->db_changed);
70	refcount_destroy(&db->db_holds);
71}
72
73/*
74 * dbuf hash table routines
75 */
76static dbuf_hash_table_t dbuf_hash_table;
77
78static uint64_t dbuf_hash_count;
79
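/*
 * Hash the (objset, object, level, blkid) tuple into a 64-bit value using
 * the ZFS CRC-64 table.  The caller masks the result with hash_table_mask
 * to select a bucket.
 */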
80static uint64_t
81dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
82{
83	uintptr_t osv = (uintptr_t)os;
84	uint64_t crc = -1ULL;
85
86	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
87	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
88	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
89	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
90	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
91	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
92	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
93
94	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
95
96	return (crc);
97}
98
99#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
100
101#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
102	((dbuf)->db.db_object == (obj) &&		\
103	(dbuf)->db_objset == (os) &&			\
104	(dbuf)->db_level == (level) &&			\
105	(dbuf)->db_blkid == (blkid))
106
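/*
 * Look up a dbuf in the hash table by (objset, object, level, blkid).
 * Returns the dbuf with db_mtx held, or NULL if no matching dbuf is
 * found (dbufs in the DB_EVICTING state are skipped).
 */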
107dmu_buf_impl_t *
108dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
109{
110	dbuf_hash_table_t *h = &dbuf_hash_table;
111	objset_t *os = dn->dn_objset;
112	uint64_t obj = dn->dn_object;
113	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
114	uint64_t idx = hv & h->hash_table_mask;
115	dmu_buf_impl_t *db;
116
117	mutex_enter(DBUF_HASH_MUTEX(h, idx));
118	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
119		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
120			mutex_enter(&db->db_mtx);
121			if (db->db_state != DB_EVICTING) {
122				mutex_exit(DBUF_HASH_MUTEX(h, idx));
123				return (db);
124			}
125			mutex_exit(&db->db_mtx);
126		}
127	}
128	mutex_exit(DBUF_HASH_MUTEX(h, idx));
129	return (NULL);
130}
131
132/*
133 * Insert an entry into the hash table.  If there is already an element
134 * equal to elem in the hash table, then the already existing element
135 * will be returned and the new element will not be inserted.
136 * Otherwise returns NULL.
137 */
138static dmu_buf_impl_t *
139dbuf_hash_insert(dmu_buf_impl_t *db)
140{
141	dbuf_hash_table_t *h = &dbuf_hash_table;
142	objset_t *os = db->db_objset;
143	uint64_t obj = db->db.db_object;
144	int level = db->db_level;
145	uint64_t blkid = db->db_blkid;
146	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
147	uint64_t idx = hv & h->hash_table_mask;
148	dmu_buf_impl_t *dbf;
149
150	mutex_enter(DBUF_HASH_MUTEX(h, idx));
151	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
152		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
153			mutex_enter(&dbf->db_mtx);
154			if (dbf->db_state != DB_EVICTING) {
155				mutex_exit(DBUF_HASH_MUTEX(h, idx));
156				return (dbf);
157			}
158			mutex_exit(&dbf->db_mtx);
159		}
160	}
161
162	mutex_enter(&db->db_mtx);
163	db->db_hash_next = h->hash_table[idx];
164	h->hash_table[idx] = db;
165	mutex_exit(DBUF_HASH_MUTEX(h, idx));
166	atomic_add_64(&dbuf_hash_count, 1);
167
168	return (NULL);
169}
170
171/*
172 * Remove an entry from the hash table.  This operation will
173 * fail if there are any existing holds on the db.
174 */
175static void
176dbuf_hash_remove(dmu_buf_impl_t *db)
177{
178	dbuf_hash_table_t *h = &dbuf_hash_table;
179	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
180	    db->db_level, db->db_blkid);
181	uint64_t idx = hv & h->hash_table_mask;
182	dmu_buf_impl_t *dbf, **dbp;
183
184	/*
185	 * We mustn't hold db_mtx to maintain lock ordering:
186	 * DBUF_HASH_MUTEX > db_mtx.
187	 */
188	ASSERT(refcount_is_zero(&db->db_holds));
189	ASSERT(db->db_state == DB_EVICTING);
190	ASSERT(!MUTEX_HELD(&db->db_mtx));
191
192	mutex_enter(DBUF_HASH_MUTEX(h, idx));
193	dbp = &h->hash_table[idx];
194	while ((dbf = *dbp) != db) {
195		dbp = &dbf->db_hash_next;
196		ASSERT(dbf != NULL);
197	}
198	*dbp = db->db_hash_next;
199	db->db_hash_next = NULL;
200	mutex_exit(DBUF_HASH_MUTEX(h, idx));
201	atomic_add_64(&dbuf_hash_count, -1);
202}
203
204static arc_evict_func_t dbuf_do_evict;
205
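/*
 * Invoke the user's eviction callback, if one is registered on this
 * level-0 dbuf, and clear the associated user state.  Called with
 * db_mtx held.
 */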
206static void
207dbuf_evict_user(dmu_buf_impl_t *db)
208{
209	ASSERT(MUTEX_HELD(&db->db_mtx));
210
211	if (db->db_level != 0 || db->db_evict_func == NULL)
212		return;
213
214	if (db->db_user_data_ptr_ptr)
215		*db->db_user_data_ptr_ptr = db->db.db_data;
216	db->db_evict_func(&db->db, db->db_user_ptr);
217	db->db_user_ptr = NULL;
218	db->db_user_data_ptr_ptr = NULL;
219	db->db_evict_func = NULL;
220}
221
222boolean_t
223dbuf_is_metadata(dmu_buf_impl_t *db)
224{
225	if (db->db_level > 0) {
226		return (B_TRUE);
227	} else {
228		boolean_t is_metadata;
229
230		DB_DNODE_ENTER(db);
231		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
232		DB_DNODE_EXIT(db);
233
234		return (is_metadata);
235	}
236}
237
238void
239dbuf_evict(dmu_buf_impl_t *db)
240{
241	ASSERT(MUTEX_HELD(&db->db_mtx));
242	ASSERT(db->db_buf == NULL);
243	ASSERT(db->db_data_pending == NULL);
244
245	dbuf_clear(db);
246	dbuf_destroy(db);
247}
248
249void
250dbuf_init(void)
251{
252	uint64_t hsize = 1ULL << 16;
253	dbuf_hash_table_t *h = &dbuf_hash_table;
254	int i;
255
256	/*
257	 * The hash table is big enough to fill all of physical memory
258	 * with an average 4K block size.  The table will take up
259	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
260	 */
261	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
262		hsize <<= 1;
263
264retry:
265	h->hash_table_mask = hsize - 1;
266	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
267	if (h->hash_table == NULL) {
268		/* XXX - we should really return an error instead of assert */
269		ASSERT(hsize > (1ULL << 10));
270		hsize >>= 1;
271		goto retry;
272	}
273
274	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
275	    sizeof (dmu_buf_impl_t),
276	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
277
278	for (i = 0; i < DBUF_MUTEXES; i++)
279		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
280}
281
282void
283dbuf_fini(void)
284{
285	dbuf_hash_table_t *h = &dbuf_hash_table;
286	int i;
287
288	for (i = 0; i < DBUF_MUTEXES; i++)
289		mutex_destroy(&h->hash_mutexes[i]);
290	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
291	kmem_cache_destroy(dbuf_cache);
292}
293
294/*
295 * Other stuff.
296 */
297
298#ifdef ZFS_DEBUG
299static void
300dbuf_verify(dmu_buf_impl_t *db)
301{
302	dnode_t *dn;
303	dbuf_dirty_record_t *dr;
304
305	ASSERT(MUTEX_HELD(&db->db_mtx));
306
307	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
308		return;
309
310	ASSERT(db->db_objset != NULL);
311	DB_DNODE_ENTER(db);
312	dn = DB_DNODE(db);
313	if (dn == NULL) {
314		ASSERT(db->db_parent == NULL);
315		ASSERT(db->db_blkptr == NULL);
316	} else {
317		ASSERT3U(db->db.db_object, ==, dn->dn_object);
318		ASSERT3P(db->db_objset, ==, dn->dn_objset);
319		ASSERT3U(db->db_level, <, dn->dn_nlevels);
320		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
321		    db->db_blkid == DMU_SPILL_BLKID ||
322		    !list_is_empty(&dn->dn_dbufs));
323	}
324	if (db->db_blkid == DMU_BONUS_BLKID) {
325		ASSERT(dn != NULL);
326		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
327		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
328	} else if (db->db_blkid == DMU_SPILL_BLKID) {
329		ASSERT(dn != NULL);
330		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
331		ASSERT0(db->db.db_offset);
332	} else {
333		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
334	}
335
336	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
337		ASSERT(dr->dr_dbuf == db);
338
339	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
340		ASSERT(dr->dr_dbuf == db);
341
342	/*
343	 * We can't assert that db_size matches dn_datablksz because it
344	 * can be momentarily different when another thread is doing
345	 * dnode_set_blksz().
346	 */
347	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
348		dr = db->db_data_pending;
349		/*
350		 * It should only be modified in syncing context, so
351		 * make sure we only have one copy of the data.
352		 */
353		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
354	}
355
356	/* verify db->db_blkptr */
357	if (db->db_blkptr) {
358		if (db->db_parent == dn->dn_dbuf) {
359			/* db is pointed to by the dnode */
360			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
361			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
362				ASSERT(db->db_parent == NULL);
363			else
364				ASSERT(db->db_parent != NULL);
365			if (db->db_blkid != DMU_SPILL_BLKID)
366				ASSERT3P(db->db_blkptr, ==,
367				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
368		} else {
369			/* db is pointed to by an indirect block */
370			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
371			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
372			ASSERT3U(db->db_parent->db.db_object, ==,
373			    db->db.db_object);
374			/*
375			 * dnode_grow_indblksz() can make this fail if we don't
376			 * have the struct_rwlock.  XXX indblksz no longer
377			 * grows.  safe to do this now?
378			 */
379			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
380				ASSERT3P(db->db_blkptr, ==,
381				    ((blkptr_t *)db->db_parent->db.db_data +
382				    db->db_blkid % epb));
383			}
384		}
385	}
386	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
387	    (db->db_buf == NULL || db->db_buf->b_data) &&
388	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
389	    db->db_state != DB_FILL && !dn->dn_free_txg) {
390		/*
391		 * If the blkptr isn't set but they have nonzero data,
392		 * it had better be dirty, otherwise we'll lose that
393		 * data when we evict this buffer.
394		 */
395		if (db->db_dirtycnt == 0) {
396			uint64_t *buf = db->db.db_data;
397			int i;
398
399			for (i = 0; i < db->db.db_size >> 3; i++) {
400				ASSERT(buf[i] == 0);
401			}
402		}
403	}
404	DB_DNODE_EXIT(db);
405}
406#endif
407
408static void
409dbuf_update_data(dmu_buf_impl_t *db)
410{
411	ASSERT(MUTEX_HELD(&db->db_mtx));
412	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
413		ASSERT(!refcount_is_zero(&db->db_holds));
414		*db->db_user_data_ptr_ptr = db->db.db_data;
415	}
416}
417
418static void
419dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
420{
421	ASSERT(MUTEX_HELD(&db->db_mtx));
422	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
423	db->db_buf = buf;
424	if (buf != NULL) {
425		ASSERT(buf->b_data != NULL);
426		db->db.db_data = buf->b_data;
427		if (!arc_released(buf))
428			arc_set_callback(buf, dbuf_do_evict, db);
429		dbuf_update_data(db);
430	} else {
431		dbuf_evict_user(db);
432		db->db.db_data = NULL;
433		if (db->db_state != DB_NOFILL)
434			db->db_state = DB_UNCACHED;
435	}
436}
437
438/*
439 * Loan out an arc_buf for read.  Return the loaned arc_buf.
440 */
441arc_buf_t *
442dbuf_loan_arcbuf(dmu_buf_impl_t *db)
443{
444	arc_buf_t *abuf;
445
446	mutex_enter(&db->db_mtx);
447	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
448		int blksz = db->db.db_size;
449		spa_t *spa;
450
451		mutex_exit(&db->db_mtx);
452		DB_GET_SPA(&spa, db);
453		abuf = arc_loan_buf(spa, blksz);
454		bcopy(db->db.db_data, abuf->b_data, blksz);
455	} else {
456		abuf = db->db_buf;
457		arc_loan_inuse_buf(abuf, db);
458		dbuf_set_data(db, NULL);
459		mutex_exit(&db->db_mtx);
460	}
461	return (abuf);
462}
463
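/*
 * Return the block number within the dnode that contains the given
 * byte offset.
 */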
464uint64_t
465dbuf_whichblock(dnode_t *dn, uint64_t offset)
466{
467	if (dn->dn_datablkshift) {
468		return (offset >> dn->dn_datablkshift);
469	} else {
470		ASSERT3U(offset, <, dn->dn_datablksz);
471		return (0);
472	}
473}
474
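/*
 * ARC read callback.  On success (or if the block was freed while the
 * read was in flight) attach the buffer and mark the dbuf DB_CACHED;
 * on error drop the buffer and return to DB_UNCACHED.  In either case,
 * wake waiters and release the hold taken for the read.
 */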
475static void
476dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
477{
478	dmu_buf_impl_t *db = vdb;
479
480	mutex_enter(&db->db_mtx);
481	ASSERT3U(db->db_state, ==, DB_READ);
482	/*
483	 * All reads are synchronous, so we must have a hold on the dbuf
484	 */
485	ASSERT(refcount_count(&db->db_holds) > 0);
486	ASSERT(db->db_buf == NULL);
487	ASSERT(db->db.db_data == NULL);
488	if (db->db_level == 0 && db->db_freed_in_flight) {
489		/* we were freed in flight; disregard any error */
490		arc_release(buf, db);
491		bzero(buf->b_data, db->db.db_size);
492		arc_buf_freeze(buf);
493		db->db_freed_in_flight = FALSE;
494		dbuf_set_data(db, buf);
495		db->db_state = DB_CACHED;
496	} else if (zio == NULL || zio->io_error == 0) {
497		dbuf_set_data(db, buf);
498		db->db_state = DB_CACHED;
499	} else {
500		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
501		ASSERT3P(db->db_buf, ==, NULL);
502		VERIFY(arc_buf_remove_ref(buf, db));
503		db->db_state = DB_UNCACHED;
504	}
505	cv_broadcast(&db->db_changed);
506	dbuf_rele_and_unlock(db, NULL);
507}
508
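/*
 * Fill in the dbuf's contents.  Bonus buffers are copied directly from
 * the dnode phys, holes produce a zero-filled buffer, and anything else
 * is read asynchronously through the ARC.  Called with db_mtx held;
 * the mutex is dropped before returning.
 */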
509static void
510dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
511{
512	dnode_t *dn;
513	spa_t *spa;
514	zbookmark_t zb;
515	uint32_t aflags = ARC_NOWAIT;
516
517	DB_DNODE_ENTER(db);
518	dn = DB_DNODE(db);
519	ASSERT(!refcount_is_zero(&db->db_holds));
520	/* We need the struct_rwlock to prevent db_blkptr from changing. */
521	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
522	ASSERT(MUTEX_HELD(&db->db_mtx));
523	ASSERT(db->db_state == DB_UNCACHED);
524	ASSERT(db->db_buf == NULL);
525
526	if (db->db_blkid == DMU_BONUS_BLKID) {
527		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
528
529		ASSERT3U(bonuslen, <=, db->db.db_size);
530		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
531		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
532		if (bonuslen < DN_MAX_BONUSLEN)
533			bzero(db->db.db_data, DN_MAX_BONUSLEN);
534		if (bonuslen)
535			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
536		DB_DNODE_EXIT(db);
537		dbuf_update_data(db);
538		db->db_state = DB_CACHED;
539		mutex_exit(&db->db_mtx);
540		return;
541	}
542
543	/*
544	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
545	 * processes the delete record and clears the bp while we are waiting
546	 * for the dn_mtx (resulting in a "no" from block_freed).
547	 */
548	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
549	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
550	    BP_IS_HOLE(db->db_blkptr)))) {
551		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
552
553		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
554		    db->db.db_size, db, type));
555		DB_DNODE_EXIT(db);
556		bzero(db->db.db_data, db->db.db_size);
557		db->db_state = DB_CACHED;
558		*flags |= DB_RF_CACHED;
559		mutex_exit(&db->db_mtx);
560		return;
561	}
562
563	spa = dn->dn_objset->os_spa;
564	DB_DNODE_EXIT(db);
565
566	db->db_state = DB_READ;
567	mutex_exit(&db->db_mtx);
568
569	if (DBUF_IS_L2CACHEABLE(db))
570		aflags |= ARC_L2CACHE;
571
572	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
573	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
574	    db->db.db_object, db->db_level, db->db_blkid);
575
576	dbuf_add_ref(db, NULL);
577
578	(void) arc_read(zio, spa, db->db_blkptr,
579	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
580	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
581	    &aflags, &zb);
582	if (aflags & ARC_CACHED)
583		*flags |= DB_RF_CACHED;
584}
585
586int
587dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
588{
589	int err = 0;
590	int havepzio = (zio != NULL);
591	int prefetch;
592	dnode_t *dn;
593
594	/*
595	 * We don't have to hold the mutex to check db_state because it
596	 * can't be freed while we have a hold on the buffer.
597	 */
598	ASSERT(!refcount_is_zero(&db->db_holds));
599
600	if (db->db_state == DB_NOFILL)
601		return (SET_ERROR(EIO));
602
603	DB_DNODE_ENTER(db);
604	dn = DB_DNODE(db);
605	if ((flags & DB_RF_HAVESTRUCT) == 0)
606		rw_enter(&dn->dn_struct_rwlock, RW_READER);
607
608	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
609	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
610	    DBUF_IS_CACHEABLE(db);
611
612	mutex_enter(&db->db_mtx);
613	if (db->db_state == DB_CACHED) {
614		mutex_exit(&db->db_mtx);
615		if (prefetch)
616			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
617			    db->db.db_size, TRUE);
618		if ((flags & DB_RF_HAVESTRUCT) == 0)
619			rw_exit(&dn->dn_struct_rwlock);
620		DB_DNODE_EXIT(db);
621	} else if (db->db_state == DB_UNCACHED) {
622		spa_t *spa = dn->dn_objset->os_spa;
623
624		if (zio == NULL)
625			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
626		dbuf_read_impl(db, zio, &flags);
627
628		/* dbuf_read_impl has dropped db_mtx for us */
629
630		if (prefetch)
631			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
632			    db->db.db_size, flags & DB_RF_CACHED);
633
634		if ((flags & DB_RF_HAVESTRUCT) == 0)
635			rw_exit(&dn->dn_struct_rwlock);
636		DB_DNODE_EXIT(db);
637
638		if (!havepzio)
639			err = zio_wait(zio);
640	} else {
641		mutex_exit(&db->db_mtx);
642		if (prefetch)
643			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
644			    db->db.db_size, TRUE);
645		if ((flags & DB_RF_HAVESTRUCT) == 0)
646			rw_exit(&dn->dn_struct_rwlock);
647		DB_DNODE_EXIT(db);
648
649		mutex_enter(&db->db_mtx);
650		if ((flags & DB_RF_NEVERWAIT) == 0) {
651			while (db->db_state == DB_READ ||
652			    db->db_state == DB_FILL) {
653				ASSERT(db->db_state == DB_READ ||
654				    (flags & DB_RF_HAVESTRUCT) == 0);
655				cv_wait(&db->db_changed, &db->db_mtx);
656			}
657			if (db->db_state == DB_UNCACHED)
658				err = SET_ERROR(EIO);
659		}
660		mutex_exit(&db->db_mtx);
661	}
662
663	ASSERT(err || havepzio || db->db_state == DB_CACHED);
664	return (err);
665}
666
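/*
 * Prepare the dbuf to be entirely overwritten without reading the old
 * contents: if it is uncached, attach a freshly allocated buffer and
 * move it to the DB_FILL state.
 */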
667static void
668dbuf_noread(dmu_buf_impl_t *db)
669{
670	ASSERT(!refcount_is_zero(&db->db_holds));
671	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
672	mutex_enter(&db->db_mtx);
673	while (db->db_state == DB_READ || db->db_state == DB_FILL)
674		cv_wait(&db->db_changed, &db->db_mtx);
675	if (db->db_state == DB_UNCACHED) {
676		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
677		spa_t *spa;
678
679		ASSERT(db->db_buf == NULL);
680		ASSERT(db->db.db_data == NULL);
681		DB_GET_SPA(&spa, db);
682		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
683		db->db_state = DB_FILL;
684	} else if (db->db_state == DB_NOFILL) {
685		dbuf_set_data(db, NULL);
686	} else {
687		ASSERT3U(db->db_state, ==, DB_CACHED);
688	}
689	mutex_exit(&db->db_mtx);
690}
691
692/*
693 * This is our just-in-time copy function.  It makes a copy of
694 * buffers, that have been modified in a previous transaction
695 * group, before we modify them in the current active group.
696 *
697 * This function is used in two places: when we are dirtying a
698 * buffer for the first time in a txg, and when we are freeing
699 * a range in a dnode that includes this buffer.
700 *
701 * Note that when we are called from dbuf_free_range() we do
702 * not put a hold on the buffer, we just traverse the active
703 * dbuf list for the dnode.
704 */
705static void
706dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
707{
708	dbuf_dirty_record_t *dr = db->db_last_dirty;
709
710	ASSERT(MUTEX_HELD(&db->db_mtx));
711	ASSERT(db->db.db_data != NULL);
712	ASSERT(db->db_level == 0);
713	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
714
715	if (dr == NULL ||
716	    (dr->dt.dl.dr_data !=
717	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
718		return;
719
720	/*
721	 * If the last dirty record for this dbuf has not yet synced
722	 * and it's referencing the dbuf data, either:
723	 *	reset the reference to point to a new copy,
724	 * or (if there are no active holders)
725	 *	just null out the current db_data pointer.
726	 */
727	ASSERT(dr->dr_txg >= txg - 2);
728	if (db->db_blkid == DMU_BONUS_BLKID) {
729		/* Note that the data bufs here are zio_bufs */
730		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
731		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
732		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
733	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
734		int size = db->db.db_size;
735		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
736		spa_t *spa;
737
738		DB_GET_SPA(&spa, db);
739		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
740		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
741	} else {
742		dbuf_set_data(db, NULL);
743	}
744}
745
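/*
 * Undo a dmu_sync()-style override of this dirty record: free the
 * already-written block (unless it was a nopwrite), reset the override
 * state, and release the buffer so it remains in a normal dirty state.
 */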
746void
747dbuf_unoverride(dbuf_dirty_record_t *dr)
748{
749	dmu_buf_impl_t *db = dr->dr_dbuf;
750	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
751	uint64_t txg = dr->dr_txg;
752
753	ASSERT(MUTEX_HELD(&db->db_mtx));
754	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
755	ASSERT(db->db_level == 0);
756
757	if (db->db_blkid == DMU_BONUS_BLKID ||
758	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
759		return;
760
761	ASSERT(db->db_data_pending != dr);
762
763	/* free this block */
764	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
765		spa_t *spa;
766
767		DB_GET_SPA(&spa, db);
768		zio_free(spa, txg, bp);
769	}
770	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
771	dr->dt.dl.dr_nopwrite = B_FALSE;
772
773	/*
774	 * Release the already-written buffer, so we leave it in
775	 * a consistent dirty state.  Note that all callers are
776	 * modifying the buffer, so they will immediately do
777	 * another (redundant) arc_release().  Therefore, leave
778	 * the buf thawed to save the effort of freezing &
779	 * immediately re-thawing it.
780	 */
781	arc_release(dr->dt.dl.dr_data, db);
782}
783
784/*
785 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
786 * data blocks in the free range, so that any future readers will find
787 * empty blocks.  Also, if we happen across any level-1 dbufs in the
788 * range that have not already been marked dirty, mark them dirty so
789 * they stay in memory.
790 */
791void
792dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
793{
794	dmu_buf_impl_t *db, *db_next;
795	uint64_t txg = tx->tx_txg;
796	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
797	uint64_t first_l1 = start >> epbs;
798	uint64_t last_l1 = end >> epbs;
799
800	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
801		end = dn->dn_maxblkid;
802		last_l1 = end >> epbs;
803	}
804	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
805	mutex_enter(&dn->dn_dbufs_mtx);
806	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
807		db_next = list_next(&dn->dn_dbufs, db);
808		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
809
810		if (db->db_level == 1 &&
811		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
812			mutex_enter(&db->db_mtx);
813			if (db->db_last_dirty &&
814			    db->db_last_dirty->dr_txg < txg) {
815				dbuf_add_ref(db, FTAG);
816				mutex_exit(&db->db_mtx);
817				dbuf_will_dirty(db, tx);
818				dbuf_rele(db, FTAG);
819			} else {
820				mutex_exit(&db->db_mtx);
821			}
822		}
823
824		if (db->db_level != 0)
825			continue;
826		dprintf_dbuf(db, "found buf %s\n", "");
827		if (db->db_blkid < start || db->db_blkid > end)
828			continue;
829
830		/* found a level 0 buffer in the range */
831		mutex_enter(&db->db_mtx);
832		if (dbuf_undirty(db, tx)) {
833			/* mutex has been dropped and dbuf destroyed */
834			continue;
835		}
836
837		if (db->db_state == DB_UNCACHED ||
838		    db->db_state == DB_NOFILL ||
839		    db->db_state == DB_EVICTING) {
840			ASSERT(db->db.db_data == NULL);
841			mutex_exit(&db->db_mtx);
842			continue;
843		}
844		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
845			/* will be handled in dbuf_read_done or dbuf_rele */
846			db->db_freed_in_flight = TRUE;
847			mutex_exit(&db->db_mtx);
848			continue;
849		}
850		if (refcount_count(&db->db_holds) == 0) {
851			ASSERT(db->db_buf);
852			dbuf_clear(db);
853			continue;
854		}
855		/* The dbuf is referenced */
856
857		if (db->db_last_dirty != NULL) {
858			dbuf_dirty_record_t *dr = db->db_last_dirty;
859
860			if (dr->dr_txg == txg) {
861				/*
862				 * This buffer is "in-use", re-adjust the file
863				 * size to reflect that this buffer may
864				 * contain new data when we sync.
865				 */
866				if (db->db_blkid != DMU_SPILL_BLKID &&
867				    db->db_blkid > dn->dn_maxblkid)
868					dn->dn_maxblkid = db->db_blkid;
869				dbuf_unoverride(dr);
870			} else {
871				/*
872				 * This dbuf is not dirty in the open context.
873				 * Either uncache it (if it's not referenced in
874				 * the open context) or reset its contents to
875				 * empty.
876				 */
877				dbuf_fix_old_data(db, txg);
878			}
879		}
880		/* clear the contents if it's cached */
881		if (db->db_state == DB_CACHED) {
882			ASSERT(db->db.db_data != NULL);
883			arc_release(db->db_buf, db);
884			bzero(db->db.db_data, db->db.db_size);
885			arc_buf_freeze(db->db_buf);
886		}
887
888		mutex_exit(&db->db_mtx);
889	}
890	mutex_exit(&dn->dn_dbufs_mtx);
891}
892
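/*
 * Return TRUE if freeing the block currently referenced by this dbuf
 * would actually release space, i.e. the block (or pending dirty data)
 * was born after the dataset's most recent snapshot.
 */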
893static int
894dbuf_block_freeable(dmu_buf_impl_t *db)
895{
896	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
897	uint64_t birth_txg = 0;
898
899	/*
900	 * We don't need any locking to protect db_blkptr:
901	 * If it's syncing, then db_last_dirty will be set
902	 * so we'll ignore db_blkptr.
903	 */
904	ASSERT(MUTEX_HELD(&db->db_mtx));
905	if (db->db_last_dirty)
906		birth_txg = db->db_last_dirty->dr_txg;
907	else if (db->db_blkptr)
908		birth_txg = db->db_blkptr->blk_birth;
909
910	/*
911	 * If we don't exist or are in a snapshot, we can't be freed.
912	 * Don't pass the bp to dsl_dataset_block_freeable() since we
913	 * are holding the db_mtx lock and might deadlock if we are
914	 * prefetching a dedup-ed block.
915	 */
916	if (birth_txg)
917		return (ds == NULL ||
918		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
919	else
920		return (FALSE);
921}
922
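/*
 * Change the size of this dbuf's data buffer: dirty it, allocate a new
 * buffer of the requested size, copy the old contents, and zero any
 * remainder.
 */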
923void
924dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
925{
926	arc_buf_t *buf, *obuf;
927	int osize = db->db.db_size;
928	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
929	dnode_t *dn;
930
931	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
932
933	DB_DNODE_ENTER(db);
934	dn = DB_DNODE(db);
935
936	/* XXX does *this* func really need the lock? */
937	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
938
939	/*
940	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
941	 * is OK, because there can be no other references to the db
942	 * when we are changing its size, so no concurrent DB_FILL can
943	 * be happening.
944	 */
945	/*
946	 * XXX we should be doing a dbuf_read, checking the return
947	 * value and returning that up to our callers
948	 */
949	dbuf_will_dirty(db, tx);
950
951	/* create the data buffer for the new block */
952	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
953
954	/* copy old block data to the new block */
955	obuf = db->db_buf;
956	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
957	/* zero the remainder */
958	if (size > osize)
959		bzero((uint8_t *)buf->b_data + osize, size - osize);
960
961	mutex_enter(&db->db_mtx);
962	dbuf_set_data(db, buf);
963	VERIFY(arc_buf_remove_ref(obuf, db));
964	db->db.db_size = size;
965
966	if (db->db_level == 0) {
967		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
968		db->db_last_dirty->dt.dl.dr_data = buf;
969	}
970	mutex_exit(&db->db_mtx);
971
972	dnode_willuse_space(dn, size-osize, tx);
973	DB_DNODE_EXIT(db);
974}
975
976void
977dbuf_release_bp(dmu_buf_impl_t *db)
978{
979	objset_t *os;
980
981	DB_GET_OBJSET(&os, db);
982	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
983	ASSERT(arc_released(os->os_phys_buf) ||
984	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
985	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
986
987	(void) arc_release(db->db_buf, db);
988}
989
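/*
 * Mark this dbuf dirty in the given transaction.  Creates (or returns
 * the existing) dirty record for this txg, preserves data still dirty
 * in an older txg by copying it, links the record into the parent's
 * (or dnode's) dirty list, and marks the dnode dirty.
 */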
990dbuf_dirty_record_t *
991dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
992{
993	dnode_t *dn;
994	objset_t *os;
995	dbuf_dirty_record_t **drp, *dr;
996	int drop_struct_lock = FALSE;
997	boolean_t do_free_accounting = B_FALSE;
998	int txgoff = tx->tx_txg & TXG_MASK;
999
1000	ASSERT(tx->tx_txg != 0);
1001	ASSERT(!refcount_is_zero(&db->db_holds));
1002	DMU_TX_DIRTY_BUF(tx, db);
1003
1004	DB_DNODE_ENTER(db);
1005	dn = DB_DNODE(db);
1006	/*
1007	 * Shouldn't dirty a regular buffer in syncing context.  Private
1008	 * objects may be dirtied in syncing context, but only if they
1009	 * were already pre-dirtied in open context.
1010	 */
1011	ASSERT(!dmu_tx_is_syncing(tx) ||
1012	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1013	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1014	    dn->dn_objset->os_dsl_dataset == NULL);
1015	/*
1016	 * We make this assert for private objects as well, but after we
1017	 * check if we're already dirty.  They are allowed to re-dirty
1018	 * in syncing context.
1019	 */
1020	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1021	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1022	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1023
1024	mutex_enter(&db->db_mtx);
1025	/*
1026	 * XXX make this true for indirects too?  The problem is that
1027	 * transactions created with dmu_tx_create_assigned() from
1028	 * syncing context don't bother holding ahead.
1029	 */
1030	ASSERT(db->db_level != 0 ||
1031	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1032	    db->db_state == DB_NOFILL);
1033
1034	mutex_enter(&dn->dn_mtx);
1035	/*
1036	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1037	 * initialize the objset.
1038	 */
1039	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1040	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1041		dn->dn_dirtyctx =
1042		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1043		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1044		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1045	}
1046	mutex_exit(&dn->dn_mtx);
1047
1048	if (db->db_blkid == DMU_SPILL_BLKID)
1049		dn->dn_have_spill = B_TRUE;
1050
1051	/*
1052	 * If this buffer is already dirty, we're done.
1053	 */
1054	drp = &db->db_last_dirty;
1055	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1056	    db->db.db_object == DMU_META_DNODE_OBJECT);
1057	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1058		drp = &dr->dr_next;
1059	if (dr && dr->dr_txg == tx->tx_txg) {
1060		DB_DNODE_EXIT(db);
1061
1062		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1063			/*
1064			 * If this buffer has already been written out,
1065			 * we now need to reset its state.
1066			 */
1067			dbuf_unoverride(dr);
1068			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1069			    db->db_state != DB_NOFILL)
1070				arc_buf_thaw(db->db_buf);
1071		}
1072		mutex_exit(&db->db_mtx);
1073		return (dr);
1074	}
1075
1076	/*
1077	 * Only valid if not already dirty.
1078	 */
1079	ASSERT(dn->dn_object == 0 ||
1080	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1081	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1082
1083	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1084	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1085	    dn->dn_phys->dn_nlevels > db->db_level ||
1086	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1087	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1088	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1089
1090	/*
1091	 * We should only be dirtying in syncing context if it's the
1092	 * mos or we're initializing the os or it's a special object.
1093	 * However, we are allowed to dirty in syncing context provided
1094	 * we already dirtied it in open context.  Hence we must make
1095	 * this assertion only if we're not already dirty.
1096	 */
1097	os = dn->dn_objset;
1098	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1099	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1100	ASSERT(db->db.db_size != 0);
1101
1102	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1103
1104	if (db->db_blkid != DMU_BONUS_BLKID) {
1105		/*
1106		 * Update the accounting.
1107		 * Note: we delay "free accounting" until after we drop
1108		 * the db_mtx.  This keeps us from grabbing other locks
1109		 * (and possibly deadlocking) in bp_get_dsize() while
1110		 * also holding the db_mtx.
1111		 */
1112		dnode_willuse_space(dn, db->db.db_size, tx);
1113		do_free_accounting = dbuf_block_freeable(db);
1114	}
1115
1116	/*
1117	 * If this buffer is dirty in an old transaction group we need
1118	 * to make a copy of it so that the changes we make in this
1119	 * transaction group won't leak out when we sync the older txg.
1120	 */
1121	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1122	if (db->db_level == 0) {
1123		void *data_old = db->db_buf;
1124
1125		if (db->db_state != DB_NOFILL) {
1126			if (db->db_blkid == DMU_BONUS_BLKID) {
1127				dbuf_fix_old_data(db, tx->tx_txg);
1128				data_old = db->db.db_data;
1129			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1130				/*
1131				 * Release the data buffer from the cache so
1132				 * that we can modify it without impacting
1133				 * possible other users of this cached data
1134				 * block.  Note that indirect blocks and
1135				 * private objects are not released until the
1136				 * syncing state (since they are only modified
1137				 * then).
1138				 */
1139				arc_release(db->db_buf, db);
1140				dbuf_fix_old_data(db, tx->tx_txg);
1141				data_old = db->db_buf;
1142			}
1143			ASSERT(data_old != NULL);
1144		}
1145		dr->dt.dl.dr_data = data_old;
1146	} else {
1147		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1148		list_create(&dr->dt.di.dr_children,
1149		    sizeof (dbuf_dirty_record_t),
1150		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1151	}
1152	dr->dr_dbuf = db;
1153	dr->dr_txg = tx->tx_txg;
1154	dr->dr_next = *drp;
1155	*drp = dr;
1156
1157	/*
1158	 * We could have been freed_in_flight between the dbuf_noread
1159	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1160	 * happened after the free.
1161	 */
1162	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1163	    db->db_blkid != DMU_SPILL_BLKID) {
1164		mutex_enter(&dn->dn_mtx);
1165		dnode_clear_range(dn, db->db_blkid, 1, tx);
1166		mutex_exit(&dn->dn_mtx);
1167		db->db_freed_in_flight = FALSE;
1168	}
1169
1170	/*
1171	 * This buffer is now part of this txg
1172	 */
1173	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1174	db->db_dirtycnt += 1;
1175	ASSERT3U(db->db_dirtycnt, <=, 3);
1176
1177	mutex_exit(&db->db_mtx);
1178
1179	if (db->db_blkid == DMU_BONUS_BLKID ||
1180	    db->db_blkid == DMU_SPILL_BLKID) {
1181		mutex_enter(&dn->dn_mtx);
1182		ASSERT(!list_link_active(&dr->dr_dirty_node));
1183		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1184		mutex_exit(&dn->dn_mtx);
1185		dnode_setdirty(dn, tx);
1186		DB_DNODE_EXIT(db);
1187		return (dr);
1188	} else if (do_free_accounting) {
1189		blkptr_t *bp = db->db_blkptr;
1190		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1191		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1192		/*
1193		 * This is only a guess -- if the dbuf is dirty
1194		 * in a previous txg, we don't know how much
1195		 * space it will use on disk yet.  We should
1196		 * really have the struct_rwlock to access
1197		 * db_blkptr, but since this is just a guess,
1198		 * it's OK if we get an odd answer.
1199		 */
1200		ddt_prefetch(os->os_spa, bp);
1201		dnode_willuse_space(dn, -willfree, tx);
1202	}
1203
1204	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1205		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1206		drop_struct_lock = TRUE;
1207	}
1208
1209	if (db->db_level == 0) {
1210		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1211		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1212	}
1213
1214	if (db->db_level+1 < dn->dn_nlevels) {
1215		dmu_buf_impl_t *parent = db->db_parent;
1216		dbuf_dirty_record_t *di;
1217		int parent_held = FALSE;
1218
1219		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1220			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1221
1222			parent = dbuf_hold_level(dn, db->db_level+1,
1223			    db->db_blkid >> epbs, FTAG);
1224			ASSERT(parent != NULL);
1225			parent_held = TRUE;
1226		}
1227		if (drop_struct_lock)
1228			rw_exit(&dn->dn_struct_rwlock);
1229		ASSERT3U(db->db_level+1, ==, parent->db_level);
1230		di = dbuf_dirty(parent, tx);
1231		if (parent_held)
1232			dbuf_rele(parent, FTAG);
1233
1234		mutex_enter(&db->db_mtx);
1235		/*  possible race with dbuf_undirty() */
1236		if (db->db_last_dirty == dr ||
1237		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1238			mutex_enter(&di->dt.di.dr_mtx);
1239			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1240			ASSERT(!list_link_active(&dr->dr_dirty_node));
1241			list_insert_tail(&di->dt.di.dr_children, dr);
1242			mutex_exit(&di->dt.di.dr_mtx);
1243			dr->dr_parent = di;
1244		}
1245		mutex_exit(&db->db_mtx);
1246	} else {
1247		ASSERT(db->db_level+1 == dn->dn_nlevels);
1248		ASSERT(db->db_blkid < dn->dn_nblkptr);
1249		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1250		mutex_enter(&dn->dn_mtx);
1251		ASSERT(!list_link_active(&dr->dr_dirty_node));
1252		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1253		mutex_exit(&dn->dn_mtx);
1254		if (drop_struct_lock)
1255			rw_exit(&dn->dn_struct_rwlock);
1256	}
1257
1258	dnode_setdirty(dn, tx);
1259	DB_DNODE_EXIT(db);
1260	return (dr);
1261}
1262
1263/*
1264 * Return TRUE if this evicted the dbuf.
1265 */
1266static boolean_t
1267dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1268{
1269	dnode_t *dn;
1270	uint64_t txg = tx->tx_txg;
1271	dbuf_dirty_record_t *dr, **drp;
1272
1273	ASSERT(txg != 0);
1274	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1275	ASSERT0(db->db_level);
1276	ASSERT(MUTEX_HELD(&db->db_mtx));
1277
1278	/*
1279	 * If this buffer is not dirty, we're done.
1280	 */
1281	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1282		if (dr->dr_txg <= txg)
1283			break;
1284	if (dr == NULL || dr->dr_txg < txg)
1285		return (B_FALSE);
1286	ASSERT(dr->dr_txg == txg);
1287	ASSERT(dr->dr_dbuf == db);
1288
1289	DB_DNODE_ENTER(db);
1290	dn = DB_DNODE(db);
1291
1292	/*
1293	 * Note:  This code will probably work even if there are concurrent
1294	 * holders, but it is untested in that scenario, as the ZPL and
1295	 * ztest have additional locking (the range locks) that prevents
1296	 * that type of concurrent access.
1297	 */
1298	ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1299
1300	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1301
1302	ASSERT(db->db.db_size != 0);
1303
1304	/* XXX would be nice to fix up dn_towrite_space[] */
1305
1306	*drp = dr->dr_next;
1307
1308	/*
1309	 * Note that there are three places in dbuf_dirty()
1310	 * where this dirty record may be put on a list.
1311	 * Make sure to do a list_remove corresponding to
1312	 * every one of those list_insert calls.
1313	 */
1314	if (dr->dr_parent) {
1315		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1316		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1317		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1318	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1319	    db->db_level+1 == dn->dn_nlevels) {
1320		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1321		mutex_enter(&dn->dn_mtx);
1322		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1323		mutex_exit(&dn->dn_mtx);
1324	}
1325	DB_DNODE_EXIT(db);
1326
1327	if (db->db_state != DB_NOFILL) {
1328		dbuf_unoverride(dr);
1329
1330		ASSERT(db->db_buf != NULL);
1331		ASSERT(dr->dt.dl.dr_data != NULL);
1332		if (dr->dt.dl.dr_data != db->db_buf)
1333			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1334	}
1335	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1336
1337	ASSERT(db->db_dirtycnt > 0);
1338	db->db_dirtycnt -= 1;
1339
1340	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1341		arc_buf_t *buf = db->db_buf;
1342
1343		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1344		dbuf_set_data(db, NULL);
1345		VERIFY(arc_buf_remove_ref(buf, db));
1346		dbuf_evict(db);
1347		return (B_TRUE);
1348	}
1349
1350	return (B_FALSE);
1351}
1352
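/*
 * Read in the dbuf's current contents (if necessary) and mark it dirty
 * in the given transaction.
 */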
1353#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1354void
1355dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1356{
1357	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1358
1359	ASSERT(tx->tx_txg != 0);
1360	ASSERT(!refcount_is_zero(&db->db_holds));
1361
1362	DB_DNODE_ENTER(db);
1363	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1364		rf |= DB_RF_HAVESTRUCT;
1365	DB_DNODE_EXIT(db);
1366	(void) dbuf_read(db, NULL, rf);
1367	(void) dbuf_dirty(db, tx);
1368}
1369
1370void
1371dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1372{
1373	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1374
1375	db->db_state = DB_NOFILL;
1376
1377	dmu_buf_will_fill(db_fake, tx);
1378}
1379
1380void
1381dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1382{
1383	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1384
1385	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1386	ASSERT(tx->tx_txg != 0);
1387	ASSERT(db->db_level == 0);
1388	ASSERT(!refcount_is_zero(&db->db_holds));
1389
1390	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1391	    dmu_tx_private_ok(tx));
1392
1393	dbuf_noread(db);
1394	(void) dbuf_dirty(db, tx);
1395}
1396
1397#pragma weak dmu_buf_fill_done = dbuf_fill_done
1398/* ARGSUSED */
1399void
1400dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1401{
1402	mutex_enter(&db->db_mtx);
1403	DBUF_VERIFY(db);
1404
1405	if (db->db_state == DB_FILL) {
1406		if (db->db_level == 0 && db->db_freed_in_flight) {
1407			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1408			/* we were freed while filling */
1409			/* XXX dbuf_undirty? */
1410			bzero(db->db.db_data, db->db.db_size);
1411			db->db_freed_in_flight = FALSE;
1412		}
1413		db->db_state = DB_CACHED;
1414		cv_broadcast(&db->db_changed);
1415	}
1416	mutex_exit(&db->db_mtx);
1417}
1418
1419/*
1420 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1421 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1422 */
1423void
1424dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1425{
1426	ASSERT(!refcount_is_zero(&db->db_holds));
1427	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1428	ASSERT(db->db_level == 0);
1429	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1430	ASSERT(buf != NULL);
1431	ASSERT(arc_buf_size(buf) == db->db.db_size);
1432	ASSERT(tx->tx_txg != 0);
1433
1434	arc_return_buf(buf, db);
1435	ASSERT(arc_released(buf));
1436
1437	mutex_enter(&db->db_mtx);
1438
1439	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1440		cv_wait(&db->db_changed, &db->db_mtx);
1441
1442	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1443
1444	if (db->db_state == DB_CACHED &&
1445	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1446		mutex_exit(&db->db_mtx);
1447		(void) dbuf_dirty(db, tx);
1448		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1449		VERIFY(arc_buf_remove_ref(buf, db));
1450		xuio_stat_wbuf_copied();
1451		return;
1452	}
1453
1454	xuio_stat_wbuf_nocopy();
1455	if (db->db_state == DB_CACHED) {
1456		dbuf_dirty_record_t *dr = db->db_last_dirty;
1457
1458		ASSERT(db->db_buf != NULL);
1459		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1460			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1461			if (!arc_released(db->db_buf)) {
1462				ASSERT(dr->dt.dl.dr_override_state ==
1463				    DR_OVERRIDDEN);
1464				arc_release(db->db_buf, db);
1465			}
1466			dr->dt.dl.dr_data = buf;
1467			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1468		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1469			arc_release(db->db_buf, db);
1470			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1471		}
1472		db->db_buf = NULL;
1473	}
1474	ASSERT(db->db_buf == NULL);
1475	dbuf_set_data(db, buf);
1476	db->db_state = DB_FILL;
1477	mutex_exit(&db->db_mtx);
1478	(void) dbuf_dirty(db, tx);
1479	dbuf_fill_done(db, tx);
1480}
1481
1482/*
1483 * "Clear" the contents of this dbuf.  This will mark the dbuf
1484 * EVICTING and clear *most* of its references.  Unfortunately,
1485 * when we are not holding the dn_dbufs_mtx, we can't clear the
1486 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1487 * in this case.  For callers from the DMU we will usually see:
1488 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1489 * For the arc callback, we will usually see:
1490 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1491 * Sometimes, though, we will get a mix of these two:
1492 *	DMU: dbuf_clear()->arc_buf_evict()
1493 *	ARC: dbuf_do_evict()->dbuf_destroy()
1494 */
1495void
1496dbuf_clear(dmu_buf_impl_t *db)
1497{
1498	dnode_t *dn;
1499	dmu_buf_impl_t *parent = db->db_parent;
1500	dmu_buf_impl_t *dndb;
1501	int dbuf_gone = FALSE;
1502
1503	ASSERT(MUTEX_HELD(&db->db_mtx));
1504	ASSERT(refcount_is_zero(&db->db_holds));
1505
1506	dbuf_evict_user(db);
1507
1508	if (db->db_state == DB_CACHED) {
1509		ASSERT(db->db.db_data != NULL);
1510		if (db->db_blkid == DMU_BONUS_BLKID) {
1511			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1512			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1513		}
1514		db->db.db_data = NULL;
1515		db->db_state = DB_UNCACHED;
1516	}
1517
1518	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1519	ASSERT(db->db_data_pending == NULL);
1520
1521	db->db_state = DB_EVICTING;
1522	db->db_blkptr = NULL;
1523
1524	DB_DNODE_ENTER(db);
1525	dn = DB_DNODE(db);
1526	dndb = dn->dn_dbuf;
1527	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1528		list_remove(&dn->dn_dbufs, db);
1529		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1530		membar_producer();
1531		DB_DNODE_EXIT(db);
1532		/*
1533		 * Decrementing the dbuf count means that the hold corresponding
1534		 * to the removed dbuf is no longer discounted in dnode_move(),
1535		 * so the dnode cannot be moved until after we release the hold.
1536		 * The membar_producer() ensures visibility of the decremented
1537		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1538		 * release any lock.
1539		 */
1540		dnode_rele(dn, db);
1541		db->db_dnode_handle = NULL;
1542	} else {
1543		DB_DNODE_EXIT(db);
1544	}
1545
1546	if (db->db_buf)
1547		dbuf_gone = arc_buf_evict(db->db_buf);
1548
1549	if (!dbuf_gone)
1550		mutex_exit(&db->db_mtx);
1551
1552	/*
1553	 * If this dbuf is referenced from an indirect dbuf,
1554	 * decrement the ref count on the indirect dbuf.
1555	 */
1556	if (parent && parent != dndb)
1557		dbuf_rele(parent, db);
1558}
1559
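/*
 * Find the parent dbuf and block pointer that reference the given block.
 * On success *parentp is held (when a parent exists) and *bpp points to
 * the block pointer; returns ENOENT if the block has no parent yet.
 */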
1560static int
1561dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1562    dmu_buf_impl_t **parentp, blkptr_t **bpp)
1563{
1564	int nlevels, epbs;
1565
1566	*parentp = NULL;
1567	*bpp = NULL;
1568
1569	ASSERT(blkid != DMU_BONUS_BLKID);
1570
1571	if (blkid == DMU_SPILL_BLKID) {
1572		mutex_enter(&dn->dn_mtx);
1573		if (dn->dn_have_spill &&
1574		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1575			*bpp = &dn->dn_phys->dn_spill;
1576		else
1577			*bpp = NULL;
1578		dbuf_add_ref(dn->dn_dbuf, NULL);
1579		*parentp = dn->dn_dbuf;
1580		mutex_exit(&dn->dn_mtx);
1581		return (0);
1582	}
1583
1584	if (dn->dn_phys->dn_nlevels == 0)
1585		nlevels = 1;
1586	else
1587		nlevels = dn->dn_phys->dn_nlevels;
1588
1589	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1590
1591	ASSERT3U(level * epbs, <, 64);
1592	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1593	if (level >= nlevels ||
1594	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1595		/* the buffer has no parent yet */
1596		return (SET_ERROR(ENOENT));
1597	} else if (level < nlevels-1) {
1598		/* this block is referenced from an indirect block */
1599		int err = dbuf_hold_impl(dn, level+1,
1600		    blkid >> epbs, fail_sparse, NULL, parentp);
1601		if (err)
1602			return (err);
1603		err = dbuf_read(*parentp, NULL,
1604		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1605		if (err) {
1606			dbuf_rele(*parentp, NULL);
1607			*parentp = NULL;
1608			return (err);
1609		}
1610		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1611		    (blkid & ((1ULL << epbs) - 1));
1612		return (0);
1613	} else {
1614		/* the block is referenced from the dnode */
1615		ASSERT3U(level, ==, nlevels-1);
1616		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1617		    blkid < dn->dn_phys->dn_nblkptr);
1618		if (dn->dn_dbuf) {
1619			dbuf_add_ref(dn->dn_dbuf, NULL);
1620			*parentp = dn->dn_dbuf;
1621		}
1622		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1623		return (0);
1624	}
1625}
1626
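/*
 * Allocate and initialize a new dbuf for the given block and, except for
 * the bonus dbuf, insert it into the hash table and the dnode's dbuf
 * list.  If another thread inserted an equal dbuf first, free the new
 * one and return the existing dbuf instead.
 */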
1627static dmu_buf_impl_t *
1628dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1629    dmu_buf_impl_t *parent, blkptr_t *blkptr)
1630{
1631	objset_t *os = dn->dn_objset;
1632	dmu_buf_impl_t *db, *odb;
1633
1634	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1635	ASSERT(dn->dn_type != DMU_OT_NONE);
1636
1637	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1638
1639	db->db_objset = os;
1640	db->db.db_object = dn->dn_object;
1641	db->db_level = level;
1642	db->db_blkid = blkid;
1643	db->db_last_dirty = NULL;
1644	db->db_dirtycnt = 0;
1645	db->db_dnode_handle = dn->dn_handle;
1646	db->db_parent = parent;
1647	db->db_blkptr = blkptr;
1648
1649	db->db_user_ptr = NULL;
1650	db->db_user_data_ptr_ptr = NULL;
1651	db->db_evict_func = NULL;
1652	db->db_immediate_evict = 0;
1653	db->db_freed_in_flight = 0;
1654
1655	if (blkid == DMU_BONUS_BLKID) {
1656		ASSERT3P(parent, ==, dn->dn_dbuf);
1657		db->db.db_size = DN_MAX_BONUSLEN -
1658		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1659		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1660		db->db.db_offset = DMU_BONUS_BLKID;
1661		db->db_state = DB_UNCACHED;
1662		/* the bonus dbuf is not placed in the hash table */
1663		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1664		return (db);
1665	} else if (blkid == DMU_SPILL_BLKID) {
1666		db->db.db_size = (blkptr != NULL) ?
1667		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1668		db->db.db_offset = 0;
1669	} else {
1670		int blocksize =
1671		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1672		db->db.db_size = blocksize;
1673		db->db.db_offset = db->db_blkid * blocksize;
1674	}
1675
1676	/*
1677	 * Hold the dn_dbufs_mtx while we get the new dbuf
1678	 * in the hash table *and* added to the dbufs list.
1679	 * This prevents a possible deadlock with someone
1680	 * trying to look up this dbuf before it's added to the
1681	 * dn_dbufs list.
1682	 */
1683	mutex_enter(&dn->dn_dbufs_mtx);
1684	db->db_state = DB_EVICTING;
1685	if ((odb = dbuf_hash_insert(db)) != NULL) {
1686		/* someone else inserted it first */
1687		kmem_cache_free(dbuf_cache, db);
1688		mutex_exit(&dn->dn_dbufs_mtx);
1689		return (odb);
1690	}
1691	list_insert_head(&dn->dn_dbufs, db);
1692	db->db_state = DB_UNCACHED;
1693	mutex_exit(&dn->dn_dbufs_mtx);
1694	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1695
1696	if (parent && parent != dn->dn_dbuf)
1697		dbuf_add_ref(parent, db);
1698
1699	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1700	    refcount_count(&dn->dn_holds) > 0);
1701	(void) refcount_add(&dn->dn_holds, db);
1702	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1703
1704	dprintf_dbuf(db, "db=%p\n", db);
1705
1706	return (db);
1707}
1708
1709static int
1710dbuf_do_evict(void *private)
1711{
1712	arc_buf_t *buf = private;
1713	dmu_buf_impl_t *db = buf->b_private;
1714
1715	if (!MUTEX_HELD(&db->db_mtx))
1716		mutex_enter(&db->db_mtx);
1717
1718	ASSERT(refcount_is_zero(&db->db_holds));
1719
1720	if (db->db_state != DB_EVICTING) {
1721		ASSERT(db->db_state == DB_CACHED);
1722		DBUF_VERIFY(db);
1723		db->db_buf = NULL;
1724		dbuf_evict(db);
1725	} else {
1726		mutex_exit(&db->db_mtx);
1727		dbuf_destroy(db);
1728	}
1729	return (0);
1730}
1731
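/*
 * Free a dbuf that has no remaining holds: remove it from the dnode's
 * dbuf list and the hash table, drop the dnode hold it represented, and
 * return it to the dbuf kmem cache.
 */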
1732static void
1733dbuf_destroy(dmu_buf_impl_t *db)
1734{
1735	ASSERT(refcount_is_zero(&db->db_holds));
1736
1737	if (db->db_blkid != DMU_BONUS_BLKID) {
1738		/*
1739		 * If this dbuf is still on the dn_dbufs list,
1740		 * remove it from that list.
1741		 */
1742		if (db->db_dnode_handle != NULL) {
1743			dnode_t *dn;
1744
1745			DB_DNODE_ENTER(db);
1746			dn = DB_DNODE(db);
1747			mutex_enter(&dn->dn_dbufs_mtx);
1748			list_remove(&dn->dn_dbufs, db);
1749			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1750			mutex_exit(&dn->dn_dbufs_mtx);
1751			DB_DNODE_EXIT(db);
1752			/*
1753			 * Decrementing the dbuf count means that the hold
1754			 * corresponding to the removed dbuf is no longer
1755			 * discounted in dnode_move(), so the dnode cannot be
1756			 * moved until after we release the hold.
1757			 */
1758			dnode_rele(dn, db);
1759			db->db_dnode_handle = NULL;
1760		}
1761		dbuf_hash_remove(db);
1762	}
1763	db->db_parent = NULL;
1764	db->db_buf = NULL;
1765
1766	ASSERT(!list_link_active(&db->db_link));
1767	ASSERT(db->db.db_data == NULL);
1768	ASSERT(db->db_hash_next == NULL);
1769	ASSERT(db->db_blkptr == NULL);
1770	ASSERT(db->db_data_pending == NULL);
1771
1772	kmem_cache_free(dbuf_cache, db);
1773	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1774}
1775
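/*
 * Issue a speculative (prefetch) read of the given level-0 block if it
 * is not already cached and has not been freed.
 */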
1776void
1777dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1778{
1779	dmu_buf_impl_t *db = NULL;
1780	blkptr_t *bp = NULL;
1781
1782	ASSERT(blkid != DMU_BONUS_BLKID);
1783	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1784
1785	if (dnode_block_freed(dn, blkid))
1786		return;
1787
1788	/* dbuf_find() returns with db_mtx held */
1789	if (db = dbuf_find(dn, 0, blkid)) {
1790		/*
1791		 * This dbuf is already in the cache.  We assume that
1792		 * it is already CACHED, or else about to be either
1793		 * read or filled.
1794		 */
1795		mutex_exit(&db->db_mtx);
1796		return;
1797	}
1798
1799	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1800		if (bp && !BP_IS_HOLE(bp)) {
1801			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1802			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1803			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1804			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1805			zbookmark_t zb;
1806
1807			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1808			    dn->dn_object, 0, blkid);
1809
1810			(void) arc_read(NULL, dn->dn_objset->os_spa,
1811			    bp, NULL, NULL, priority,
1812			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1813			    &aflags, &zb);
1814		}
1815		if (db)
1816			dbuf_rele(db, NULL);
1817	}
1818}
1819
1820/*
1821 * Returns with db_holds incremented, and db_mtx not held.
1822 * Note: dn_struct_rwlock must be held.
1823 */
1824int
1825dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1826    void *tag, dmu_buf_impl_t **dbp)
1827{
1828	dmu_buf_impl_t *db, *parent = NULL;
1829
1830	ASSERT(blkid != DMU_BONUS_BLKID);
1831	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1832	ASSERT3U(dn->dn_nlevels, >, level);
1833
1834	*dbp = NULL;
1835top:
1836	/* dbuf_find() returns with db_mtx held */
1837	db = dbuf_find(dn, level, blkid);
1838
1839	if (db == NULL) {
1840		blkptr_t *bp = NULL;
1841		int err;
1842
1843		ASSERT3P(parent, ==, NULL);
1844		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1845		if (fail_sparse) {
1846			if (err == 0 && bp && BP_IS_HOLE(bp))
1847				err = SET_ERROR(ENOENT);
1848			if (err) {
1849				if (parent)
1850					dbuf_rele(parent, NULL);
1851				return (err);
1852			}
1853		}
1854		if (err && err != ENOENT)
1855			return (err);
1856		db = dbuf_create(dn, level, blkid, parent, bp);
1857	}
1858
1859	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1860		arc_buf_add_ref(db->db_buf, db);
1861		if (db->db_buf->b_data == NULL) {
1862			dbuf_clear(db);
1863			if (parent) {
1864				dbuf_rele(parent, NULL);
1865				parent = NULL;
1866			}
1867			goto top;
1868		}
1869		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1870	}
1871
1872	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1873
1874	/*
1875	 * If this buffer is currently syncing out, and we are
1876	 * still referencing it from db_data, we need to make a copy
1877	 * of it in case we decide we want to dirty it again in this txg.
1878	 */
1879	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1880	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1881	    db->db_state == DB_CACHED && db->db_data_pending) {
1882		dbuf_dirty_record_t *dr = db->db_data_pending;
1883
1884		if (dr->dt.dl.dr_data == db->db_buf) {
1885			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1886
1887			dbuf_set_data(db,
1888			    arc_buf_alloc(dn->dn_objset->os_spa,
1889			    db->db.db_size, db, type));
1890			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1891			    db->db.db_size);
1892		}
1893	}
1894
1895	(void) refcount_add(&db->db_holds, tag);
1896	dbuf_update_data(db);
1897	DBUF_VERIFY(db);
1898	mutex_exit(&db->db_mtx);
1899
1900	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1901	if (parent)
1902		dbuf_rele(parent, NULL);
1903
1904	ASSERT3P(DB_DNODE(db), ==, dn);
1905	ASSERT3U(db->db_blkid, ==, blkid);
1906	ASSERT3U(db->db_level, ==, level);
1907	*dbp = db;
1908
1909	return (0);
1910}
1911
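/*
 * Convenience wrappers around dbuf_hold_impl() that never fail on sparse
 * (hole) blocks and return NULL on error.  As with dbuf_hold_impl(), the
 * caller must hold dn_struct_rwlock.  A minimal, illustrative caller-side
 * sketch (not taken from this file; real callers choose read flags that
 * suit their context):
 *
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	if (db != NULL) {
 *		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 *		... examine db->db.db_data ...
 *		dbuf_rele(db, FTAG);
 *	}
 */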
1912dmu_buf_impl_t *
1913dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1914{
1915	dmu_buf_impl_t *db;
1916	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1917	return (err ? NULL : db);
1918}
1919
1920dmu_buf_impl_t *
1921dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1922{
1923	dmu_buf_impl_t *db;
1924	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1925	return (err ? NULL : db);
1926}
1927
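/*
 * Create the bonus dbuf for a dnode.  The caller must hold
 * dn_struct_rwlock as writer, and the dnode must not already have a
 * bonus dbuf.
 */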
1928void
1929dbuf_create_bonus(dnode_t *dn)
1930{
1931	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1932
1933	ASSERT(dn->dn_bonus == NULL);
1934	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1935}
1936
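/*
 * Set the block size of a spill block.  A size of zero is treated as
 * SPA_MINBLOCKSIZE; otherwise the requested size is rounded up to a
 * multiple of SPA_MINBLOCKSIZE and clamped to SPA_MAXBLOCKSIZE.
 * ENOTSUP is returned if the dbuf is not a spill block.
 */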
1937int
1938dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1939{
1940	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1941	dnode_t *dn;
1942
1943	if (db->db_blkid != DMU_SPILL_BLKID)
1944		return (SET_ERROR(ENOTSUP));
1945	if (blksz == 0)
1946		blksz = SPA_MINBLOCKSIZE;
1947	if (blksz > SPA_MAXBLOCKSIZE)
1948		blksz = SPA_MAXBLOCKSIZE;
1949	else
1950		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1951
1952	DB_DNODE_ENTER(db);
1953	dn = DB_DNODE(db);
1954	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1955	dbuf_new_size(db, blksz, tx);
1956	rw_exit(&dn->dn_struct_rwlock);
1957	DB_DNODE_EXIT(db);
1958
1959	return (0);
1960}
1961
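/*
 * Free the spill block of a dnode, if any, by freeing just the
 * DMU_SPILL_BLKID range.
 */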
1962void
1963dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
1964{
1965	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
1966}
1967
1968#pragma weak dmu_buf_add_ref = dbuf_add_ref
1969void
1970dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1971{
1972	int64_t holds = refcount_add(&db->db_holds, tag);
1973	ASSERT(holds > 1);
1974}
1975
1976/*
1977 * If you call dbuf_rele() you had better not be referencing the dnode handle
1978 * unless you have some other direct or indirect hold on the dnode. (An indirect
1979 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
1980 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
1981 * dnode's parent dbuf evicting its dnode handles.
1982 */
1983#pragma weak dmu_buf_rele = dbuf_rele
1984void
1985dbuf_rele(dmu_buf_impl_t *db, void *tag)
1986{
1987	mutex_enter(&db->db_mtx);
1988	dbuf_rele_and_unlock(db, tag);
1989}
1990
1991/*
1992 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
1993 * db_dirtycnt and db_holds to be updated atomically.
1994 */
1995void
1996dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
1997{
1998	int64_t holds;
1999
2000	ASSERT(MUTEX_HELD(&db->db_mtx));
2001	DBUF_VERIFY(db);
2002
2003	/*
2004	 * Remove the reference to the dbuf before removing its hold on the
2005	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2006	 * buffer has a corresponding dnode hold.
2007	 */
2008	holds = refcount_remove(&db->db_holds, tag);
2009	ASSERT(holds >= 0);
2010
2011	/*
2012	 * We can't freeze indirects if there is a possibility that they
2013	 * may be modified in the current syncing context.
2014	 */
2015	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2016		arc_buf_freeze(db->db_buf);
2017
2018	if (holds == db->db_dirtycnt &&
2019	    db->db_level == 0 && db->db_immediate_evict)
2020		dbuf_evict_user(db);
2021
2022	if (holds == 0) {
2023		if (db->db_blkid == DMU_BONUS_BLKID) {
2024			mutex_exit(&db->db_mtx);
2025
2026			/*
2027			 * If the dnode moves here, we cannot cross this barrier
2028			 * until the move completes.
2029			 */
2030			DB_DNODE_ENTER(db);
2031			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2032			DB_DNODE_EXIT(db);
2033			/*
2034			 * The bonus buffer's dnode hold is no longer discounted
2035			 * in dnode_move(). The dnode cannot move until after
2036			 * the dnode_rele().
2037			 */
2038			dnode_rele(DB_DNODE(db), db);
2039		} else if (db->db_buf == NULL) {
2040			/*
2041			 * This is a special case: we never associated this
2042			 * dbuf with any data allocated from the ARC.
2043			 */
2044			ASSERT(db->db_state == DB_UNCACHED ||
2045			    db->db_state == DB_NOFILL);
2046			dbuf_evict(db);
2047		} else if (arc_released(db->db_buf)) {
2048			arc_buf_t *buf = db->db_buf;
2049			/*
2050			 * This dbuf has anonymous data associated with it.
2051			 */
2052			dbuf_set_data(db, NULL);
2053			VERIFY(arc_buf_remove_ref(buf, db));
2054			dbuf_evict(db);
2055		} else {
2056			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2057
2058			/*
2059			 * A dbuf will be eligible for eviction if either the
2060			 * 'primarycache' property is set or a duplicate
2061			 * copy of this buffer is already cached in the arc.
2062			 *
2063			 * In the case of the 'primarycache' property, a buffer
2064			 * is considered for eviction if it matches the
2065			 * criteria set by the property.
2066			 *
2067			 * To decide if our buffer is considered a
2068			 * duplicate, we must call into the arc to determine
2069			 * if multiple buffers are referencing the same
2070			 * block on-disk. If so, then we simply evict
2071			 * ourselves.
2072			 */
2073			if (!DBUF_IS_CACHEABLE(db) ||
2074			    arc_buf_eviction_needed(db->db_buf))
2075				dbuf_clear(db);
2076			else
2077				mutex_exit(&db->db_mtx);
2078		}
2079	} else {
2080		mutex_exit(&db->db_mtx);
2081	}
2082}
2083
2084#pragma weak dmu_buf_refcount = dbuf_refcount
2085uint64_t
2086dbuf_refcount(dmu_buf_impl_t *db)
2087{
2088	return (refcount_count(&db->db_holds));
2089}
2090
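/*
 * Attach opaque user data and an eviction callback to a dbuf.
 * dmu_buf_set_user() installs the new state only if no user pointer is
 * currently set; dmu_buf_set_user_ie() additionally sets
 * db_immediate_evict on the dbuf; dmu_buf_update_user() replaces the
 * user pointer only if the current value matches old_user_ptr.  In all
 * cases the pointer that was in place at the time of the call is
 * returned, so a caller that loses the race can detect it.  A minimal,
 * illustrative sketch (my_state_t and my_state_evict() are hypothetical):
 *
 *	my_state_t *ms = kmem_zalloc(sizeof (*ms), KM_SLEEP);
 *	my_state_t *winner = dmu_buf_set_user(db, ms, NULL, my_state_evict);
 *	if (winner != NULL) {
 *		kmem_free(ms, sizeof (*ms));	- lost the race, use winner
 *		ms = winner;
 *	}
 */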
2091void *
2092dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2093    dmu_buf_evict_func_t *evict_func)
2094{
2095	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2096	    user_data_ptr_ptr, evict_func));
2097}
2098
2099void *
2100dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2101    dmu_buf_evict_func_t *evict_func)
2102{
2103	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2104
2105	db->db_immediate_evict = TRUE;
2106	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2107	    user_data_ptr_ptr, evict_func));
2108}
2109
2110void *
2111dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2112    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2113{
2114	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2115	ASSERT(db->db_level == 0);
2116
2117	ASSERT((user_ptr == NULL) == (evict_func == NULL));
2118
2119	mutex_enter(&db->db_mtx);
2120
2121	if (db->db_user_ptr == old_user_ptr) {
2122		db->db_user_ptr = user_ptr;
2123		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2124		db->db_evict_func = evict_func;
2125
2126		dbuf_update_data(db);
2127	} else {
2128		old_user_ptr = db->db_user_ptr;
2129	}
2130
2131	mutex_exit(&db->db_mtx);
2132	return (old_user_ptr);
2133}
2134
2135void *
2136dmu_buf_get_user(dmu_buf_t *db_fake)
2137{
2138	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2139	ASSERT(!refcount_is_zero(&db->db_holds));
2140
2141	return (db->db_user_ptr);
2142}
2143
2144boolean_t
2145dmu_buf_freeable(dmu_buf_t *dbuf)
2146{
2147	boolean_t res = B_FALSE;
2148	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2149
2150	if (db->db_blkptr)
2151		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2152		    db->db_blkptr, db->db_blkptr->blk_birth);
2153
2154	return (res);
2155}
2156
2157blkptr_t *
2158dmu_buf_get_blkptr(dmu_buf_t *db)
2159{
2160	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2161	return (dbi->db_blkptr);
2162}
2163
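/*
 * Ensure that db_blkptr points at the correct location for this dbuf:
 * the dnode's spill pointer, a slot in the dnode's blkptr array, or a
 * slot in the parent indirect block, acquiring a hold on the parent if
 * one is not already established.  Called from syncing context with
 * db_mtx held.
 */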
2164static void
2165dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2166{
2167	/* ASSERT(dmu_tx_is_syncing(tx)) */
2168	ASSERT(MUTEX_HELD(&db->db_mtx));
2169
2170	if (db->db_blkptr != NULL)
2171		return;
2172
2173	if (db->db_blkid == DMU_SPILL_BLKID) {
2174		db->db_blkptr = &dn->dn_phys->dn_spill;
2175		BP_ZERO(db->db_blkptr);
2176		return;
2177	}
2178	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2179		/*
2180		 * This buffer was allocated at a time when there were
2181		 * no blkptrs available from the dnode, or it was
2182		 * inappropriate to hook it in (i.e., nlevels mismatch).
2183		 */
2184		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2185		ASSERT(db->db_parent == NULL);
2186		db->db_parent = dn->dn_dbuf;
2187		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2188		DBUF_VERIFY(db);
2189	} else {
2190		dmu_buf_impl_t *parent = db->db_parent;
2191		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2192
2193		ASSERT(dn->dn_phys->dn_nlevels > 1);
2194		if (parent == NULL) {
2195			mutex_exit(&db->db_mtx);
2196			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2197			(void) dbuf_hold_impl(dn, db->db_level+1,
2198			    db->db_blkid >> epbs, FALSE, db, &parent);
2199			rw_exit(&dn->dn_struct_rwlock);
2200			mutex_enter(&db->db_mtx);
2201			db->db_parent = parent;
2202		}
2203		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2204		    (db->db_blkid & ((1ULL << epbs) - 1));
2205		DBUF_VERIFY(db);
2206	}
2207}
2208
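/*
 * Sync a dirty indirect block: make sure its data is cached and its
 * block pointer is hooked up, start its write with dbuf_write(), sync
 * all of its dirty children, and then issue the parent zio.
 */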
2209static void
2210dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2211{
2212	dmu_buf_impl_t *db = dr->dr_dbuf;
2213	dnode_t *dn;
2214	zio_t *zio;
2215
2216	ASSERT(dmu_tx_is_syncing(tx));
2217
2218	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2219
2220	mutex_enter(&db->db_mtx);
2221
2222	ASSERT(db->db_level > 0);
2223	DBUF_VERIFY(db);
2224
2225	if (db->db_buf == NULL) {
2226		mutex_exit(&db->db_mtx);
2227		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2228		mutex_enter(&db->db_mtx);
2229	}
2230	ASSERT3U(db->db_state, ==, DB_CACHED);
2231	ASSERT(db->db_buf != NULL);
2232
2233	DB_DNODE_ENTER(db);
2234	dn = DB_DNODE(db);
2235	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2236	dbuf_check_blkptr(dn, db);
2237	DB_DNODE_EXIT(db);
2238
2239	db->db_data_pending = dr;
2240
2241	mutex_exit(&db->db_mtx);
2242	dbuf_write(dr, db->db_buf, tx);
2243
2244	zio = dr->dr_zio;
2245	mutex_enter(&dr->dt.di.dr_mtx);
2246	dbuf_sync_list(&dr->dt.di.dr_children, tx);
2247	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2248	mutex_exit(&dr->dt.di.dr_mtx);
2249	zio_nowait(zio);
2250}
2251
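/*
 * Sync a dirty level-0 block.  Bonus buffers are simply copied into the
 * dnode phys; other buffers may first be copied if they are still in use
 * by the open txg, and are then written out via dbuf_write().  Writes
 * for the meta-dnode's dbufs are queued on the dnode's dirty record list
 * so they can be issued after all child IOs; everything else is issued
 * here.
 */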
2252static void
2253dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2254{
2255	arc_buf_t **datap = &dr->dt.dl.dr_data;
2256	dmu_buf_impl_t *db = dr->dr_dbuf;
2257	dnode_t *dn;
2258	objset_t *os;
2259	uint64_t txg = tx->tx_txg;
2260
2261	ASSERT(dmu_tx_is_syncing(tx));
2262
2263	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2264
2265	mutex_enter(&db->db_mtx);
2266	/*
2267	 * To be synced, we must be dirtied.  But we
2268	 * might have been freed after being dirtied.
2269	 */
2270	if (db->db_state == DB_UNCACHED) {
2271		/* This buffer has been freed since it was dirtied */
2272		ASSERT(db->db.db_data == NULL);
2273	} else if (db->db_state == DB_FILL) {
2274		/* This buffer was freed and is now being re-filled */
2275		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2276	} else {
2277		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2278	}
2279	DBUF_VERIFY(db);
2280
2281	DB_DNODE_ENTER(db);
2282	dn = DB_DNODE(db);
2283
2284	if (db->db_blkid == DMU_SPILL_BLKID) {
2285		mutex_enter(&dn->dn_mtx);
2286		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2287		mutex_exit(&dn->dn_mtx);
2288	}
2289
2290	/*
2291	 * If this is a bonus buffer, simply copy the bonus data into the
2292	 * dnode.  It will be written out when the dnode is synced (and it
2293	 * will be synced, since it must have been dirty for dbuf_sync to
2294	 * be called).
2295	 */
2296	if (db->db_blkid == DMU_BONUS_BLKID) {
2297		dbuf_dirty_record_t **drp;
2298
2299		ASSERT(*datap != NULL);
2300		ASSERT0(db->db_level);
2301		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2302		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2303		DB_DNODE_EXIT(db);
2304
2305		if (*datap != db->db.db_data) {
2306			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2307			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2308		}
2309		db->db_data_pending = NULL;
2310		drp = &db->db_last_dirty;
2311		while (*drp != dr)
2312			drp = &(*drp)->dr_next;
2313		ASSERT(dr->dr_next == NULL);
2314		ASSERT(dr->dr_dbuf == db);
2315		*drp = dr->dr_next;
2316		if (dr->dr_dbuf->db_level != 0) {
2317			list_destroy(&dr->dt.di.dr_children);
2318			mutex_destroy(&dr->dt.di.dr_mtx);
2319		}
2320		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2321		ASSERT(db->db_dirtycnt > 0);
2322		db->db_dirtycnt -= 1;
2323		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2324		return;
2325	}
2326
2327	os = dn->dn_objset;
2328
2329	/*
2330	 * dbuf_check_blkptr() may drop and reacquire the db_mtx lock,
2331	 * allowing a dmu_sync operation to sneak in.  As a result, we
2332	 * must not check dr_override_state until dbuf_check_blkptr()
2333	 * has returned.
2334	 */
2335	dbuf_check_blkptr(dn, db);
2336
2337	/*
2338	 * If this buffer is in the middle of an immediate write,
2339	 * wait for the synchronous IO to complete.
2340	 */
2341	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2342		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2343		cv_wait(&db->db_changed, &db->db_mtx);
2344		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2345	}
2346
2347	if (db->db_state != DB_NOFILL &&
2348	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2349	    refcount_count(&db->db_holds) > 1 &&
2350	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2351	    *datap == db->db_buf) {
2352		/*
2353		 * If this buffer is currently "in use" (i.e., there
2354		 * are active holds and db_data still references it),
2355		 * then make a copy before we start the write so that
2356		 * any modifications from the open txg will not leak
2357		 * into this write.
2358		 *
2359		 * NOTE: this copy does not need to be made for
2360		 * objects only modified in the syncing context (e.g.
2361		 * DMU_OT_DNODE blocks).
2362		 */
2363		int blksz = arc_buf_size(*datap);
2364		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2365		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2366		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2367	}
2368	db->db_data_pending = dr;
2369
2370	mutex_exit(&db->db_mtx);
2371
2372	dbuf_write(dr, *datap, tx);
2373
2374	ASSERT(!list_link_active(&dr->dr_dirty_node));
2375	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2376		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2377		DB_DNODE_EXIT(db);
2378	} else {
2379		/*
2380		 * Although zio_nowait() does not "wait for an IO", it does
2381		 * initiate the IO. If this is an empty write it seems plausible
2382		 * that the IO could actually be completed before the nowait
2383		 * returns. We need to DB_DNODE_EXIT() first in case
2384		 * zio_nowait() invalidates the dbuf.
2385		 */
2386		DB_DNODE_EXIT(db);
2387		zio_nowait(dr->dr_zio);
2388	}
2389}
2390
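/*
 * Sync every dirty record on 'list', stopping early if an
 * already-initiated zio (the meta-dnode case) is found.
 */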
2391void
2392dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2393{
2394	dbuf_dirty_record_t *dr;
2395
2396	while ((dr = list_head(list)) != NULL) {
2397		if (dr->dr_zio != NULL) {
2398			/*
2399			 * If we find an already initialized zio then we
2400			 * are processing the meta-dnode, and we have finished.
2401			 * The dbufs for all dnodes are put back on the list
2402			 * during processing, so that we can zio_wait()
2403			 * these IOs after initiating all child IOs.
2404			 */
2405			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2406			    DMU_META_DNODE_OBJECT);
2407			break;
2408		}
2409		list_remove(list, dr);
2410		if (dr->dr_dbuf->db_level > 0)
2411			dbuf_sync_indirect(dr, tx);
2412		else
2413			dbuf_sync_leaf(dr, tx);
2414	}
2415}
2416
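/*
 * The "ready" callback for dbuf writes: charge the space delta to the
 * dnode, bump dn_maxblkid for new level-0 blocks, and fill in the block
 * pointer's blk_fill count.
 */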
2417/* ARGSUSED */
2418static void
2419dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2420{
2421	dmu_buf_impl_t *db = vdb;
2422	dnode_t *dn;
2423	blkptr_t *bp = zio->io_bp;
2424	blkptr_t *bp_orig = &zio->io_bp_orig;
2425	spa_t *spa = zio->io_spa;
2426	int64_t delta;
2427	uint64_t fill = 0;
2428	int i;
2429
2430	ASSERT(db->db_blkptr == bp);
2431
2432	DB_DNODE_ENTER(db);
2433	dn = DB_DNODE(db);
2434	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2435	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2436	zio->io_prev_space_delta = delta;
2437
2438	if (BP_IS_HOLE(bp)) {
2439		ASSERT(bp->blk_fill == 0);
2440		DB_DNODE_EXIT(db);
2441		return;
2442	}
2443
2444	ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2445	    BP_GET_TYPE(bp) == dn->dn_type) ||
2446	    (db->db_blkid == DMU_SPILL_BLKID &&
2447	    BP_GET_TYPE(bp) == dn->dn_bonustype));
2448	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2449
2450	mutex_enter(&db->db_mtx);
2451
2452#ifdef ZFS_DEBUG
2453	if (db->db_blkid == DMU_SPILL_BLKID) {
2454		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2455		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2456		    db->db_blkptr == &dn->dn_phys->dn_spill);
2457	}
2458#endif
2459
2460	if (db->db_level == 0) {
2461		mutex_enter(&dn->dn_mtx);
2462		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2463		    db->db_blkid != DMU_SPILL_BLKID)
2464			dn->dn_phys->dn_maxblkid = db->db_blkid;
2465		mutex_exit(&dn->dn_mtx);
2466
2467		if (dn->dn_type == DMU_OT_DNODE) {
2468			dnode_phys_t *dnp = db->db.db_data;
2469			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2470			    i--, dnp++) {
2471				if (dnp->dn_type != DMU_OT_NONE)
2472					fill++;
2473			}
2474		} else {
2475			fill = 1;
2476		}
2477	} else {
2478		blkptr_t *ibp = db->db.db_data;
2479		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2480		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2481			if (BP_IS_HOLE(ibp))
2482				continue;
2483			fill += ibp->blk_fill;
2484		}
2485	}
2486	DB_DNODE_EXIT(db);
2487
2488	bp->blk_fill = fill;
2489
2490	mutex_exit(&db->db_mtx);
2491}
2492
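/*
 * The "done" callback for dbuf writes: update the dataset's block
 * accounting (unless this was a nopwrite or rewrite), detach and free
 * the dirty record, arrange for ARC eviction callbacks, and drop the
 * hold that was taken when the dbuf was dirtied.
 */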
2493/* ARGSUSED */
2494static void
2495dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2496{
2497	dmu_buf_impl_t *db = vdb;
2498	blkptr_t *bp = zio->io_bp;
2499	blkptr_t *bp_orig = &zio->io_bp_orig;
2500	uint64_t txg = zio->io_txg;
2501	dbuf_dirty_record_t **drp, *dr;
2502
2503	ASSERT0(zio->io_error);
2504	ASSERT(db->db_blkptr == bp);
2505
2506	/*
2507	 * For nopwrites and rewrites we ensure that the bp matches our
2508	 * original and bypass all the accounting.
2509	 */
2510	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2511		ASSERT(BP_EQUAL(bp, bp_orig));
2512	} else {
2513		objset_t *os;
2514		dsl_dataset_t *ds;
2515		dmu_tx_t *tx;
2516
2517		DB_GET_OBJSET(&os, db);
2518		ds = os->os_dsl_dataset;
2519		tx = os->os_synctx;
2520
2521		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2522		dsl_dataset_block_born(ds, bp, tx);
2523	}
2524
2525	mutex_enter(&db->db_mtx);
2526
2527	DBUF_VERIFY(db);
2528
2529	drp = &db->db_last_dirty;
2530	while ((dr = *drp) != db->db_data_pending)
2531		drp = &dr->dr_next;
2532	ASSERT(!list_link_active(&dr->dr_dirty_node));
2533	ASSERT(dr->dr_txg == txg);
2534	ASSERT(dr->dr_dbuf == db);
2535	ASSERT(dr->dr_next == NULL);
2536	*drp = dr->dr_next;
2537
2538#ifdef ZFS_DEBUG
2539	if (db->db_blkid == DMU_SPILL_BLKID) {
2540		dnode_t *dn;
2541
2542		DB_DNODE_ENTER(db);
2543		dn = DB_DNODE(db);
2544		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2545		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2546		    db->db_blkptr == &dn->dn_phys->dn_spill);
2547		DB_DNODE_EXIT(db);
2548	}
2549#endif
2550
2551	if (db->db_level == 0) {
2552		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2553		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2554		if (db->db_state != DB_NOFILL) {
2555			if (dr->dt.dl.dr_data != db->db_buf)
2556				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2557				    db));
2558			else if (!arc_released(db->db_buf))
2559				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2560		}
2561	} else {
2562		dnode_t *dn;
2563
2564		DB_DNODE_ENTER(db);
2565		dn = DB_DNODE(db);
2566		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2567		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2568		if (!BP_IS_HOLE(db->db_blkptr)) {
2569			int epbs =
2570			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2571			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2572			    db->db.db_size);
2573			ASSERT3U(dn->dn_phys->dn_maxblkid
2574			    >> (db->db_level * epbs), >=, db->db_blkid);
2575			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2576		}
2577		DB_DNODE_EXIT(db);
2578		mutex_destroy(&dr->dt.di.dr_mtx);
2579		list_destroy(&dr->dt.di.dr_children);
2580	}
2581	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2582
2583	cv_broadcast(&db->db_changed);
2584	ASSERT(db->db_dirtycnt > 0);
2585	db->db_dirtycnt -= 1;
2586	db->db_data_pending = NULL;
2587	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2588}
2589
2590static void
2591dbuf_write_nofill_ready(zio_t *zio)
2592{
2593	dbuf_write_ready(zio, NULL, zio->io_private);
2594}
2595
2596static void
2597dbuf_write_nofill_done(zio_t *zio)
2598{
2599	dbuf_write_done(zio, NULL, zio->io_private);
2600}
2601
2602static void
2603dbuf_write_override_ready(zio_t *zio)
2604{
2605	dbuf_dirty_record_t *dr = zio->io_private;
2606	dmu_buf_impl_t *db = dr->dr_dbuf;
2607
2608	dbuf_write_ready(zio, NULL, db);
2609}
2610
2611static void
2612dbuf_write_override_done(zio_t *zio)
2613{
2614	dbuf_dirty_record_t *dr = zio->io_private;
2615	dmu_buf_impl_t *db = dr->dr_dbuf;
2616	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2617
2618	mutex_enter(&db->db_mtx);
2619	if (!BP_EQUAL(zio->io_bp, obp)) {
2620		if (!BP_IS_HOLE(obp))
2621			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2622		arc_release(dr->dt.dl.dr_data, db);
2623	}
2624	mutex_exit(&db->db_mtx);
2625
2626	dbuf_write_done(zio, NULL, db);
2627}
2628
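/*
 * Issue the write for a dirty record: pick the parent zio (the parent
 * dbuf's in-flight write, or the dnode's zio), construct the bookmark
 * and write policy, and then issue either an override write (for data
 * already written via dmu_sync()), a NOFILL write, or a normal
 * arc_write().
 */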
2629static void
2630dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2631{
2632	dmu_buf_impl_t *db = dr->dr_dbuf;
2633	dnode_t *dn;
2634	objset_t *os;
2635	dmu_buf_impl_t *parent = db->db_parent;
2636	uint64_t txg = tx->tx_txg;
2637	zbookmark_t zb;
2638	zio_prop_t zp;
2639	zio_t *zio;
2640	int wp_flag = 0;
2641
2642	DB_DNODE_ENTER(db);
2643	dn = DB_DNODE(db);
2644	os = dn->dn_objset;
2645
2646	if (db->db_state != DB_NOFILL) {
2647		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2648			/*
2649			 * Private object buffers are released here rather
2650			 * than in dbuf_dirty() since they are only modified
2651			 * in the syncing context and we don't want the
2652			 * overhead of making multiple copies of the data.
2653			 */
2654			if (BP_IS_HOLE(db->db_blkptr)) {
2655				arc_buf_thaw(data);
2656			} else {
2657				dbuf_release_bp(db);
2658			}
2659		}
2660	}
2661
2662	if (parent != dn->dn_dbuf) {
2663		ASSERT(parent && parent->db_data_pending);
2664		ASSERT(db->db_level == parent->db_level-1);
2665		ASSERT(arc_released(parent->db_buf));
2666		zio = parent->db_data_pending->dr_zio;
2667	} else {
2668		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2669		    db->db_blkid != DMU_SPILL_BLKID) ||
2670		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2671		if (db->db_blkid != DMU_SPILL_BLKID)
2672			ASSERT3P(db->db_blkptr, ==,
2673			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2674		zio = dn->dn_zio;
2675	}
2676
2677	ASSERT(db->db_level == 0 || data == db->db_buf);
2678	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2679	ASSERT(zio);
2680
2681	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2682	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2683	    db->db.db_object, db->db_level, db->db_blkid);
2684
2685	if (db->db_blkid == DMU_SPILL_BLKID)
2686		wp_flag = WP_SPILL;
2687	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2688
2689	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2690	DB_DNODE_EXIT(db);
2691
2692	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2693		ASSERT(db->db_state != DB_NOFILL);
2694		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2695		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2696		    dbuf_write_override_ready, dbuf_write_override_done, dr,
2697		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2698		mutex_enter(&db->db_mtx);
2699		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2700		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2701		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2702		mutex_exit(&db->db_mtx);
2703	} else if (db->db_state == DB_NOFILL) {
2704		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2705		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2706		    db->db_blkptr, NULL, db->db.db_size, &zp,
2707		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2708		    ZIO_PRIORITY_ASYNC_WRITE,
2709		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2710	} else {
2711		ASSERT(arc_released(data));
2712		dr->dr_zio = arc_write(zio, os->os_spa, txg,
2713		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
2714		    dbuf_write_ready, dbuf_write_done, db,
2715		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2716	}
2717}
2718