/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>
#include <sys/callb.h>

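/*
 * TSD (thread-specific data) key used to flag a thread that is currently
 * evicting from the dbuf cache, so that dbuf_evict_notify() does not
 * recurse into eviction from that same thread (see dbuf_evict_one()).
 */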
uint_t zfs_dbuf_evict_key;

/*
 * Number of times that zfs_free_range() took the slow path while doing
 * a zfs receive.  A nonzero value indicates a potential performance problem.
 */
uint64_t zfs_free_range_recv_miss;

static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_kmem_cache;
static taskq_t *dbu_evict_taskq;

static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
static kcondvar_t dbuf_evict_cv;
static boolean_t dbuf_evict_thread_exit;

/*
 * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
 * are not currently held but have been recently released. These dbufs
 * are not eligible for arc eviction until they are aged out of the cache.
 * Dbufs are added to the dbuf cache once the last hold is released. If a
 * dbuf is later accessed and still exists in the dbuf cache, then it will
 * be removed from the cache and later re-added to the head of the cache.
 * Dbufs that are aged out of the cache will be immediately destroyed and
 * become eligible for arc eviction.
 */
static multilist_t dbuf_cache;
static refcount_t dbuf_cache_size;
uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;

/* Cap the size of the dbuf cache to log2 fraction of arc size. */
int dbuf_cache_max_shift = 5;

/*
 * The dbuf cache uses a three-stage eviction policy:
 *	- A low water marker designates when the dbuf eviction thread
 *	should stop evicting from the dbuf cache.
 *	- When we reach the maximum size (aka mid water mark), we
 *	signal the eviction thread to run.
 *	- The high water mark indicates when the eviction thread
 *	is unable to keep up with the incoming load and eviction must
 *	happen in the context of the calling thread.
 *
 * The dbuf cache:
 *                                                 (max size)
 *                                      low water   mid water   hi water
 * +----------------------------------------+----------+----------+
 * |                                        |          |          |
 * |                                        |          |          |
 * |                                        |          |          |
 * |                                        |          |          |
 * +----------------------------------------+----------+----------+
 *                                        stop        signal     evict
 *                                      evicting     eviction   directly
 *                                                    thread
 *
 * The high and low water marks indicate the operating range for the eviction
 * thread. The low water mark is, by default, 90% of the total size of the
 * cache and the high water mark is at 110% (both of these percentages can be
 * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
 * respectively). The eviction thread will try to ensure that the cache remains
 * within this range by waking up every second and checking if the cache is
 * above the low water mark. The thread can also be woken up by callers adding
 * elements into the cache if the cache is larger than the mid water (i.e. max
 * cache size). Once the eviction thread is woken up and eviction is required,
 * it will continue evicting buffers until it's able to reduce the cache size
 * to the low water mark. If the cache size continues to grow and hits the high
 * water mark, then callers adding elements to the cache will begin to evict
 * directly from the cache until the cache is no longer above the high water
 * mark.
 */

/*
 * The percentage above and below the maximum cache size.
 */
uint_t dbuf_cache_hiwater_pct = 10;
uint_t dbuf_cache_lowater_pct = 10;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	multilist_link_init(&db->db_cache_link);
	refcount_create(&db->db_holds);

	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	ASSERT(!multilist_link_active(&db->db_cache_link));
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

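/*
 * Fold the (objset, object, level, blkid) tuple into a 64-bit hash by
 * running selected bytes of each field through the ZFS CRC-64 table,
 * then mixing in the higher-order bits that the byte-wise CRC missed.
 */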
static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = dbuf_hash(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

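/*
 * Find an object's bonus dbuf, if one has been instantiated.  On success
 * the dbuf is returned with its db_mtx held; the caller is responsible
 * for dropping it.
 */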
static dmu_buf_impl_t *
dbuf_find_bonus(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	dmu_buf_impl_t *db = NULL;

	if (dnode_hold(os, object, FTAG, &dn) == 0) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		if (dn->dn_bonus != NULL) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
		}
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
	}
	return (db);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = dbuf_hash(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_inc_64(&dbuf_hash_count);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  It must be in the EVICTING state.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_dec_64(&dbuf_hash_count);
}

typedef enum {
	DBVU_EVICTING,
	DBVU_NOT_EVICTING
} dbvu_verify_type_t;

static void
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
{
#ifdef ZFS_DEBUG
	int64_t holds;

	if (db->db_user == NULL)
		return;

	/* Only data blocks support the attachment of user data. */
	ASSERT(db->db_level == 0);

	/* Clients must resolve a dbuf before attaching user data. */
	ASSERT(db->db.db_data != NULL);
	ASSERT3U(db->db_state, ==, DB_CACHED);

	holds = refcount_count(&db->db_holds);
	if (verify_type == DBVU_EVICTING) {
		/*
		 * Immediate eviction occurs when holds == dirtycnt.
		 * For normal eviction buffers, holds is zero on
		 * eviction, except when dbuf_fix_old_data() calls
		 * dbuf_clear_data().  However, the hold count can grow
		 * during eviction even though db_mtx is held (see
		 * dmu_bonus_hold() for an example), so we can only
		 * test the generic invariant that holds >= dirtycnt.
		 */
		ASSERT3U(holds, >=, db->db_dirtycnt);
	} else {
		if (db->db_user_immediate_evict == TRUE)
			ASSERT3U(holds, >=, db->db_dirtycnt);
		else
			ASSERT3U(holds, >, 0);
	}
#endif
}

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
		return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

#ifdef ZFS_DEBUG
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
		*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

	/*
	 * Invoke the callback from a taskq to avoid lock order reversals
	 * and limit stack depth.
	 */
	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
	    &dbu->dbu_tqent);
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

/*
 * This function *must* return indices evenly distributed between all
 * sublists of the multilist. This is needed due to how the dbuf eviction
 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
 * distributed between all sublists and uses this assumption when
 * deciding which sublist to evict from and how much to evict from it.
 */
unsigned int
dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
{
	dmu_buf_impl_t *db = obj;

	/*
	 * The assumption here is that the hash value for a given
	 * dmu_buf_impl_t will remain constant throughout its lifetime
	 * (i.e. its objset, object, level and blkid fields don't change).
	 * Thus, we don't need to store the dbuf's sublist index
	 * on insertion, as this index can be recalculated on removal.
	 *
	 * Also, the low order bits of the hash value are thought to be
	 * distributed evenly. Otherwise, in the case that the multilist
	 * has a power of two number of sublists, each sublists' usage
	 * would not be evenly distributed.
	 */
	return (dbuf_hash(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid) %
	    multilist_get_num_sublists(ml));
}

static inline boolean_t
dbuf_cache_above_hiwater(void)
{
	uint64_t dbuf_cache_hiwater_bytes =
	    (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;

	return (refcount_count(&dbuf_cache_size) >
	    dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
}

static inline boolean_t
dbuf_cache_above_lowater(void)
{
	uint64_t dbuf_cache_lowater_bytes =
	    (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;

	return (refcount_count(&dbuf_cache_size) >
	    dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
}
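
/*
 * For example, with dbuf_cache_max_bytes at its default of 100MB and
 * both percentages at their default of 10, the eviction thread operates
 * between a 90MB low water mark (100MB - 10%) and a 110MB high water
 * mark (100MB + 10%).  Note that dbuf_init() may lower
 * dbuf_cache_max_bytes to a fraction of the ARC size.
 */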

/*
 * Evict the oldest eligible dbuf from the dbuf cache.
 */
static void
dbuf_evict_one(void)
{
	int idx = multilist_get_random_index(&dbuf_cache);
	multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);

	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

	/*
	 * Set the thread's tsd to indicate that it's processing evictions.
	 * Once a thread stops evicting from the dbuf cache it will
	 * reset its tsd to NULL.
	 */
	ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);

	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
		db = multilist_sublist_prev(mls, db);
	}

	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
	    multilist_sublist_t *, mls);

	if (db != NULL) {
		multilist_sublist_remove(mls, db);
		multilist_sublist_unlock(mls);
		(void) refcount_remove_many(&dbuf_cache_size,
		    db->db.db_size, db);
		dbuf_destroy(db);
	} else {
		multilist_sublist_unlock(mls);
	}
	(void) tsd_set(zfs_dbuf_evict_key, NULL);
}

/*
 * The dbuf evict thread is responsible for aging out dbufs from the
 * cache. Once the cache has reached its maximum size, dbufs are removed
 * and destroyed. The eviction thread will continue running until the size
 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
 * out of the cache it is destroyed and becomes eligible for arc eviction.
 */
static void
dbuf_evict_thread(void *dummy __unused)
{
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);

	mutex_enter(&dbuf_evict_lock);
	while (!dbuf_evict_thread_exit) {
		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait_hires(&dbuf_evict_cv,
			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
		}
		mutex_exit(&dbuf_evict_lock);

		/*
		 * Keep evicting as long as we're above the low water mark
		 * for the cache. We do this without holding the locks to
		 * minimize lock contention.
		 */
		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			dbuf_evict_one();
		}

		mutex_enter(&dbuf_evict_lock);
	}

	dbuf_evict_thread_exit = B_FALSE;
	cv_broadcast(&dbuf_evict_cv);
	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
	thread_exit();
}

/*
 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
 * If the dbuf cache is at its high water mark, then evict a dbuf from the
 * dbuf cache using the caller's context.
 */
static void
dbuf_evict_notify(void)
{

	/*
	 * We use thread specific data to track when a thread has
	 * started processing evictions. This allows us to avoid deeply
	 * nested stacks that would have a call flow similar to this:
	 *
	 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
	 *	^						|
	 *	|						|
	 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
	 *
	 * The dbuf_eviction_thread will always have its tsd set until
	 * that thread exits. All other threads will only set their tsd
	 * if they are participating in the eviction process. This only
	 * happens if the eviction thread is unable to process evictions
	 * fast enough. To keep the dbuf cache size in check, other threads
	 * can evict from the dbuf cache directly. Those threads will set
	 * their tsd values so that we ensure that they only evict one dbuf
	 * from the dbuf cache.
	 */
	if (tsd_get(zfs_dbuf_evict_key) != NULL)
		return;

	if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
		boolean_t evict_now = B_FALSE;

		mutex_enter(&dbuf_evict_lock);
		if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
			evict_now = dbuf_cache_above_hiwater();
			cv_signal(&dbuf_evict_cv);
		}
		mutex_exit(&dbuf_evict_lock);

		if (evict_now) {
			dbuf_evict_one();
		}
	}
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Set up the parameters for the dbuf cache. We cap the size of the
	 * dbuf cache to 1/32nd (by default) of the size of the ARC.
	 */
	dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
	    arc_max_bytes() >> dbuf_cache_max_shift);

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);

	multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_cache_link),
	    zfs_arc_num_sublists_per_state,
	    dbuf_cache_multilist_index_func);
	refcount_create(&dbuf_cache_size);

	tsd_create(&zfs_dbuf_evict_key, NULL);
	dbuf_evict_thread_exit = B_FALSE;
	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_kmem_cache);
	taskq_destroy(dbu_evict_taskq);

	mutex_enter(&dbuf_evict_lock);
	dbuf_evict_thread_exit = B_TRUE;
	while (dbuf_evict_thread_exit) {
		cv_signal(&dbuf_evict_cv);
		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
	}
	mutex_exit(&dbuf_evict_lock);
	tsd_destroy(&zfs_dbuf_evict_key);

	mutex_destroy(&dbuf_evict_lock);
	cv_destroy(&dbuf_evict_cv);

	refcount_destroy(&dbuf_cache_size);
	multilist_destroy(&dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 *
		 * There is an exception to this rule for indirect blocks; in
		 * this case, if the indirect block is a hole, we fill in a few
		 * fields on each of the child blocks (importantly, birth time)
		 * to prevent hole birth times from being lost when you
		 * partially fill in a hole.
		 */
		if (db->db_dirtycnt == 0) {
			if (db->db_level == 0) {
				uint64_t *buf = db->db.db_data;
				int i;

				for (i = 0; i < db->db.db_size >> 3; i++) {
					ASSERT(buf[i] == 0);
				}
			} else {
				blkptr_t *bps = db->db.db_data;
				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
				    db->db.db_size);
				/*
				 * We want to verify that all the blkptrs in the
				 * indirect block are holes, but we may have
				 * automatically set up a few fields for them.
				 * We iterate through each blkptr and verify
				 * they only have those fields set.
				 */
				for (int i = 0;
				    i < db->db.db_size / sizeof (blkptr_t);
				    i++) {
					blkptr_t *bp = &bps[i];
					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT0(bp->blk_pad[0]);
					ASSERT0(bp->blk_pad[1]);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(bp->blk_phys_birth);
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

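/*
 * Detach the dbuf from its data.  The caller must already hold db_mtx
 * and must have dealt with db_buf itself; any attached user is evicted
 * and the dbuf transitions to DB_UNCACHED (unless it is a DB_NOFILL
 * buffer).
 */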
static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	ASSERT3P(db->db_buf, ==, NULL);
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
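	/*
	 * If the buf is still shared (released to the ARC or held by
	 * more than one holder), hand the caller a loaned copy of the
	 * data; otherwise transfer ownership of db_buf itself and
	 * detach it from the dbuf.
	 */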
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		db->db_buf = NULL;
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

/*
 * Calculate which level n block references the data at the level 0 offset
 * provided.
 */
uint64_t
dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
		/*
		 * The level n blkid is equal to the level 0 blkid divided by
		 * the number of level 0s in a level n block.
		 *
		 * The level 0 blkid is offset >> datablkshift =
		 * offset / 2^datablkshift.
		 *
		 * The number of level 0s in a level n is the number of block
		 * pointers in an indirect block, raised to the power of level.
		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
		 *
		 * Thus, the level n blkid is: offset /
		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
		 * = offset / 2^(datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 * = offset >> (datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 */
		return (offset >> (dn->dn_datablkshift + level *
		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
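
/*
 * For example, with 128K data blocks (datablkshift = 17) and 16K
 * indirect blocks (indblkshift = 14, so 2^(14 - SPA_BLKPTRSHIFT) = 128
 * block pointers each), the level 1 blkid for offset 0x1234567 is
 * 0x1234567 >> (17 + 1 * 7) = 1.
 */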

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		arc_buf_destroy(buf, db);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	arc_flags_t aflags = ARC_FLAG_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);

		if (db->db_blkptr != NULL && db->db_level > 0 &&
		    BP_IS_HOLE(db->db_blkptr) &&
		    db->db_blkptr->blk_birth != 0) {
			blkptr_t *bps = db->db.db_data;
			for (int i = 0; i < ((1 <<
			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
			    i++) {
				blkptr_t *bp = &bps[i];
				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
				    1 << dn->dn_indblkshift);
				BP_SET_LSIZE(bp,
				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
				    dn->dn_datablksz :
				    BP_GET_LSIZE(db->db_blkptr));
				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
				BP_SET_LEVEL(bp,
				    BP_GET_LEVEL(db->db_blkptr) - 1);
				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
			}
		}
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers that have been modified in a previous transaction
 * group before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it still references the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		db->db_buf = NULL;
		dbuf_clear_data(db);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_destroy(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
1324168404Spjd		/* clear the contents if it's cached */
1325168404Spjd		if (db->db_state == DB_CACHED) {
1326168404Spjd			ASSERT(db->db.db_data != NULL);
1327168404Spjd			arc_release(db->db_buf, db);
1328168404Spjd			bzero(db->db.db_data, db->db.db_size);
1329168404Spjd			arc_buf_freeze(db->db_buf);
1330168404Spjd		}
1331168404Spjd
1332168404Spjd		mutex_exit(&db->db_mtx);
1333168404Spjd	}
1334168404Spjd	mutex_exit(&dn->dn_dbufs_mtx);
1335168404Spjd}
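
/*
 * Illustrative note on the fast path above: dbuf_create() (below)
 * advances dn_unlisted_l0_blkid past every level-0 dbuf it links onto
 * dn_dbufs, so the invariant
 *
 *	db->db_blkid < dn->dn_unlisted_l0_blkid
 *
 * holds for all listed level-0 dbufs.  A free range starting at or
 * above that watermark therefore cannot overlap any level-0 dbuf and
 * the AVL walk is skipped; during a zfs receive this is the expected
 * case, and falling through to the slow path is what increments
 * zfs_free_range_recv_miss.
 */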
1336168404Spjd
1337168404Spjdstatic int
1338185029Spjddbuf_block_freeable(dmu_buf_impl_t *db)
1339168404Spjd{
1340168404Spjd	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
1341168404Spjd	uint64_t birth_txg = 0;
1342168404Spjd
1343168404Spjd	/*
1344168404Spjd	 * We don't need any locking to protect db_blkptr:
1345168404Spjd	 * If it's syncing, then db_last_dirty will be set
1346168404Spjd	 * so we'll ignore db_blkptr.
1347263397Sdelphij	 *
1348263397Sdelphij	 * This logic ensures that only block births for
1349263397Sdelphij	 * filled blocks are considered.
1350168404Spjd	 */
1351168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
1352263397Sdelphij	if (db->db_last_dirty && (db->db_blkptr == NULL ||
1353263397Sdelphij	    !BP_IS_HOLE(db->db_blkptr))) {
1354168404Spjd		birth_txg = db->db_last_dirty->dr_txg;
1355263397Sdelphij	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1356168404Spjd		birth_txg = db->db_blkptr->blk_birth;
1357263397Sdelphij	}
1358168404Spjd
1359219089Spjd	/*
1360263397Sdelphij	 * If this block doesn't exist or is in a snapshot, it can't be freed.
1361219089Spjd	 * Don't pass the bp to dsl_dataset_block_freeable() since we
1362219089Spjd	 * are holding the db_mtx lock and might deadlock if we are
1363219089Spjd	 * prefetching a dedup-ed block.
1364219089Spjd	 */
1365263397Sdelphij	if (birth_txg != 0)
1366185029Spjd		return (ds == NULL ||
1367219089Spjd		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
1368168404Spjd	else
1369263397Sdelphij		return (B_FALSE);
1370168404Spjd}
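
/*
 * Worked example, assuming the usual dsl_dataset_block_freeable()
 * semantics (a block only frees space if it was born after the most
 * recent snapshot): with a snapshot taken in txg 150, a block whose
 * birth_txg is 100 is not freeable -- the snapshot still references
 * it -- while one born in txg 200 is.  A hole (birth_txg == 0) is
 * never freeable because nothing was ever allocated on disk.
 */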
1371168404Spjd
1372168404Spjdvoid
1373168404Spjddbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1374168404Spjd{
1375168404Spjd	arc_buf_t *buf, *obuf;
1376168404Spjd	int osize = db->db.db_size;
1377168404Spjd	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1378219089Spjd	dnode_t *dn;
1379168404Spjd
1380219089Spjd	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1381168404Spjd
1382219089Spjd	DB_DNODE_ENTER(db);
1383219089Spjd	dn = DB_DNODE(db);
1384219089Spjd
1385168404Spjd	/* XXX does *this* func really need the lock? */
1386219089Spjd	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1387168404Spjd
1388168404Spjd	/*
1389263397Sdelphij	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
1390168404Spjd	 * is OK, because there can be no other references to the db
1391168404Spjd	 * when we are changing its size, so no concurrent DB_FILL can
1392168404Spjd	 * be happening.
1393168404Spjd	 */
1394168404Spjd	/*
1395168404Spjd	 * XXX we should be doing a dbuf_read, checking the return
1396168404Spjd	 * value and returning that up to our callers
1397168404Spjd	 */
1398263397Sdelphij	dmu_buf_will_dirty(&db->db, tx);
1399168404Spjd
1400168404Spjd	/* create the data buffer for the new block */
1401307266Smav	buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);
1402168404Spjd
1403168404Spjd	/* copy old block data to the new block */
1404168404Spjd	obuf = db->db_buf;
1405168404Spjd	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1406168404Spjd	/* zero the remainder */
1407168404Spjd	if (size > osize)
1408168404Spjd		bzero((uint8_t *)buf->b_data + osize, size - osize);
1409168404Spjd
1410168404Spjd	mutex_enter(&db->db_mtx);
1411168404Spjd	dbuf_set_data(db, buf);
1412307266Smav	arc_buf_destroy(obuf, db);
1413168404Spjd	db->db.db_size = size;
1414168404Spjd
1415168404Spjd	if (db->db_level == 0) {
1416168404Spjd		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1417168404Spjd		db->db_last_dirty->dt.dl.dr_data = buf;
1418168404Spjd	}
1419168404Spjd	mutex_exit(&db->db_mtx);
1420168404Spjd
1421219089Spjd	dnode_willuse_space(dn, size-osize, tx);
1422219089Spjd	DB_DNODE_EXIT(db);
1423168404Spjd}
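
/*
 * Minimal usage sketch (cf. dbuf_spill_set_blksz() below): callers
 * must hold dn_struct_rwlock as writer across the resize:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 *	dbuf_new_size(db, blksz, tx);
 *	rw_exit(&dn->dn_struct_rwlock);
 */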
1424168404Spjd
1425219089Spjdvoid
1426219089Spjddbuf_release_bp(dmu_buf_impl_t *db)
1427219089Spjd{
1428263397Sdelphij	objset_t *os = db->db_objset;
1429219089Spjd
1430219089Spjd	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1431219089Spjd	ASSERT(arc_released(os->os_phys_buf) ||
1432219089Spjd	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
1433219089Spjd	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1434219089Spjd
1435246666Smm	(void) arc_release(db->db_buf, db);
1436219089Spjd}
1437219089Spjd
1438290750Smav/*
1439290750Smav * We already have a dirty record for this TXG, and we are being
1440290750Smav * dirtied again.
1441290750Smav */
1442290750Smavstatic void
1443290750Smavdbuf_redirty(dbuf_dirty_record_t *dr)
1444290750Smav{
1445290750Smav	dmu_buf_impl_t *db = dr->dr_dbuf;
1446290750Smav
1447290750Smav	ASSERT(MUTEX_HELD(&db->db_mtx));
1448290750Smav
1449290750Smav	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1450290750Smav		/*
1451290750Smav		 * If this buffer has already been written out,
1452290750Smav		 * we now need to reset its state.
1453290750Smav		 */
1454290750Smav		dbuf_unoverride(dr);
1455290750Smav		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1456290750Smav		    db->db_state != DB_NOFILL) {
1457290750Smav			/* Already released on initial dirty, so just thaw. */
1458290750Smav			ASSERT(arc_released(db->db_buf));
1459290750Smav			arc_buf_thaw(db->db_buf);
1460290750Smav		}
1461290750Smav	}
1462290750Smav}
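
/*
 * Simplified sketch of the freeze/thaw cycle dbuf_redirty() relies on
 * (the freeze itself happens in the ARC write path, outside this
 * file):
 *
 *	dbuf_dirty()	-> arc_release(db->db_buf, db)
 *	block written	-> buffer checksum frozen by the ARC
 *	dbuf_redirty()	-> dbuf_unoverride() + arc_buf_thaw(db->db_buf)
 *
 * after which the data may legally change again within the same txg.
 */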
1463290750Smav
1464168404Spjddbuf_dirty_record_t *
1465168404Spjddbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1466168404Spjd{
1467219089Spjd	dnode_t *dn;
1468219089Spjd	objset_t *os;
1469168404Spjd	dbuf_dirty_record_t **drp, *dr;
1470168404Spjd	int drop_struct_lock = FALSE;
1471185029Spjd	boolean_t do_free_accounting = B_FALSE;
1472168404Spjd	int txgoff = tx->tx_txg & TXG_MASK;
1473168404Spjd
1474168404Spjd	ASSERT(tx->tx_txg != 0);
1475168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
1476168404Spjd	DMU_TX_DIRTY_BUF(tx, db);
1477168404Spjd
1478219089Spjd	DB_DNODE_ENTER(db);
1479219089Spjd	dn = DB_DNODE(db);
1480168404Spjd	/*
1481168404Spjd	 * Shouldn't dirty a regular buffer in syncing context.  Private
1482168404Spjd	 * objects may be dirtied in syncing context, but only if they
1483168404Spjd	 * were already pre-dirtied in open context.
1484168404Spjd	 */
1485308083Smav#ifdef DEBUG
1486308083Smav	if (dn->dn_objset->os_dsl_dataset != NULL) {
1487308083Smav		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1488308083Smav		    RW_READER, FTAG);
1489308083Smav	}
1490168404Spjd	ASSERT(!dmu_tx_is_syncing(tx) ||
1491168404Spjd	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1492209962Smm	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1493209962Smm	    dn->dn_objset->os_dsl_dataset == NULL);
1494308083Smav	if (dn->dn_objset->os_dsl_dataset != NULL)
1495308083Smav		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
1496308083Smav#endif
1497168404Spjd	/*
1498168404Spjd	 * We make this assert for private objects as well, but after we
1499168404Spjd	 * check if we're already dirty.  They are allowed to re-dirty
1500168404Spjd	 * in syncing context.
1501168404Spjd	 */
1502168404Spjd	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1503168404Spjd	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1504168404Spjd	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1505168404Spjd
1506168404Spjd	mutex_enter(&db->db_mtx);
1507168404Spjd	/*
1508168404Spjd	 * XXX make this true for indirects too?  The problem is that
1509168404Spjd	 * transactions created with dmu_tx_create_assigned() from
1510168404Spjd	 * syncing context don't bother holding ahead.
1511168404Spjd	 */
1512168404Spjd	ASSERT(db->db_level != 0 ||
1513219089Spjd	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1514219089Spjd	    db->db_state == DB_NOFILL);
1515168404Spjd
1516168404Spjd	mutex_enter(&dn->dn_mtx);
1517168404Spjd	/*
1518168404Spjd	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1519168404Spjd	 * initialize the objset.
1520168404Spjd	 */
1521308083Smav	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
1522308083Smav		if (dn->dn_objset->os_dsl_dataset != NULL) {
1523308083Smav			rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1524308083Smav			    RW_READER, FTAG);
1525308083Smav		}
1526308083Smav		if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1527308083Smav			dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
1528308083Smav			    DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1529308083Smav			ASSERT(dn->dn_dirtyctx_firstset == NULL);
1530308083Smav			dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1531308083Smav		}
1532308083Smav		if (dn->dn_objset->os_dsl_dataset != NULL) {
1533308083Smav			rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1534308083Smav			    FTAG);
1535308083Smav		}
1536168404Spjd	}
1537168404Spjd	mutex_exit(&dn->dn_mtx);
1538168404Spjd
1539219089Spjd	if (db->db_blkid == DMU_SPILL_BLKID)
1540219089Spjd		dn->dn_have_spill = B_TRUE;
1541219089Spjd
1542168404Spjd	/*
1543168404Spjd	 * If this buffer is already dirty, we're done.
1544168404Spjd	 */
1545168404Spjd	drp = &db->db_last_dirty;
1546168404Spjd	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1547168404Spjd	    db->db.db_object == DMU_META_DNODE_OBJECT);
1548185029Spjd	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1549185029Spjd		drp = &dr->dr_next;
1550185029Spjd	if (dr && dr->dr_txg == tx->tx_txg) {
1551219089Spjd		DB_DNODE_EXIT(db);
1552219089Spjd
1553290750Smav		dbuf_redirty(dr);
1554168404Spjd		mutex_exit(&db->db_mtx);
1555185029Spjd		return (dr);
1556168404Spjd	}
1557168404Spjd
1558168404Spjd	/*
1559168404Spjd	 * Only valid if not already dirty.
1560168404Spjd	 */
1561209962Smm	ASSERT(dn->dn_object == 0 ||
1562209962Smm	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1563168404Spjd	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1564168404Spjd
1565168404Spjd	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1566168404Spjd	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1567168404Spjd	    dn->dn_phys->dn_nlevels > db->db_level ||
1568168404Spjd	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1569168404Spjd	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1570168404Spjd	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1571168404Spjd
1572168404Spjd	/*
1573168404Spjd	 * We should only be dirtying in syncing context if it's the
1574209962Smm	 * mos or we're initializing the os or it's a special object.
1575209962Smm	 * However, we are allowed to dirty in syncing context provided
1576209962Smm	 * we already dirtied it in open context.  Hence we must make
1577209962Smm	 * this assertion only if we're not already dirty.
1578168404Spjd	 */
1579219089Spjd	os = dn->dn_objset;
1580308083Smav#ifdef DEBUG
1581308083Smav	if (dn->dn_objset->os_dsl_dataset != NULL)
1582308083Smav		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
1583209962Smm	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1584209962Smm	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1585308083Smav	if (dn->dn_objset->os_dsl_dataset != NULL)
1586308083Smav		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
1587308083Smav#endif
1588168404Spjd	ASSERT(db->db.db_size != 0);
1589168404Spjd
1590168404Spjd	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1591168404Spjd
1592219089Spjd	if (db->db_blkid != DMU_BONUS_BLKID) {
1593185029Spjd		/*
1594185029Spjd		 * Update the accounting.
1595185029Spjd		 * Note: we delay "free accounting" until after we drop
1596185029Spjd		 * the db_mtx.  This keeps us from grabbing other locks
1597219089Spjd		 * (and possibly deadlocking) in bp_get_dsize() while
1598185029Spjd		 * also holding the db_mtx.
1599185029Spjd		 */
1600185029Spjd		dnode_willuse_space(dn, db->db.db_size, tx);
1601185029Spjd		do_free_accounting = dbuf_block_freeable(db);
1602185029Spjd	}
1603185029Spjd
1604168404Spjd	/*
1605168404Spjd	 * If this buffer is dirty in an old transaction group we need
1606168404Spjd	 * to make a copy of it so that the changes we make in this
1607168404Spjd	 * transaction group won't leak out when we sync the older txg.
1608168404Spjd	 */
1609168404Spjd	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1610168404Spjd	if (db->db_level == 0) {
1611168404Spjd		void *data_old = db->db_buf;
1612168404Spjd
1613219089Spjd		if (db->db_state != DB_NOFILL) {
1614219089Spjd			if (db->db_blkid == DMU_BONUS_BLKID) {
1615219089Spjd				dbuf_fix_old_data(db, tx->tx_txg);
1616219089Spjd				data_old = db->db.db_data;
1617219089Spjd			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1618219089Spjd				/*
1619219089Spjd				 * Release the data buffer from the cache so
1620219089Spjd				 * that we can modify it without impacting
1621219089Spjd				 * possible other users of this cached data
1622219089Spjd				 * block.  Note that indirect blocks and
1623219089Spjd				 * private objects are not released until the
1624219089Spjd				 * syncing state (since they are only modified
1625219089Spjd				 * then).
1626219089Spjd				 */
1627219089Spjd				arc_release(db->db_buf, db);
1628219089Spjd				dbuf_fix_old_data(db, tx->tx_txg);
1629219089Spjd				data_old = db->db_buf;
1630219089Spjd			}
1631219089Spjd			ASSERT(data_old != NULL);
1632168404Spjd		}
1633168404Spjd		dr->dt.dl.dr_data = data_old;
1634168404Spjd	} else {
1635168404Spjd		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1636168404Spjd		list_create(&dr->dt.di.dr_children,
1637168404Spjd		    sizeof (dbuf_dirty_record_t),
1638168404Spjd		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1639168404Spjd	}
1640260763Savg	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1641260763Savg		dr->dr_accounted = db->db.db_size;
1642168404Spjd	dr->dr_dbuf = db;
1643168404Spjd	dr->dr_txg = tx->tx_txg;
1644168404Spjd	dr->dr_next = *drp;
1645168404Spjd	*drp = dr;
1646168404Spjd
1647168404Spjd	/*
1648168404Spjd	 * We could have been freed_in_flight between the dbuf_noread
1649168404Spjd	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1650168404Spjd	 * happened after the free.
1651168404Spjd	 */
1652219089Spjd	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1653219089Spjd	    db->db_blkid != DMU_SPILL_BLKID) {
1654168404Spjd		mutex_enter(&dn->dn_mtx);
1655265740Sdelphij		if (dn->dn_free_ranges[txgoff] != NULL) {
1656265740Sdelphij			range_tree_clear(dn->dn_free_ranges[txgoff],
1657265740Sdelphij			    db->db_blkid, 1);
1658265740Sdelphij		}
1659168404Spjd		mutex_exit(&dn->dn_mtx);
1660168404Spjd		db->db_freed_in_flight = FALSE;
1661168404Spjd	}
1662168404Spjd
1663168404Spjd	/*
1664168404Spjd	 * This buffer is now part of this txg
1665168404Spjd	 */
1666168404Spjd	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1667168404Spjd	db->db_dirtycnt += 1;
1668168404Spjd	ASSERT3U(db->db_dirtycnt, <=, 3);
1669168404Spjd
1670168404Spjd	mutex_exit(&db->db_mtx);
1671168404Spjd
1672219089Spjd	if (db->db_blkid == DMU_BONUS_BLKID ||
1673219089Spjd	    db->db_blkid == DMU_SPILL_BLKID) {
1674168404Spjd		mutex_enter(&dn->dn_mtx);
1675168404Spjd		ASSERT(!list_link_active(&dr->dr_dirty_node));
1676168404Spjd		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1677168404Spjd		mutex_exit(&dn->dn_mtx);
1678168404Spjd		dnode_setdirty(dn, tx);
1679219089Spjd		DB_DNODE_EXIT(db);
1680168404Spjd		return (dr);
1681307270Smav	}
1682307270Smav
1683307270Smav	/*
1684307270Smav	 * The dn_struct_rwlock prevents db_blkptr from changing
1685307270Smav	 * due to a write from syncing context completing
1686307270Smav	 * while we are running, so we want to acquire it before
1687307270Smav	 * looking at db_blkptr.
1688307270Smav	 */
1689307270Smav	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1690307270Smav		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1691307270Smav		drop_struct_lock = TRUE;
1692307270Smav	}
1693307270Smav
1694307270Smav	if (do_free_accounting) {
1695185029Spjd		blkptr_t *bp = db->db_blkptr;
1696185029Spjd		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1697219089Spjd		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1698185029Spjd		/*
1699185029Spjd		 * This is only a guess -- if the dbuf is dirty
1700185029Spjd		 * in a previous txg, we don't know how much
1701185029Spjd		 * space it will use on disk yet.  We should
1702185029Spjd		 * really have the struct_rwlock to access
1703185029Spjd		 * db_blkptr, but since this is just a guess,
1704185029Spjd		 * it's OK if we get an odd answer.
1705185029Spjd		 */
1706219089Spjd		ddt_prefetch(os->os_spa, bp);
1707185029Spjd		dnode_willuse_space(dn, -willfree, tx);
1708168404Spjd	}
1709168404Spjd
1710185029Spjd	if (db->db_level == 0) {
1711185029Spjd		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1712185029Spjd		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1713185029Spjd	}
1714185029Spjd
1715168404Spjd	if (db->db_level+1 < dn->dn_nlevels) {
1716168404Spjd		dmu_buf_impl_t *parent = db->db_parent;
1717168404Spjd		dbuf_dirty_record_t *di;
1718168404Spjd		int parent_held = FALSE;
1719168404Spjd
1720168404Spjd		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1721168404Spjd			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1722168404Spjd
1723168404Spjd			parent = dbuf_hold_level(dn, db->db_level+1,
1724168404Spjd			    db->db_blkid >> epbs, FTAG);
1725219089Spjd			ASSERT(parent != NULL);
1726168404Spjd			parent_held = TRUE;
1727168404Spjd		}
1728168404Spjd		if (drop_struct_lock)
1729168404Spjd			rw_exit(&dn->dn_struct_rwlock);
1730168404Spjd		ASSERT3U(db->db_level+1, ==, parent->db_level);
1731168404Spjd		di = dbuf_dirty(parent, tx);
1732168404Spjd		if (parent_held)
1733168404Spjd			dbuf_rele(parent, FTAG);
1734168404Spjd
1735168404Spjd		mutex_enter(&db->db_mtx);
1736260763Savg		/*
1737260763Savg		 * Since we've dropped the mutex, it's possible that
1738260763Savg		 * dbuf_undirty() might have changed this out from under us.
1739260763Savg		 */
1740168404Spjd		if (db->db_last_dirty == dr ||
1741168404Spjd		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1742168404Spjd			mutex_enter(&di->dt.di.dr_mtx);
1743168404Spjd			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1744168404Spjd			ASSERT(!list_link_active(&dr->dr_dirty_node));
1745168404Spjd			list_insert_tail(&di->dt.di.dr_children, dr);
1746168404Spjd			mutex_exit(&di->dt.di.dr_mtx);
1747168404Spjd			dr->dr_parent = di;
1748168404Spjd		}
1749168404Spjd		mutex_exit(&db->db_mtx);
1750168404Spjd	} else {
1751168404Spjd		ASSERT(db->db_level+1 == dn->dn_nlevels);
1752168404Spjd		ASSERT(db->db_blkid < dn->dn_nblkptr);
1753219089Spjd		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1754168404Spjd		mutex_enter(&dn->dn_mtx);
1755168404Spjd		ASSERT(!list_link_active(&dr->dr_dirty_node));
1756168404Spjd		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1757168404Spjd		mutex_exit(&dn->dn_mtx);
1758168404Spjd		if (drop_struct_lock)
1759168404Spjd			rw_exit(&dn->dn_struct_rwlock);
1760168404Spjd	}
1761168404Spjd
1762168404Spjd	dnode_setdirty(dn, tx);
1763219089Spjd	DB_DNODE_EXIT(db);
1764168404Spjd	return (dr);
1765168404Spjd}
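
/*
 * Descriptive sketch of the structure dbuf_dirty() maintains: dirty
 * records hang off db_last_dirty in descending txg order, one per txg
 * in which the dbuf is dirty, e.g.
 *
 *	db->db_last_dirty -> dr(txg 103) -> dr(txg 102) -> dr(txg 101)
 *
 * which is why the insertion loop above walks dr_next while
 * dr_txg > tx_txg, and why db_dirtycnt can never exceed 3 (the open,
 * quiescing, and syncing txgs).
 */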
1766168404Spjd
1767248571Smm/*
1768251629Sdelphij * Undirty a buffer in the transaction group referenced by the given
1769251629Sdelphij * transaction.  Return whether this evicted the dbuf.
1770248571Smm */
1771248571Smmstatic boolean_t
1772168404Spjddbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1773168404Spjd{
1774219089Spjd	dnode_t *dn;
1775168404Spjd	uint64_t txg = tx->tx_txg;
1776185029Spjd	dbuf_dirty_record_t *dr, **drp;
1777168404Spjd
1778168404Spjd	ASSERT(txg != 0);
1779285202Savg
1780285202Savg	/*
1781285202Savg	 * Due to our use of dn_nlevels below, this can only be called
1782285202Savg	 * in open context, unless we are operating on the MOS.
1783285202Savg	 * From syncing context, dn_nlevels may be different from the
1784285202Savg	 * dn_nlevels used when dbuf was dirtied.
1785285202Savg	 */
1786285202Savg	ASSERT(db->db_objset ==
1787285202Savg	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
1788285202Savg	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
1789219089Spjd	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1790248571Smm	ASSERT0(db->db_level);
1791248571Smm	ASSERT(MUTEX_HELD(&db->db_mtx));
1792168404Spjd
1793168404Spjd	/*
1794168404Spjd	 * If this buffer is not dirty, we're done.
1795168404Spjd	 */
1796185029Spjd	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1797168404Spjd		if (dr->dr_txg <= txg)
1798168404Spjd			break;
1799248571Smm	if (dr == NULL || dr->dr_txg < txg)
1800248571Smm		return (B_FALSE);
1801168404Spjd	ASSERT(dr->dr_txg == txg);
1802219089Spjd	ASSERT(dr->dr_dbuf == db);
1803168404Spjd
1804219089Spjd	DB_DNODE_ENTER(db);
1805219089Spjd	dn = DB_DNODE(db);
1806219089Spjd
1807168404Spjd	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1808168404Spjd
1809168404Spjd	ASSERT(db->db.db_size != 0);
1810168404Spjd
1811285202Savg	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
1812285202Savg	    dr->dr_accounted, txg);
1813168404Spjd
1814185029Spjd	*drp = dr->dr_next;
1815168404Spjd
1816219636Spjd	/*
1817219636Spjd	 * Note that there are three places in dbuf_dirty()
1818219636Spjd	 * where this dirty record may be put on a list.
1819219636Spjd	 * Make sure to do a list_remove corresponding to
1820219636Spjd	 * every one of those list_insert calls.
1821219636Spjd	 */
1822168404Spjd	if (dr->dr_parent) {
1823168404Spjd		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1824168404Spjd		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1825168404Spjd		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1826219636Spjd	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1827285202Savg	    db->db_level + 1 == dn->dn_nlevels) {
1828185029Spjd		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1829168404Spjd		mutex_enter(&dn->dn_mtx);
1830168404Spjd		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1831168404Spjd		mutex_exit(&dn->dn_mtx);
1832168404Spjd	}
1833219089Spjd	DB_DNODE_EXIT(db);
1834168404Spjd
1835248571Smm	if (db->db_state != DB_NOFILL) {
1836248571Smm		dbuf_unoverride(dr);
1837168404Spjd
1838168404Spjd		ASSERT(db->db_buf != NULL);
1839248571Smm		ASSERT(dr->dt.dl.dr_data != NULL);
1840248571Smm		if (dr->dt.dl.dr_data != db->db_buf)
1841307266Smav			arc_buf_destroy(dr->dt.dl.dr_data, db);
1842168404Spjd	}
1843269218Sdelphij
1844168404Spjd	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1845168404Spjd
1846168404Spjd	ASSERT(db->db_dirtycnt > 0);
1847168404Spjd	db->db_dirtycnt -= 1;
1848168404Spjd
1849168404Spjd	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1850307266Smav		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
1851307266Smav		dbuf_destroy(db);
1852248571Smm		return (B_TRUE);
1853168404Spjd	}
1854168404Spjd
1855248571Smm	return (B_FALSE);
1856168404Spjd}
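
/*
 * For reference, the three list_insert sites in dbuf_dirty() that the
 * comment above alludes to are:
 *
 *	1. bonus/spill dbufs onto dn_dirty_records[txgoff];
 *	2. children of an indirect onto di->dt.di.dr_children;
 *	3. top-level dbufs onto dn_dirty_records[txgoff].
 *
 * The two list_remove calls above mirror them exactly (cases 1 and 3
 * share the dn_dirty_records removal).
 */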
1857168404Spjd
1858168404Spjdvoid
1859263397Sdelphijdmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1860168404Spjd{
1861263397Sdelphij	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1862185029Spjd	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1863168404Spjd
1864168404Spjd	ASSERT(tx->tx_txg != 0);
1865168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
1866168404Spjd
1867290750Smav	/*
1868290750Smav	 * Quick check for dirtiness.  For already dirty blocks, this
1869290750Smav	 * reduces runtime of this function by >90%, and overall performance
1870290750Smav	 * by 50% for some workloads (e.g. file deletion with indirect blocks
1871290750Smav	 * cached).
1872290750Smav	 */
1873290750Smav	mutex_enter(&db->db_mtx);
1874290750Smav	dbuf_dirty_record_t *dr;
1875290750Smav	for (dr = db->db_last_dirty;
1876290750Smav	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
1877290750Smav		/*
1878290750Smav		 * It's possible that it is already dirty but not cached,
1879290750Smav		 * because there are some calls to dbuf_dirty() that don't
1880290750Smav		 * go through dmu_buf_will_dirty().
1881290750Smav		 */
1882290750Smav		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
1883290750Smav			/* This dbuf is already dirty and cached. */
1884290750Smav			dbuf_redirty(dr);
1885290750Smav			mutex_exit(&db->db_mtx);
1886290750Smav			return;
1887290750Smav		}
1888290750Smav	}
1889290750Smav	mutex_exit(&db->db_mtx);
1890290750Smav
1891219089Spjd	DB_DNODE_ENTER(db);
1892219089Spjd	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1893168404Spjd		rf |= DB_RF_HAVESTRUCT;
1894219089Spjd	DB_DNODE_EXIT(db);
1895168404Spjd	(void) dbuf_read(db, NULL, rf);
1896168404Spjd	(void) dbuf_dirty(db, tx);
1897168404Spjd}
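
/*
 * Typical consumer pattern (an illustrative sketch; assumes the buffer
 * is already held, e.g. via dmu_buf_hold()):
 *
 *	dmu_buf_will_dirty(db, tx);
 *	bcopy(src, db->db_data, db->db_size);
 *
 * The quick dirty-check above makes the common "already dirty and
 * cached" case nearly free for such callers.
 */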
1898168404Spjd
1899168404Spjdvoid
1900219089Spjddmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1901219089Spjd{
1902219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1903219089Spjd
1904219089Spjd	db->db_state = DB_NOFILL;
1905219089Spjd
1906219089Spjd	dmu_buf_will_fill(db_fake, tx);
1907219089Spjd}
1908219089Spjd
1909219089Spjdvoid
1910168404Spjddmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1911168404Spjd{
1912168404Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1913168404Spjd
1914219089Spjd	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1915168404Spjd	ASSERT(tx->tx_txg != 0);
1916168404Spjd	ASSERT(db->db_level == 0);
1917168404Spjd	ASSERT(!refcount_is_zero(&db->db_holds));
1918168404Spjd
1919168404Spjd	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1920168404Spjd	    dmu_tx_private_ok(tx));
1921168404Spjd
1922168404Spjd	dbuf_noread(db);
1923168404Spjd	(void) dbuf_dirty(db, tx);
1924168404Spjd}
1925168404Spjd
1926168404Spjd#pragma weak dmu_buf_fill_done = dbuf_fill_done
1927168404Spjd/* ARGSUSED */
1928168404Spjdvoid
1929168404Spjddbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1930168404Spjd{
1931168404Spjd	mutex_enter(&db->db_mtx);
1932168404Spjd	DBUF_VERIFY(db);
1933168404Spjd
1934168404Spjd	if (db->db_state == DB_FILL) {
1935168404Spjd		if (db->db_level == 0 && db->db_freed_in_flight) {
1936219089Spjd			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1937168404Spjd			/* we were freed while filling */
1938168404Spjd			/* XXX dbuf_undirty? */
1939168404Spjd			bzero(db->db.db_data, db->db.db_size);
1940168404Spjd			db->db_freed_in_flight = FALSE;
1941168404Spjd		}
1942168404Spjd		db->db_state = DB_CACHED;
1943168404Spjd		cv_broadcast(&db->db_changed);
1944168404Spjd	}
1945168404Spjd	mutex_exit(&db->db_mtx);
1946168404Spjd}
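
/*
 * The will_fill/fill_done pair brackets a whole-buffer overwrite.
 * Sketch of the expected sequence (the caller must replace the entire
 * buffer, since dbuf_noread() hands back uninitialized data):
 *
 *	dmu_buf_will_fill(db, tx);
 *	bcopy(src, db->db_data, db->db_size);
 *	dmu_buf_fill_done(db, tx);
 */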
1947168404Spjd
1948268649Sdelphijvoid
1949268649Sdelphijdmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1950268649Sdelphij    bp_embedded_type_t etype, enum zio_compress comp,
1951268649Sdelphij    int uncompressed_size, int compressed_size, int byteorder,
1952268649Sdelphij    dmu_tx_t *tx)
1953268649Sdelphij{
1954268649Sdelphij	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1955268649Sdelphij	struct dirty_leaf *dl;
1956268649Sdelphij	dmu_object_type_t type;
1957268649Sdelphij
1958288572Smav	if (etype == BP_EMBEDDED_TYPE_DATA) {
1959288572Smav		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
1960288572Smav		    SPA_FEATURE_EMBEDDED_DATA));
1961288572Smav	}
1962288572Smav
1963268649Sdelphij	DB_DNODE_ENTER(db);
1964268649Sdelphij	type = DB_DNODE(db)->dn_type;
1965268649Sdelphij	DB_DNODE_EXIT(db);
1966268649Sdelphij
1967268649Sdelphij	ASSERT0(db->db_level);
1968268649Sdelphij	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1969268649Sdelphij
1970268649Sdelphij	dmu_buf_will_not_fill(dbuf, tx);
1971268649Sdelphij
1972268649Sdelphij	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1973268649Sdelphij	dl = &db->db_last_dirty->dt.dl;
1974268649Sdelphij	encode_embedded_bp_compressed(&dl->dr_overridden_by,
1975268649Sdelphij	    data, comp, uncompressed_size, compressed_size);
1976268649Sdelphij	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1977268649Sdelphij	BP_SET_TYPE(&dl->dr_overridden_by, type);
1978268649Sdelphij	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1979268649Sdelphij	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1980268649Sdelphij
1981268649Sdelphij	dl->dr_override_state = DR_OVERRIDDEN;
1982268649Sdelphij	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1983268649Sdelphij}
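
/*
 * Background note on the embedded write above (stated with the usual
 * caveats): BP_EMBEDDED_TYPE_DATA stores the compressed payload
 * directly inside the block pointer itself, so it only applies to
 * very small blocks -- the payload must fit in the BP's embedded
 * area, on the order of 100 bytes -- and requires the
 * SPA_FEATURE_EMBEDDED_DATA pool feature asserted above.
 */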
1984268649Sdelphij
1985168404Spjd/*
1986209962Smm * Directly assign a provided arc buf to a given dbuf if it's not referenced
1987209962Smm * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1988209962Smm */
1989209962Smmvoid
1990209962Smmdbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1991209962Smm{
1992209962Smm	ASSERT(!refcount_is_zero(&db->db_holds));
1993219089Spjd	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1994209962Smm	ASSERT(db->db_level == 0);
1995209962Smm	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1996209962Smm	ASSERT(buf != NULL);
1997209962Smm	ASSERT(arc_buf_size(buf) == db->db.db_size);
1998209962Smm	ASSERT(tx->tx_txg != 0);
1999209962Smm
2000209962Smm	arc_return_buf(buf, db);
2001209962Smm	ASSERT(arc_released(buf));
2002209962Smm
2003209962Smm	mutex_enter(&db->db_mtx);
2004209962Smm
2005209962Smm	while (db->db_state == DB_READ || db->db_state == DB_FILL)
2006209962Smm		cv_wait(&db->db_changed, &db->db_mtx);
2007209962Smm
2008209962Smm	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
2009209962Smm
2010209962Smm	if (db->db_state == DB_CACHED &&
2011209962Smm	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
2012209962Smm		mutex_exit(&db->db_mtx);
2013209962Smm		(void) dbuf_dirty(db, tx);
2014209962Smm		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
2015307266Smav		arc_buf_destroy(buf, db);
2016219089Spjd		xuio_stat_wbuf_copied();
2017209962Smm		return;
2018209962Smm	}
2019209962Smm
2020219089Spjd	xuio_stat_wbuf_nocopy();
2021209962Smm	if (db->db_state == DB_CACHED) {
2022209962Smm		dbuf_dirty_record_t *dr = db->db_last_dirty;
2023209962Smm
2024209962Smm		ASSERT(db->db_buf != NULL);
2025209962Smm		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
2026209962Smm			ASSERT(dr->dt.dl.dr_data == db->db_buf);
2027209962Smm			if (!arc_released(db->db_buf)) {
2028209962Smm				ASSERT(dr->dt.dl.dr_override_state ==
2029209962Smm				    DR_OVERRIDDEN);
2030209962Smm				arc_release(db->db_buf, db);
2031209962Smm			}
2032209962Smm			dr->dt.dl.dr_data = buf;
2033307266Smav			arc_buf_destroy(db->db_buf, db);
2034209962Smm		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
2035209962Smm			arc_release(db->db_buf, db);
2036307266Smav			arc_buf_destroy(db->db_buf, db);
2037209962Smm		}
2038209962Smm		db->db_buf = NULL;
2039209962Smm	}
2040209962Smm	ASSERT(db->db_buf == NULL);
2041209962Smm	dbuf_set_data(db, buf);
2042209962Smm	db->db_state = DB_FILL;
2043209962Smm	mutex_exit(&db->db_mtx);
2044209962Smm	(void) dbuf_dirty(db, tx);
2045263397Sdelphij	dmu_buf_fill_done(&db->db, tx);
2046209962Smm}
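
/*
 * Illustrative zero-copy write path built on the function above (the
 * arc_loan_buf()/dmu_assign_arcbuf() interfaces named here live
 * outside this file and are assumed): a caller loans a buffer from
 * the ARC, fills it, and hands it over instead of copying through
 * db_data:
 *
 *	abuf = arc_loan_buf(spa, size);
 *	bcopy(src, abuf->b_data, size);
 *	dmu_assign_arcbuf(db, abuf, tx);	(calls dbuf_assign_arcbuf())
 *
 * If the dbuf is still referenced by others, the code above falls
 * back to a bcopy and counts it via xuio_stat_wbuf_copied().
 */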
2047209962Smm
2048168404Spjdvoid
2049307266Smavdbuf_destroy(dmu_buf_impl_t *db)
2050168404Spjd{
2051219089Spjd	dnode_t *dn;
2052168404Spjd	dmu_buf_impl_t *parent = db->db_parent;
2053219089Spjd	dmu_buf_impl_t *dndb;
2054168404Spjd
2055168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
2056168404Spjd	ASSERT(refcount_is_zero(&db->db_holds));
2057168404Spjd
2058307266Smav	if (db->db_buf != NULL) {
2059307266Smav		arc_buf_destroy(db->db_buf, db);
2060307266Smav		db->db_buf = NULL;
2061307266Smav	}
2062168404Spjd
2063307266Smav	if (db->db_blkid == DMU_BONUS_BLKID) {
2064168404Spjd		ASSERT(db->db.db_data != NULL);
2065307266Smav		zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
2066307266Smav		arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2067168404Spjd		db->db_state = DB_UNCACHED;
2068168404Spjd	}
2069168404Spjd
2070307266Smav	dbuf_clear_data(db);
2071307266Smav
2072307266Smav	if (multilist_link_active(&db->db_cache_link)) {
2073307266Smav		multilist_remove(&dbuf_cache, db);
2074307266Smav		(void) refcount_remove_many(&dbuf_cache_size,
2075307266Smav		    db->db.db_size, db);
2076307266Smav	}
2077307266Smav
2078219089Spjd	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
2079168404Spjd	ASSERT(db->db_data_pending == NULL);
2080168404Spjd
2081168404Spjd	db->db_state = DB_EVICTING;
2082168404Spjd	db->db_blkptr = NULL;
2083168404Spjd
2084307266Smav	/*
2085307266Smav	 * Now that db_state is DB_EVICTING, nobody else can find this via
2086307266Smav	 * the hash table.  We can now drop db_mtx, which allows us to
2087307266Smav	 * acquire the dn_dbufs_mtx.
2088307266Smav	 */
2089307266Smav	mutex_exit(&db->db_mtx);
2090307266Smav
2091219089Spjd	DB_DNODE_ENTER(db);
2092219089Spjd	dn = DB_DNODE(db);
2093219089Spjd	dndb = dn->dn_dbuf;
2094307266Smav	if (db->db_blkid != DMU_BONUS_BLKID) {
2095307266Smav		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
2096307266Smav		if (needlock)
2097307266Smav			mutex_enter(&dn->dn_dbufs_mtx);
2098269845Sdelphij		avl_remove(&dn->dn_dbufs, db);
2099271002Sdelphij		atomic_dec_32(&dn->dn_dbufs_count);
2100219089Spjd		membar_producer();
2101219089Spjd		DB_DNODE_EXIT(db);
2102307266Smav		if (needlock)
2103307266Smav			mutex_exit(&dn->dn_dbufs_mtx);
2104219089Spjd		/*
2105219089Spjd		 * Decrementing the dbuf count means that the hold corresponding
2106219089Spjd		 * to the removed dbuf is no longer discounted in dnode_move(),
2107219089Spjd		 * so the dnode cannot be moved until after we release the hold.
2108219089Spjd		 * The membar_producer() ensures visibility of the decremented
2109219089Spjd		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
2110219089Spjd		 * release any lock.
2111219089Spjd		 */
2112168404Spjd		dnode_rele(dn, db);
2113219089Spjd		db->db_dnode_handle = NULL;
2114307266Smav
2115307266Smav		dbuf_hash_remove(db);
2116219089Spjd	} else {
2117219089Spjd		DB_DNODE_EXIT(db);
2118168404Spjd	}
2119168404Spjd
2120307266Smav	ASSERT(refcount_is_zero(&db->db_holds));
2121168404Spjd
2122307266Smav	db->db_parent = NULL;
2123168404Spjd
2124307266Smav	ASSERT(db->db_buf == NULL);
2125307266Smav	ASSERT(db->db.db_data == NULL);
2126307266Smav	ASSERT(db->db_hash_next == NULL);
2127307266Smav	ASSERT(db->db_blkptr == NULL);
2128307266Smav	ASSERT(db->db_data_pending == NULL);
2129307266Smav	ASSERT(!multilist_link_active(&db->db_cache_link));
2130307266Smav
2131307266Smav	kmem_cache_free(dbuf_kmem_cache, db);
2132307266Smav	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2133307266Smav
2134168404Spjd	/*
2135219089Spjd	 * If this dbuf is referenced from an indirect dbuf,
2136168404Spjd	 * decrement the ref count on the indirect dbuf.
2137168404Spjd	 */
2138168404Spjd	if (parent && parent != dndb)
2139168404Spjd		dbuf_rele(parent, db);
2140168404Spjd}
2141168404Spjd
2142288571Smav/*
2143288571Smav * Note: While bpp will always be updated if the function returns success,
2144288571Smav * parentp will not be updated if the dnode does not have dn_dbuf filled in;
2145288571Smav * this happens when the dnode is the meta-dnode, or a userused or groupused
2146288571Smav * object.
2147288571Smav */
2148168404Spjdstatic int
2149168404Spjddbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
2150168404Spjd    dmu_buf_impl_t **parentp, blkptr_t **bpp)
2151168404Spjd{
2152168404Spjd	int nlevels, epbs;
2153168404Spjd
2154168404Spjd	*parentp = NULL;
2155168404Spjd	*bpp = NULL;
2156168404Spjd
2157219089Spjd	ASSERT(blkid != DMU_BONUS_BLKID);
2158168404Spjd
2159219089Spjd	if (blkid == DMU_SPILL_BLKID) {
2160219089Spjd		mutex_enter(&dn->dn_mtx);
2161219089Spjd		if (dn->dn_have_spill &&
2162219089Spjd		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
2163219089Spjd			*bpp = &dn->dn_phys->dn_spill;
2164219089Spjd		else
2165219089Spjd			*bpp = NULL;
2166219089Spjd		dbuf_add_ref(dn->dn_dbuf, NULL);
2167219089Spjd		*parentp = dn->dn_dbuf;
2168219089Spjd		mutex_exit(&dn->dn_mtx);
2169219089Spjd		return (0);
2170219089Spjd	}
2171219089Spjd
2172168404Spjd	if (dn->dn_phys->dn_nlevels == 0)
2173168404Spjd		nlevels = 1;
2174168404Spjd	else
2175168404Spjd		nlevels = dn->dn_phys->dn_nlevels;
2176168404Spjd
2177168404Spjd	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2178168404Spjd
2179168404Spjd	ASSERT3U(level * epbs, <, 64);
2180168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2181168404Spjd	if (level >= nlevels ||
2182168404Spjd	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
2183168404Spjd		/* the buffer has no parent yet */
2184249195Smm		return (SET_ERROR(ENOENT));
2185168404Spjd	} else if (level < nlevels-1) {
2186168404Spjd		/* this block is referenced from an indirect block */
2187168404Spjd		int err = dbuf_hold_impl(dn, level+1,
2188288571Smav		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
2189168404Spjd		if (err)
2190168404Spjd			return (err);
2191168404Spjd		err = dbuf_read(*parentp, NULL,
2192168404Spjd		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
2193168404Spjd		if (err) {
2194168404Spjd			dbuf_rele(*parentp, NULL);
2195168404Spjd			*parentp = NULL;
2196168404Spjd			return (err);
2197168404Spjd		}
2198168404Spjd		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
2199168404Spjd		    (blkid & ((1ULL << epbs) - 1));
2200168404Spjd		return (0);
2201168404Spjd	} else {
2202168404Spjd		/* the block is referenced from the dnode */
2203168404Spjd		ASSERT3U(level, ==, nlevels-1);
2204168404Spjd		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
2205168404Spjd		    blkid < dn->dn_phys->dn_nblkptr);
2206168404Spjd		if (dn->dn_dbuf) {
2207168404Spjd			dbuf_add_ref(dn->dn_dbuf, NULL);
2208168404Spjd			*parentp = dn->dn_dbuf;
2209168404Spjd		}
2210168404Spjd		*bpp = &dn->dn_phys->dn_blkptr[blkid];
2211168404Spjd		return (0);
2212168404Spjd	}
2213168404Spjd}
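
/*
 * Worked example of the epbs arithmetic above, assuming 128K indirect
 * blocks (dn_indblkshift == 17): a blkptr_t is 1 << SPA_BLKPTRSHIFT =
 * 128 bytes, so epbs = 17 - 7 = 10, i.e. 1024 block pointers per
 * indirect block.  A level-0 block with blkid 5000 then lives in slot
 * 5000 & 1023 = 904 of the level-1 block with blkid 5000 >> 10 = 4.
 */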
2214168404Spjd
2215168404Spjdstatic dmu_buf_impl_t *
2216168404Spjddbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
2217168404Spjd    dmu_buf_impl_t *parent, blkptr_t *blkptr)
2218168404Spjd{
2219219089Spjd	objset_t *os = dn->dn_objset;
2220168404Spjd	dmu_buf_impl_t *db, *odb;
2221168404Spjd
2222168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2223168404Spjd	ASSERT(dn->dn_type != DMU_OT_NONE);
2224168404Spjd
2225307266Smav	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
2226168404Spjd
2227168404Spjd	db->db_objset = os;
2228168404Spjd	db->db.db_object = dn->dn_object;
2229168404Spjd	db->db_level = level;
2230168404Spjd	db->db_blkid = blkid;
2231168404Spjd	db->db_last_dirty = NULL;
2232168404Spjd	db->db_dirtycnt = 0;
2233219089Spjd	db->db_dnode_handle = dn->dn_handle;
2234168404Spjd	db->db_parent = parent;
2235168404Spjd	db->db_blkptr = blkptr;
2236168404Spjd
2237288549Smav	db->db_user = NULL;
2238290754Smav	db->db_user_immediate_evict = FALSE;
2239290754Smav	db->db_freed_in_flight = FALSE;
2240290754Smav	db->db_pending_evict = FALSE;
2241168404Spjd
2242219089Spjd	if (blkid == DMU_BONUS_BLKID) {
2243168404Spjd		ASSERT3P(parent, ==, dn->dn_dbuf);
2244185029Spjd		db->db.db_size = DN_MAX_BONUSLEN -
2245185029Spjd		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
2246185029Spjd		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
2247219089Spjd		db->db.db_offset = DMU_BONUS_BLKID;
2248168404Spjd		db->db_state = DB_UNCACHED;
2249168404Spjd		/* the bonus dbuf is not placed in the hash table */
2250208373Smm		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2251168404Spjd		return (db);
2252219089Spjd	} else if (blkid == DMU_SPILL_BLKID) {
2253219089Spjd		db->db.db_size = (blkptr != NULL) ?
2254219089Spjd		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
2255219089Spjd		db->db.db_offset = 0;
2256168404Spjd	} else {
2257168404Spjd		int blocksize =
2258260763Savg		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
2259168404Spjd		db->db.db_size = blocksize;
2260168404Spjd		db->db.db_offset = db->db_blkid * blocksize;
2261168404Spjd	}
2262168404Spjd
2263168404Spjd	/*
2264168404Spjd	 * Hold the dn_dbufs_mtx while we insert the new dbuf
2265168404Spjd	 * into the hash table *and* add it to the dbufs list.
2266168404Spjd	 * This prevents a possible deadlock with someone
2267168404Spjd	 * trying to look up this dbuf before it's added to the
2268168404Spjd	 * dn_dbufs list.
2269168404Spjd	 */
2270168404Spjd	mutex_enter(&dn->dn_dbufs_mtx);
2271168404Spjd	db->db_state = DB_EVICTING;
2272168404Spjd	if ((odb = dbuf_hash_insert(db)) != NULL) {
2273168404Spjd		/* someone else inserted it first */
2274307266Smav		kmem_cache_free(dbuf_kmem_cache, db);
2275168404Spjd		mutex_exit(&dn->dn_dbufs_mtx);
2276168404Spjd		return (odb);
2277168404Spjd	}
2278269845Sdelphij	avl_add(&dn->dn_dbufs, db);
2279254753Sdelphij	if (db->db_level == 0 && db->db_blkid >=
2280254753Sdelphij	    dn->dn_unlisted_l0_blkid)
2281254753Sdelphij		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
2282168404Spjd	db->db_state = DB_UNCACHED;
2283168404Spjd	mutex_exit(&dn->dn_dbufs_mtx);
2284208373Smm	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2285168404Spjd
2286168404Spjd	if (parent && parent != dn->dn_dbuf)
2287168404Spjd		dbuf_add_ref(parent, db);
2288168404Spjd
2289168404Spjd	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
2290168404Spjd	    refcount_count(&dn->dn_holds) > 0);
2291168404Spjd	(void) refcount_add(&dn->dn_holds, db);
2292271002Sdelphij	atomic_inc_32(&dn->dn_dbufs_count);
2293168404Spjd
2294168404Spjd	dprintf_dbuf(db, "db=%p\n", db);
2295168404Spjd
2296168404Spjd	return (db);
2297168404Spjd}
2298168404Spjd
2299288571Smavtypedef struct dbuf_prefetch_arg {
2300288571Smav	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
2301288571Smav	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2302288571Smav	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2303288571Smav	int dpa_curlevel; /* The current level that we're reading */
2304307266Smav	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
2305288571Smav	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
2306288571Smav	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
2307288571Smav	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
2308288571Smav} dbuf_prefetch_arg_t;
2309288571Smav
2310288571Smav/*
2311288571Smav * Actually issue the prefetch read for the block given.
2312288571Smav */
2313288571Smavstatic void
2314288571Smavdbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2315288571Smav{
2316288571Smav	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2317288571Smav		return;
2318288571Smav
2319288571Smav	arc_flags_t aflags =
2320288571Smav	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2321288571Smav
2322288571Smav	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2323288571Smav	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2324288571Smav	ASSERT(dpa->dpa_zio != NULL);
2325288571Smav	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2326288571Smav	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2327288571Smav	    &aflags, &dpa->dpa_zb);
2328288571Smav}
2329288571Smav
2330288571Smav/*
2331288571Smav * Called when an indirect block above our prefetch target is read in.  This
2332288571Smav * will either read in the next indirect block down the tree or issue the actual
2333288571Smav * prefetch if the next block down is our target.
2334288571Smav */
2335288571Smavstatic void
2336288571Smavdbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
2337288571Smav{
2338288571Smav	dbuf_prefetch_arg_t *dpa = private;
2339288571Smav
2340288571Smav	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
2341288571Smav	ASSERT3S(dpa->dpa_curlevel, >, 0);
2342307266Smav
2343307266Smav	/*
2344307266Smav	 * The dpa_dnode is only valid if we are called with a NULL
2345307266Smav	 * zio. This indicates that the arc_read() returned without
2346307266Smav	 * first calling zio_read() to issue a physical read. Once
2347307266Smav	 * a physical read is made the dpa_dnode must be invalidated
2348307266Smav	 * as the locks guarding it may have been dropped. If the
2349307266Smav	 * dpa_dnode is still valid, then we want to add it to the dbuf
2350307266Smav	 * cache. To do so, we must hold the dbuf associated with the block
2351307266Smav	 * we just prefetched, read its contents so that we associate it
2352307266Smav	 * with an arc_buf_t, and then release it.
2353307266Smav	 */
2354288571Smav	if (zio != NULL) {
2355288571Smav		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
2356307266Smav		if (zio->io_flags & ZIO_FLAG_RAW) {
2357307266Smav			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
2358307266Smav		} else {
2359307266Smav			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
2360307266Smav		}
2361288571Smav		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
2362307266Smav
2363307266Smav		dpa->dpa_dnode = NULL;
2364307266Smav	} else if (dpa->dpa_dnode != NULL) {
2365307266Smav		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
2366307266Smav		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
2367307266Smav		    dpa->dpa_zb.zb_level));
2368307266Smav		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
2369307266Smav		    dpa->dpa_curlevel, curblkid, FTAG);
2370307266Smav		(void) dbuf_read(db, NULL,
2371307266Smav		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
2372307266Smav		dbuf_rele(db, FTAG);
2373288571Smav	}
2374288571Smav
2375288571Smav	dpa->dpa_curlevel--;
2376288571Smav
2377288571Smav	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
2378288571Smav	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
2379288571Smav	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
2380288571Smav	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
2381288571Smav	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
2382288571Smav		kmem_free(dpa, sizeof (*dpa));
2383288571Smav	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
2384288571Smav		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
2385288571Smav		dbuf_issue_final_prefetch(dpa, bp);
2386288571Smav		kmem_free(dpa, sizeof (*dpa));
2387288571Smav	} else {
2388288571Smav		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2389288571Smav		zbookmark_phys_t zb;
2390288571Smav
2391325932Savg		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
2392325932Savg		if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
2393325932Savg			iter_aflags |= ARC_FLAG_L2CACHE;
2394325932Savg
2395288571Smav		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2396288571Smav
2397288571Smav		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
2398288571Smav		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
2399288571Smav
2400288571Smav		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2401288571Smav		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
2402288571Smav		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2403288571Smav		    &iter_aflags, &zb);
2404288571Smav	}
2405307266Smav
2406307266Smav	arc_buf_destroy(abuf, private);
2407288571Smav}
2408288571Smav
2409288571Smav/*
2410288571Smav * Issue prefetch reads for the given block on the given level.  If the indirect
2411288571Smav * blocks above that block are not in memory, we will read them in
2412288571Smav * asynchronously.  As a result, this call never blocks waiting for a read to
2413288571Smav * complete.
2414288571Smav */
2415168404Spjdvoid
2416288571Smavdbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
2417288571Smav    arc_flags_t aflags)
2418168404Spjd{
2419288571Smav	blkptr_t bp;
2420288571Smav	int epbs, nlevels, curlevel;
2421288571Smav	uint64_t curblkid;
2422168404Spjd
2423219089Spjd	ASSERT(blkid != DMU_BONUS_BLKID);
2424168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2425168404Spjd
2426288594Smav	if (blkid > dn->dn_maxblkid)
2427288594Smav		return;
2428288594Smav
2429168404Spjd	if (dnode_block_freed(dn, blkid))
2430168404Spjd		return;
2431168404Spjd
2432288571Smav	/*
2433288571Smav	 * This dnode hasn't been written to disk yet, so there's nothing to
2434288571Smav	 * prefetch.
2435288571Smav	 */
2436288571Smav	nlevels = dn->dn_phys->dn_nlevels;
2437288571Smav	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
2438288571Smav		return;
2439288571Smav
2440288571Smav	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2441288571Smav	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
2442288571Smav		return;
2443288571Smav
2444288571Smav	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
2445288571Smav	    level, blkid);
2446288571Smav	if (db != NULL) {
2447288571Smav		mutex_exit(&db->db_mtx);
2448219089Spjd		/*
2449288571Smav		 * This dbuf already exists.  It is either CACHED, or
2450288571Smav		 * (we assume) about to be read or filled.
2451219089Spjd		 */
2452219089Spjd		return;
2453168404Spjd	}
2454168404Spjd
2455288571Smav	/*
2456288571Smav	 * Find the closest ancestor (indirect block) of the target block
2457288571Smav	 * that is present in the cache.  In this indirect block, we will
2458288571Smav	 * find the bp that is at curlevel, curblkid.
2459288571Smav	 */
2460288571Smav	curlevel = level;
2461288571Smav	curblkid = blkid;
2462288571Smav	while (curlevel < nlevels - 1) {
2463288571Smav		int parent_level = curlevel + 1;
2464288571Smav		uint64_t parent_blkid = curblkid >> epbs;
2465288571Smav		dmu_buf_impl_t *db;
2466168404Spjd
2467288571Smav		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
2468288571Smav		    FALSE, TRUE, FTAG, &db) == 0) {
2469288571Smav			blkptr_t *bpp = db->db_buf->b_data;
2470288571Smav			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
2471288571Smav			dbuf_rele(db, FTAG);
2472288571Smav			break;
2473288571Smav		}
2474219089Spjd
2475288571Smav		curlevel = parent_level;
2476288571Smav		curblkid = parent_blkid;
2477168404Spjd	}
2478288571Smav
2479288571Smav	if (curlevel == nlevels - 1) {
2480288571Smav		/* No cached indirect blocks found. */
2481288571Smav		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
2482288571Smav		bp = dn->dn_phys->dn_blkptr[curblkid];
2483288571Smav	}
2484288571Smav	if (BP_IS_HOLE(&bp))
2485288571Smav		return;
2486288571Smav
2487288571Smav	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
2488288571Smav
2489288571Smav	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
2490288571Smav	    ZIO_FLAG_CANFAIL);
2491288571Smav
2492288571Smav	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
2493288571Smav	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
2494288571Smav	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2495288571Smav	    dn->dn_object, level, blkid);
2496288571Smav	dpa->dpa_curlevel = curlevel;
2497288571Smav	dpa->dpa_prio = prio;
2498288571Smav	dpa->dpa_aflags = aflags;
2499288571Smav	dpa->dpa_spa = dn->dn_objset->os_spa;
2500307266Smav	dpa->dpa_dnode = dn;
2501288571Smav	dpa->dpa_epbs = epbs;
2502288571Smav	dpa->dpa_zio = pio;
2503288571Smav
2504325932Savg	/* flag if L2ARC eligible, l2arc_noprefetch then decides */
2505325932Savg	if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
2506325932Savg		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
2507325932Savg
2508288571Smav	/*
2509288571Smav	 * If we have the indirect just above us, no need to do the asynchronous
2510288571Smav	 * prefetch chain; we'll just run the last step ourselves.  If we're at
2511288571Smav	 * a higher level, though, we want to issue the prefetches for all the
2512288571Smav	 * indirect blocks asynchronously, so we can go on with whatever we were
2513288571Smav	 * doing.
2514288571Smav	 */
2515288571Smav	if (curlevel == level) {
2516288571Smav		ASSERT3U(curblkid, ==, blkid);
2517288571Smav		dbuf_issue_final_prefetch(dpa, &bp);
2518288571Smav		kmem_free(dpa, sizeof (*dpa));
2519288571Smav	} else {
2520288571Smav		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2521288571Smav		zbookmark_phys_t zb;
2522288571Smav
2523325932Savg		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
2524325932Savg		if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
2525325932Savg			iter_aflags |= ARC_FLAG_L2CACHE;
2526325932Savg
2527288571Smav		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2528288571Smav		    dn->dn_object, curlevel, curblkid);
2529288571Smav		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2530288571Smav		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
2531288571Smav		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2532288571Smav		    &iter_aflags, &zb);
2533288571Smav	}
2534288571Smav	/*
2535288571Smav	 * We use pio here instead of dpa_zio since it's possible that
2536288571Smav	 * dpa may have already been freed.
2537288571Smav	 */
2538288571Smav	zio_nowait(pio);
2539168404Spjd}
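
/*
 * Summary of the asynchronous chain set up above (descriptive only):
 * for a level-0 target beneath two uncached indirect levels,
 *
 *	dbuf_prefetch()
 *	  arc_read(L2 indirect) -> dbuf_prefetch_indirect_done()
 *	    arc_read(L1 indirect) -> dbuf_prefetch_indirect_done()
 *	      dbuf_issue_final_prefetch(level-0 bp)
 *
 * with each callback locating the next bp inside the indirect block
 * it just read, so no step ever blocks the original caller.
 */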
2540168404Spjd
2541168404Spjd/*
2542168404Spjd * Returns with db_holds incremented, and db_mtx not held.
2543168404Spjd * Note: dn_struct_rwlock must be held.
2544168404Spjd */
2545168404Spjdint
2546288571Smavdbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
2547288571Smav    boolean_t fail_sparse, boolean_t fail_uncached,
2548168404Spjd    void *tag, dmu_buf_impl_t **dbp)
2549168404Spjd{
2550168404Spjd	dmu_buf_impl_t *db, *parent = NULL;
2551168404Spjd
2552219089Spjd	ASSERT(blkid != DMU_BONUS_BLKID);
2553168404Spjd	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2554168404Spjd	ASSERT3U(dn->dn_nlevels, >, level);
2555168404Spjd
2556168404Spjd	*dbp = NULL;
2557168404Spjdtop:
2558168404Spjd	/* dbuf_find() returns with db_mtx held */
2559288538Smav	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
2560168404Spjd
2561168404Spjd	if (db == NULL) {
2562168404Spjd		blkptr_t *bp = NULL;
2563168404Spjd		int err;
2564168404Spjd
2565288571Smav		if (fail_uncached)
2566288571Smav			return (SET_ERROR(ENOENT));
2567288571Smav
2568168404Spjd		ASSERT3P(parent, ==, NULL);
2569168404Spjd		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2570168404Spjd		if (fail_sparse) {
2571168404Spjd			if (err == 0 && bp && BP_IS_HOLE(bp))
2572249195Smm				err = SET_ERROR(ENOENT);
2573168404Spjd			if (err) {
2574168404Spjd				if (parent)
2575168404Spjd					dbuf_rele(parent, NULL);
2576168404Spjd				return (err);
2577168404Spjd			}
2578168404Spjd		}
2579168404Spjd		if (err && err != ENOENT)
2580168404Spjd			return (err);
2581168404Spjd		db = dbuf_create(dn, level, blkid, parent, bp);
2582168404Spjd	}
2583168404Spjd
2584288571Smav	if (fail_uncached && db->db_state != DB_CACHED) {
2585288571Smav		mutex_exit(&db->db_mtx);
2586288571Smav		return (SET_ERROR(ENOENT));
2587288571Smav	}
2588288571Smav
2589307266Smav	if (db->db_buf != NULL)
2590168404Spjd		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2591168404Spjd
2592168404Spjd	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2593168404Spjd
2594168404Spjd	/*
2595168404Spjd	 * If this buffer is currently syncing out, and we are
2596168404Spjd	 * still referencing it from db_data, we need to make a copy
2597168404Spjd	 * of it in case we decide we want to dirty it again in this txg.
2598168404Spjd	 */
2599219089Spjd	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2600168404Spjd	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2601168404Spjd	    db->db_state == DB_CACHED && db->db_data_pending) {
2602168404Spjd		dbuf_dirty_record_t *dr = db->db_data_pending;
2603168404Spjd
2604168404Spjd		if (dr->dt.dl.dr_data == db->db_buf) {
2605168404Spjd			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2606168404Spjd
2607168404Spjd			dbuf_set_data(db,
2608307266Smav			    arc_alloc_buf(dn->dn_objset->os_spa,
2609168404Spjd			    db->db.db_size, db, type));
2610168404Spjd			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2611168404Spjd			    db->db.db_size);
2612168404Spjd		}
2613168404Spjd	}
2614168404Spjd
2615307266Smav	if (multilist_link_active(&db->db_cache_link)) {
2616307266Smav		ASSERT(refcount_is_zero(&db->db_holds));
2617307266Smav		multilist_remove(&dbuf_cache, db);
2618307266Smav		(void) refcount_remove_many(&dbuf_cache_size,
2619307266Smav		    db->db.db_size, db);
2620307266Smav	}
2621168404Spjd	(void) refcount_add(&db->db_holds, tag);
2622168404Spjd	DBUF_VERIFY(db);
2623168404Spjd	mutex_exit(&db->db_mtx);
2624168404Spjd
2625168404Spjd	/* NOTE: we can't rele the parent until after we drop the db_mtx */
2626168404Spjd	if (parent)
2627168404Spjd		dbuf_rele(parent, NULL);
2628168404Spjd
2629219089Spjd	ASSERT3P(DB_DNODE(db), ==, dn);
2630168404Spjd	ASSERT3U(db->db_blkid, ==, blkid);
2631168404Spjd	ASSERT3U(db->db_level, ==, level);
2632168404Spjd	*dbp = db;
2633168404Spjd
2634168404Spjd	return (0);
2635168404Spjd}
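
/*
 * Illustrative case for the copy-on-hold above: if txg 100 is syncing
 * this dbuf out (db_data_pending set, dr_data still equal to db_buf)
 * and a caller takes a hold intending to dirty it in txg 101, we
 * allocate a fresh ARC buffer and bcopy the contents, so the
 * in-flight write for txg 100 keeps a stable image while txg 101
 * modifies the new copy.
 */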
2636168404Spjd
2637168404Spjddmu_buf_impl_t *
2638168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2639168404Spjd{
2640288571Smav	return (dbuf_hold_level(dn, 0, blkid, tag));
2641168404Spjd}
2642168404Spjd
2643168404Spjddmu_buf_impl_t *
2644168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2645168404Spjd{
2646168404Spjd	dmu_buf_impl_t *db;
2647288571Smav	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
2648168404Spjd	return (err ? NULL : db);
2649168404Spjd}
2650168404Spjd
2651185029Spjdvoid
2652168404Spjddbuf_create_bonus(dnode_t *dn)
2653168404Spjd{
2654168404Spjd	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2655168404Spjd
2656168404Spjd	ASSERT(dn->dn_bonus == NULL);
2657219089Spjd	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2658168404Spjd}
2659168404Spjd
2660219089Spjdint
2661219089Spjddbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2662219089Spjd{
2663219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2664219089Spjd	dnode_t *dn;
2665219089Spjd
2666219089Spjd	if (db->db_blkid != DMU_SPILL_BLKID)
2667249195Smm		return (SET_ERROR(ENOTSUP));
2668219089Spjd	if (blksz == 0)
2669219089Spjd		blksz = SPA_MINBLOCKSIZE;
2670276081Sdelphij	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
2671276081Sdelphij	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2672219089Spjd
2673219089Spjd	DB_DNODE_ENTER(db);
2674219089Spjd	dn = DB_DNODE(db);
2675219089Spjd	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2676219089Spjd	dbuf_new_size(db, blksz, tx);
2677219089Spjd	rw_exit(&dn->dn_struct_rwlock);
2678219089Spjd	DB_DNODE_EXIT(db);
2679219089Spjd
2680219089Spjd	return (0);
2681219089Spjd}
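
/*
 * Editorial note (assumed, not in the original): P2ROUNDUP() rounds up to
 * the next multiple of a power of two, so with SPA_MINBLOCKSIZE == 512 a
 * caller-supplied blksz of 1000 becomes 1024 and 512 stays 512, ensuring
 * the spill block is always sized in legal SPA units before
 * dbuf_new_size() is called.
 */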
2682219089Spjd
2683219089Spjdvoid
2684219089Spjddbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2685219089Spjd{
2686219089Spjd	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2687219089Spjd}
2688219089Spjd
2689168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref
2690168404Spjdvoid
2691168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2692168404Spjd{
2693168404Spjd	int64_t holds = refcount_add(&db->db_holds, tag);
2694307266Smav	ASSERT3S(holds, >, 1);
2695168404Spjd}
2696168404Spjd
2697288538Smav#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
2698288538Smavboolean_t
2699288538Smavdbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
2700288538Smav    void *tag)
2701288538Smav{
2702288538Smav	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2703288538Smav	dmu_buf_impl_t *found_db;
2704288538Smav	boolean_t result = B_FALSE;
2705288538Smav
2706288538Smav	if (db->db_blkid == DMU_BONUS_BLKID)
2707288538Smav		found_db = dbuf_find_bonus(os, obj);
2708288538Smav	else
2709288538Smav		found_db = dbuf_find(os, obj, 0, blkid);
2710288538Smav
2711288538Smav	if (found_db != NULL) {
2712288538Smav		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
2713288538Smav			(void) refcount_add(&db->db_holds, tag);
2714288538Smav			result = B_TRUE;
2715288538Smav		}
2716288538Smav		mutex_exit(&db->db_mtx);
2717288538Smav	}
2718288538Smav	return (result);
2719288538Smav}
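
#ifdef ZFS_DBUF_EXAMPLES
/*
 * Editorial sketch (hypothetical, compiled out): dbuf_try_add_ref() only
 * succeeds if db is still the dbuf cached for (os, obj, blkid) and holds
 * more references than its dirty holds; on success the caller owns an
 * extra hold that must be released.
 */
static boolean_t
dbuf_try_add_ref_example(dmu_buf_t *db, objset_t *os, uint64_t obj,
    uint64_t blkid)
{
	if (!dbuf_try_add_ref(db, os, obj, blkid, FTAG))
		return (B_FALSE);
	/* safe to use db here under the extra hold */
	dmu_buf_rele(db, FTAG);
	return (B_TRUE);
}
#endif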
2720288538Smav
2721219089Spjd/*
2722219089Spjd * If you call dbuf_rele() you had better not be referencing the dnode handle
2723219089Spjd * unless you have some other direct or indirect hold on the dnode. (An indirect
2724219089Spjd * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2725219089Spjd * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2726219089Spjd * dnode's parent dbuf evicting its dnode handles.
2727219089Spjd */
2728168404Spjdvoid
2729168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag)
2730168404Spjd{
2731219089Spjd	mutex_enter(&db->db_mtx);
2732219089Spjd	dbuf_rele_and_unlock(db, tag);
2733219089Spjd}
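
#ifdef ZFS_DBUF_EXAMPLES
/*
 * Editorial sketch (hypothetical, compiled out) of the rule described
 * above dbuf_rele(): take an independent dnode hold before dropping what
 * may be the last dbuf hold, so the dnode handles cannot be evicted out
 * from under us.  dnode_hold()/dnode_rele() are the dnode.h interfaces.
 */
static void
dbuf_rele_safe_example(objset_t *os, uint64_t object, dmu_buf_impl_t *db)
{
	dnode_t *dn;

	VERIFY0(dnode_hold(os, object, FTAG, &dn));
	dbuf_rele(db, FTAG);	/* may drop the last hold on dn's dbufs */
	dnode_rele(dn, FTAG);
}
#endif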
2734219089Spjd
2735263397Sdelphijvoid
2736263397Sdelphijdmu_buf_rele(dmu_buf_t *db, void *tag)
2737263397Sdelphij{
2738263397Sdelphij	dbuf_rele((dmu_buf_impl_t *)db, tag);
2739263397Sdelphij}
2740263397Sdelphij
2741219089Spjd/*
2742219089Spjd * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
2743219089Spjd * db_dirtycnt and db_holds to be updated atomically.
2744219089Spjd */
2745219089Spjdvoid
2746219089Spjddbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2747219089Spjd{
2748168404Spjd	int64_t holds;
2749168404Spjd
2750219089Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
2751168404Spjd	DBUF_VERIFY(db);
2752168404Spjd
2753219089Spjd	/*
2754219089Spjd	 * Remove the reference to the dbuf before removing its hold on the
2755219089Spjd	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2756219089Spjd	 * buffer has a corresponding dnode hold.
2757219089Spjd	 */
2758168404Spjd	holds = refcount_remove(&db->db_holds, tag);
2759168404Spjd	ASSERT(holds >= 0);
2760168404Spjd
2761168404Spjd	/*
2762168404Spjd	 * We can't freeze indirects if there is a possibility that they
2763168404Spjd	 * may be modified in the current syncing context.
2764168404Spjd	 */
2765307266Smav	if (db->db_buf != NULL &&
2766307266Smav	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
2767168404Spjd		arc_buf_freeze(db->db_buf);
2768307266Smav	}
2769168404Spjd
2770168404Spjd	if (holds == db->db_dirtycnt &&
2771290754Smav	    db->db_level == 0 && db->db_user_immediate_evict)
2772168404Spjd		dbuf_evict_user(db);
2773168404Spjd
2774168404Spjd	if (holds == 0) {
2775219089Spjd		if (db->db_blkid == DMU_BONUS_BLKID) {
2776288541Smav			dnode_t *dn;
2777290754Smav			boolean_t evict_dbuf = db->db_pending_evict;
2778219089Spjd
2779219089Spjd			/*
2780288541Smav			 * If the dnode moves here, we cannot cross this
2781288541Smav			 * barrier until the move completes.
2782219089Spjd			 */
2783219089Spjd			DB_DNODE_ENTER(db);
2784288541Smav
2785288541Smav			dn = DB_DNODE(db);
2786288541Smav			atomic_dec_32(&dn->dn_dbufs_count);
2787288541Smav
2788288541Smav			/*
2789288541Smav			 * Decrementing the dbuf count means that the bonus
2790288541Smav			 * buffer's dnode hold is no longer discounted in
2791288541Smav			 * dnode_move(). The dnode cannot move until after
2792290754Smav			 * the dnode_rele() below.
2793288541Smav			 */
2794219089Spjd			DB_DNODE_EXIT(db);
2795288541Smav
2796219089Spjd			/*
2797288541Smav			 * Do not reference db after its lock is dropped.
2798288541Smav			 * Another thread may evict it.
2799219089Spjd			 */
2800288541Smav			mutex_exit(&db->db_mtx);
2801288541Smav
2802290754Smav			if (evict_dbuf)
2803288541Smav				dnode_evict_bonus(dn);
2804290754Smav
2805290754Smav			dnode_rele(dn, db);
2806168404Spjd		} else if (db->db_buf == NULL) {
2807168404Spjd			/*
2808168404Spjd			 * This is a special case: we never associated this
2809168404Spjd			 * dbuf with any data allocated from the ARC.
2810168404Spjd			 */
2811219089Spjd			ASSERT(db->db_state == DB_UNCACHED ||
2812219089Spjd			    db->db_state == DB_NOFILL);
2813307266Smav			dbuf_destroy(db);
2814168404Spjd		} else if (arc_released(db->db_buf)) {
2815168404Spjd			/*
2816168404Spjd			 * This dbuf has anonymous data associated with it.
2817168404Spjd			 */
2818307266Smav			dbuf_destroy(db);
2819168404Spjd		} else {
2820307266Smav			boolean_t do_arc_evict = B_FALSE;
2821307266Smav			blkptr_t bp;
2822307266Smav			spa_t *spa = dmu_objset_spa(db->db_objset);
2823242845Sdelphij
2824307266Smav			if (!DBUF_IS_CACHEABLE(db) &&
2825307266Smav			    db->db_blkptr != NULL &&
2826307266Smav			    !BP_IS_HOLE(db->db_blkptr) &&
2827307266Smav			    !BP_IS_EMBEDDED(db->db_blkptr)) {
2828307266Smav				do_arc_evict = B_TRUE;
2829307266Smav				bp = *db->db_blkptr;
2830307266Smav			}
2831307266Smav
2832307266Smav			if (!DBUF_IS_CACHEABLE(db) ||
2833307266Smav			    db->db_pending_evict) {
2834307266Smav				dbuf_destroy(db);
2835307266Smav			} else if (!multilist_link_active(&db->db_cache_link)) {
2836307266Smav				multilist_insert(&dbuf_cache, db);
2837307266Smav				(void) refcount_add_many(&dbuf_cache_size,
2838307266Smav				    db->db.db_size, db);
2839185029Spjd				mutex_exit(&db->db_mtx);
2840307266Smav
2841307266Smav				dbuf_evict_notify();
2842269417Sdelphij			}
2843307266Smav
2844307266Smav			if (do_arc_evict)
2845307266Smav				arc_freed(spa, &bp);
2846168404Spjd		}
2847168404Spjd	} else {
2848168404Spjd		mutex_exit(&db->db_mtx);
2849168404Spjd	}
2851168404Spjd}
2852168404Spjd
2853168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount
2854168404Spjduint64_t
2855168404Spjddbuf_refcount(dmu_buf_impl_t *db)
2856168404Spjd{
2857168404Spjd	return (refcount_count(&db->db_holds));
2858168404Spjd}
2859168404Spjd
2860168404Spjdvoid *
2861288549Smavdmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2862288549Smav    dmu_buf_user_t *new_user)
2863168404Spjd{
2864288549Smav	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2865288549Smav
2866288549Smav	mutex_enter(&db->db_mtx);
2867288549Smav	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2868288549Smav	if (db->db_user == old_user)
2869288549Smav		db->db_user = new_user;
2870288549Smav	else
2871288549Smav		old_user = db->db_user;
2872288549Smav	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2873288549Smav	mutex_exit(&db->db_mtx);
2874288549Smav
2875288549Smav	return (old_user);
2876168404Spjd}
2877168404Spjd
2878168404Spjdvoid *
2879288549Smavdmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2880168404Spjd{
2881288549Smav	return (dmu_buf_replace_user(db_fake, NULL, user));
2882288549Smav}
2883288549Smav
2884288549Smavvoid *
2885288549Smavdmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2886288549Smav{
2887168404Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2888168404Spjd
2889290754Smav	db->db_user_immediate_evict = TRUE;
2890288549Smav	return (dmu_buf_set_user(db_fake, user));
2891168404Spjd}
2892168404Spjd
2893168404Spjdvoid *
2894288549Smavdmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2895168404Spjd{
2896288549Smav	return (dmu_buf_replace_user(db_fake, user, NULL));
2897168404Spjd}
2898168404Spjd
2899168404Spjdvoid *
2900168404Spjddmu_buf_get_user(dmu_buf_t *db_fake)
2901168404Spjd{
2902168404Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2903168404Spjd
2904288549Smav	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2905288549Smav	return (db->db_user);
2906168404Spjd}
2907168404Spjd
2908288549Smavvoid
2909288549Smavdmu_buf_user_evict_wait()
2910288549Smav{
2911288549Smav	taskq_wait(dbu_evict_taskq);
2912288549Smav}
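
#ifdef ZFS_DBUF_EXAMPLES
/*
 * Editorial sketch (hypothetical, compiled out): attach and detach opaque
 * user state with the interfaces above.  The embedded dmu_buf_user_t is
 * presumed to have been set up beforehand with dmu_buf_init_user()
 * (declared in dmu.h).
 */
static void
dbuf_user_example(dmu_buf_t *db, dmu_buf_user_t *dbu)
{
	/* set_user returns the previous user, NULL if none was attached */
	VERIFY3P(dmu_buf_set_user(db, dbu), ==, NULL);
	ASSERT3P(dmu_buf_get_user(db), ==, dbu);
	/* remove_user returns dbu when it was the registered user */
	VERIFY3P(dmu_buf_remove_user(db, dbu), ==, dbu);
}
#endif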
2913288549Smav
2914209962Smmboolean_t
2915209962Smmdmu_buf_freeable(dmu_buf_t *dbuf)
2916209962Smm{
2917209962Smm	boolean_t res = B_FALSE;
2918209962Smm	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2919209962Smm
2920209962Smm	if (db->db_blkptr)
2921209962Smm		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2922219089Spjd		    db->db_blkptr, db->db_blkptr->blk_birth);
2923209962Smm
2924209962Smm	return (res);
2925209962Smm}
2926209962Smm
2927243524Smmblkptr_t *
2928243524Smmdmu_buf_get_blkptr(dmu_buf_t *db)
2929243524Smm{
2930243524Smm	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2931243524Smm	return (dbi->db_blkptr);
2932243524Smm}
2933243524Smm
2934307287Smavobjset_t *
2935307287Smavdmu_buf_get_objset(dmu_buf_t *db)
2936307287Smav{
2937307287Smav	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2938307287Smav	return (dbi->db_objset);
2939307287Smav}
2940307287Smav
2941307292Smavdnode_t *
2942307292Smavdmu_buf_dnode_enter(dmu_buf_t *db)
2943307292Smav{
2944307292Smav	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2945307292Smav	DB_DNODE_ENTER(dbi);
2946307292Smav	return (DB_DNODE(dbi));
2947307292Smav}
2948307292Smav
2949307292Smavvoid
2950307292Smavdmu_buf_dnode_exit(dmu_buf_t *db)
2951307292Smav{
2952307292Smav	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2953307292Smav	DB_DNODE_EXIT(dbi);
2954307292Smav}
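
#ifdef ZFS_DBUF_EXAMPLES
/*
 * Editorial sketch (hypothetical, compiled out): the dnode returned by
 * dmu_buf_dnode_enter() is only guaranteed valid until the matching
 * dmu_buf_dnode_exit() call, so copy out what you need in between.
 */
static uint64_t
dbuf_dnode_example(dmu_buf_t *db)
{
	dnode_t *dn = dmu_buf_dnode_enter(db);
	uint64_t object = dn->dn_object;

	dmu_buf_dnode_exit(db);
	return (object);
}
#endif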
2955307292Smav
2956168404Spjdstatic void
2957168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2958168404Spjd{
2959168404Spjd	/* ASSERT(dmu_tx_is_syncing(tx)) */
2960168404Spjd	ASSERT(MUTEX_HELD(&db->db_mtx));
2961168404Spjd
2962168404Spjd	if (db->db_blkptr != NULL)
2963168404Spjd		return;
2964168404Spjd
2965219089Spjd	if (db->db_blkid == DMU_SPILL_BLKID) {
2966219089Spjd		db->db_blkptr = &dn->dn_phys->dn_spill;
2967219089Spjd		BP_ZERO(db->db_blkptr);
2968219089Spjd		return;
2969219089Spjd	}
2970168404Spjd	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2971168404Spjd		/*
2972168404Spjd		 * This buffer was allocated at a time when there were
2973168404Spjd		 * no available blkptrs from the dnode, or it was
2974168404Spjd		 * inappropriate to hook it in (i.e., an nlevels mismatch).
2975168404Spjd		 */
2976168404Spjd		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2977168404Spjd		ASSERT(db->db_parent == NULL);
2978168404Spjd		db->db_parent = dn->dn_dbuf;
2979168404Spjd		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2980168404Spjd		DBUF_VERIFY(db);
2981168404Spjd	} else {
2982168404Spjd		dmu_buf_impl_t *parent = db->db_parent;
2983168404Spjd		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2984168404Spjd
2985168404Spjd		ASSERT(dn->dn_phys->dn_nlevels > 1);
2986168404Spjd		if (parent == NULL) {
2987168404Spjd			mutex_exit(&db->db_mtx);
2988168404Spjd			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2989288571Smav			parent = dbuf_hold_level(dn, db->db_level + 1,
2990288571Smav			    db->db_blkid >> epbs, db);
2991168404Spjd			rw_exit(&dn->dn_struct_rwlock);
2992168404Spjd			mutex_enter(&db->db_mtx);
2993168404Spjd			db->db_parent = parent;
2994168404Spjd		}
2995168404Spjd		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2996168404Spjd		    (db->db_blkid & ((1ULL << epbs) - 1));
2997168404Spjd		DBUF_VERIFY(db);
2998168404Spjd	}
2999168404Spjd}
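
/*
 * Editorial note (assumed, not in the original): with the default 128K
 * indirect block (dn_indblkshift == 17) and SPA_BLKPTRSHIFT == 7, epbs is
 * 10, so each indirect block holds 1024 block pointers; a dbuf's slot in
 * its parent is (db_blkid & 1023) and its parent's blkid is
 * (db_blkid >> 10), matching the arithmetic in dbuf_check_blkptr() above.
 */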
3000168404Spjd
3001168404Spjdstatic void
3002168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3003168404Spjd{
3004168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
3005219089Spjd	dnode_t *dn;
3006168404Spjd	zio_t *zio;
3007168404Spjd
3008168404Spjd	ASSERT(dmu_tx_is_syncing(tx));
3009168404Spjd
3010168404Spjd	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
3011168404Spjd
3012168404Spjd	mutex_enter(&db->db_mtx);
3013168404Spjd
3014168404Spjd	ASSERT(db->db_level > 0);
3015168404Spjd	DBUF_VERIFY(db);
3016168404Spjd
3017251629Sdelphij	/* Read the block if it hasn't been read yet. */
3018168404Spjd	if (db->db_buf == NULL) {
3019168404Spjd		mutex_exit(&db->db_mtx);
3020168404Spjd		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
3021168404Spjd		mutex_enter(&db->db_mtx);
3022168404Spjd	}
3023168404Spjd	ASSERT3U(db->db_state, ==, DB_CACHED);
3024168404Spjd	ASSERT(db->db_buf != NULL);
3025168404Spjd
3026219089Spjd	DB_DNODE_ENTER(db);
3027219089Spjd	dn = DB_DNODE(db);
3028251629Sdelphij	/* Indirect block size must match what the dnode thinks it is. */
3029219089Spjd	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
3030168404Spjd	dbuf_check_blkptr(dn, db);
3031219089Spjd	DB_DNODE_EXIT(db);
3032168404Spjd
3033251629Sdelphij	/* Provide the pending dirty record to child dbufs */
3034168404Spjd	db->db_data_pending = dr;
3035168404Spjd
3036168404Spjd	mutex_exit(&db->db_mtx);
3037185029Spjd	dbuf_write(dr, db->db_buf, tx);
3038168404Spjd
3039168404Spjd	zio = dr->dr_zio;
3040168404Spjd	mutex_enter(&dr->dt.di.dr_mtx);
3041285202Savg	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
3042168404Spjd	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3043168404Spjd	mutex_exit(&dr->dt.di.dr_mtx);
3044168404Spjd	zio_nowait(zio);
3045168404Spjd}
3046168404Spjd
3047168404Spjdstatic void
3048168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3049168404Spjd{
3050168404Spjd	arc_buf_t **datap = &dr->dt.dl.dr_data;
3051168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
3052219089Spjd	dnode_t *dn;
3053219089Spjd	objset_t *os;
3054168404Spjd	uint64_t txg = tx->tx_txg;
3055168404Spjd
3056168404Spjd	ASSERT(dmu_tx_is_syncing(tx));
3057168404Spjd
3058168404Spjd	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
3059168404Spjd
3060168404Spjd	mutex_enter(&db->db_mtx);
3061168404Spjd	/*
3062168404Spjd	 * To be synced, we must be dirtied.  But we
3063168404Spjd	 * might have been freed after the dirty.
3064168404Spjd	 */
3065168404Spjd	if (db->db_state == DB_UNCACHED) {
3066168404Spjd		/* This buffer has been freed since it was dirtied */
3067168404Spjd		ASSERT(db->db.db_data == NULL);
3068168404Spjd	} else if (db->db_state == DB_FILL) {
3069168404Spjd		/* This buffer was freed and is now being re-filled */
3070168404Spjd		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
3071168404Spjd	} else {
3072219089Spjd		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
3073168404Spjd	}
3074168404Spjd	DBUF_VERIFY(db);
3075168404Spjd
3076219089Spjd	DB_DNODE_ENTER(db);
3077219089Spjd	dn = DB_DNODE(db);
3078219089Spjd
3079219089Spjd	if (db->db_blkid == DMU_SPILL_BLKID) {
3080219089Spjd		mutex_enter(&dn->dn_mtx);
3081219089Spjd		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
3082219089Spjd		mutex_exit(&dn->dn_mtx);
3083219089Spjd	}
3084219089Spjd
3085168404Spjd	/*
3086168404Spjd	 * If this is a bonus buffer, simply copy the bonus data into the
3087168404Spjd	 * dnode.  It will be written out when the dnode is synced (and it
3088168404Spjd	 * will be synced, since it must have been dirty for dbuf_sync to
3089168404Spjd	 * be called).
3090168404Spjd	 */
3091219089Spjd	if (db->db_blkid == DMU_BONUS_BLKID) {
3092168404Spjd		dbuf_dirty_record_t **drp;
3093185029Spjd
3094168404Spjd		ASSERT(*datap != NULL);
3095240415Smm		ASSERT0(db->db_level);
3096168404Spjd		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
3097168404Spjd		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
3098219089Spjd		DB_DNODE_EXIT(db);
3099219089Spjd
3100185029Spjd		if (*datap != db->db.db_data) {
3101168404Spjd			zio_buf_free(*datap, DN_MAX_BONUSLEN);
3102208373Smm			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
3103185029Spjd		}
3104168404Spjd		db->db_data_pending = NULL;
3105168404Spjd		drp = &db->db_last_dirty;
3106168404Spjd		while (*drp != dr)
3107168404Spjd			drp = &(*drp)->dr_next;
3108185029Spjd		ASSERT(dr->dr_next == NULL);
3109219089Spjd		ASSERT(dr->dr_dbuf == db);
3110185029Spjd		*drp = dr->dr_next;
3111169325Spjd		if (dr->dr_dbuf->db_level != 0) {
3112169325Spjd			list_destroy(&dr->dt.di.dr_children);
3113169325Spjd			mutex_destroy(&dr->dt.di.dr_mtx);
3114169325Spjd		}
3115168404Spjd		kmem_free(dr, sizeof (dbuf_dirty_record_t));
3116168404Spjd		ASSERT(db->db_dirtycnt > 0);
3117168404Spjd		db->db_dirtycnt -= 1;
3118219089Spjd		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
3119168404Spjd		return;
3120168404Spjd	}
3121168404Spjd
3122219089Spjd	os = dn->dn_objset;
3123219089Spjd
3124168404Spjd	/*
3125185029Spjd	 * This function may have dropped the db_mtx lock allowing a dmu_sync
3126185029Spjd	 * operation to sneak in. As a result, we need to ensure that we
3127185029Spjd	 * don't check the dr_override_state until we have returned from
3128185029Spjd	 * dbuf_check_blkptr.
3129185029Spjd	 */
3130185029Spjd	dbuf_check_blkptr(dn, db);
3131185029Spjd
3132185029Spjd	/*
3133219089Spjd	 * If this buffer is in the middle of an immediate write,
3134168404Spjd	 * wait for the synchronous IO to complete.
3135168404Spjd	 */
3136168404Spjd	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
3137168404Spjd		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
3138168404Spjd		cv_wait(&db->db_changed, &db->db_mtx);
3139168404Spjd		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
3140168404Spjd	}
3141168404Spjd
3142219089Spjd	if (db->db_state != DB_NOFILL &&
3143219089Spjd	    dn->dn_object != DMU_META_DNODE_OBJECT &&
3144208050Smm	    refcount_count(&db->db_holds) > 1 &&
3145219089Spjd	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
3146208050Smm	    *datap == db->db_buf) {
3147168404Spjd		/*
3148208050Smm		 * If this buffer is currently "in use" (i.e., there
3149208050Smm		 * are active holds and db_data still references it),
3150208050Smm		 * then make a copy before we start the write so that
3151208050Smm		 * any modifications from the open txg will not leak
3152208050Smm		 * into this write.
3153168404Spjd		 *
3154208050Smm		 * NOTE: this copy does not need to be made for
3155208050Smm		 * objects only modified in the syncing context (e.g.
3156208050Smm		 * DMU_META_DNODE blocks).
3157168404Spjd		 */
3158208050Smm		int blksz = arc_buf_size(*datap);
3159208050Smm		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
3160307266Smav		*datap = arc_alloc_buf(os->os_spa, blksz, db, type);
3161208050Smm		bcopy(db->db.db_data, (*datap)->b_data, blksz);
3162168404Spjd	}
3163168404Spjd	db->db_data_pending = dr;
3164168404Spjd
3165168404Spjd	mutex_exit(&db->db_mtx);
3166168404Spjd
3167185029Spjd	dbuf_write(dr, *datap, tx);
3168168404Spjd
3169168404Spjd	ASSERT(!list_link_active(&dr->dr_dirty_node));
3170219089Spjd	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
3171168404Spjd		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
3172219089Spjd		DB_DNODE_EXIT(db);
3173219089Spjd	} else {
3174219089Spjd		/*
3175219089Spjd		 * Although zio_nowait() does not "wait for an IO", it does
3176219089Spjd		 * initiate the IO. If this is an empty write it seems plausible
3177219089Spjd		 * that the IO could actually be completed before the nowait
3178219089Spjd		 * returns. We need to DB_DNODE_EXIT() first in case
3179219089Spjd		 * zio_nowait() invalidates the dbuf.
3180219089Spjd		 */
3181219089Spjd		DB_DNODE_EXIT(db);
3182168404Spjd		zio_nowait(dr->dr_zio);
3183219089Spjd	}
3184168404Spjd}
3185168404Spjd
3186168404Spjdvoid
3187285202Savgdbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
3188168404Spjd{
3189168404Spjd	dbuf_dirty_record_t *dr;
3190168404Spjd
3191168404Spjd	while ((dr = list_head(list)) != NULL) {
3192168404Spjd		if (dr->dr_zio != NULL) {
3193168404Spjd			/*
3194168404Spjd			 * If we find an already initialized zio then we
3195168404Spjd			 * are processing the meta-dnode, and we have finished.
3196168404Spjd			 * The dbufs for all dnodes are put back on the list
3197168404Spjd			 * during processing, so that we can zio_wait()
3198168404Spjd			 * these IOs after initiating all child IOs.
3199168404Spjd			 */
3200168404Spjd			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
3201168404Spjd			    DMU_META_DNODE_OBJECT);
3202168404Spjd			break;
3203168404Spjd		}
3204285202Savg		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
3205285202Savg		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
3206285202Savg			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
3207285202Savg		}
3208168404Spjd		list_remove(list, dr);
3209168404Spjd		if (dr->dr_dbuf->db_level > 0)
3210168404Spjd			dbuf_sync_indirect(dr, tx);
3211168404Spjd		else
3212168404Spjd			dbuf_sync_leaf(dr, tx);
3213168404Spjd	}
3214168404Spjd}
3215168404Spjd
3216168404Spjd/* ARGSUSED */
3217168404Spjdstatic void
3218168404Spjddbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
3219168404Spjd{
3220168404Spjd	dmu_buf_impl_t *db = vdb;
3221219089Spjd	dnode_t *dn;
3222185029Spjd	blkptr_t *bp = zio->io_bp;
3223168404Spjd	blkptr_t *bp_orig = &zio->io_bp_orig;
3224219089Spjd	spa_t *spa = zio->io_spa;
3225219089Spjd	int64_t delta;
3226168404Spjd	uint64_t fill = 0;
3227219089Spjd	int i;
3228168404Spjd
3229304136Savg	ASSERT3P(db->db_blkptr, !=, NULL);
3230304136Savg	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
3231185029Spjd
3232219089Spjd	DB_DNODE_ENTER(db);
3233219089Spjd	dn = DB_DNODE(db);
3234219089Spjd	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
3235219089Spjd	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
3236219089Spjd	zio->io_prev_space_delta = delta;
3237168404Spjd
3238263397Sdelphij	if (bp->blk_birth != 0) {
3239263397Sdelphij		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
3240263397Sdelphij		    BP_GET_TYPE(bp) == dn->dn_type) ||
3241263397Sdelphij		    (db->db_blkid == DMU_SPILL_BLKID &&
3242268649Sdelphij		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
3243268649Sdelphij		    BP_IS_EMBEDDED(bp));
3244263397Sdelphij		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
3245168404Spjd	}
3246168404Spjd
3247168404Spjd	mutex_enter(&db->db_mtx);
3248168404Spjd
3249219089Spjd#ifdef ZFS_DEBUG
3250219089Spjd	if (db->db_blkid == DMU_SPILL_BLKID) {
3251219089Spjd		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
3252304136Savg		ASSERT(!(BP_IS_HOLE(bp)) &&
3253219089Spjd		    db->db_blkptr == &dn->dn_phys->dn_spill);
3254219089Spjd	}
3255219089Spjd#endif
3256219089Spjd
3257168404Spjd	if (db->db_level == 0) {
3258168404Spjd		mutex_enter(&dn->dn_mtx);
3259219089Spjd		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
3260219089Spjd		    db->db_blkid != DMU_SPILL_BLKID)
3261168404Spjd			dn->dn_phys->dn_maxblkid = db->db_blkid;
3262168404Spjd		mutex_exit(&dn->dn_mtx);
3263168404Spjd
3264168404Spjd		if (dn->dn_type == DMU_OT_DNODE) {
3265168404Spjd			dnode_phys_t *dnp = db->db.db_data;
3266168404Spjd			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
3267168404Spjd			    i--, dnp++) {
3268168404Spjd				if (dnp->dn_type != DMU_OT_NONE)
3269168404Spjd					fill++;
3270168404Spjd			}
3271168404Spjd		} else {
3272263397Sdelphij			if (BP_IS_HOLE(bp)) {
3273263397Sdelphij				fill = 0;
3274263397Sdelphij			} else {
3275263397Sdelphij				fill = 1;
3276263397Sdelphij			}
3277168404Spjd		}
3278168404Spjd	} else {
3279185029Spjd		blkptr_t *ibp = db->db.db_data;
3280168404Spjd		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
3281185029Spjd		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
3282185029Spjd			if (BP_IS_HOLE(ibp))
3283168404Spjd				continue;
3284268649Sdelphij			fill += BP_GET_FILL(ibp);
3285168404Spjd		}
3286168404Spjd	}
3287219089Spjd	DB_DNODE_EXIT(db);
3288168404Spjd
3289268649Sdelphij	if (!BP_IS_EMBEDDED(bp))
3290268649Sdelphij		bp->blk_fill = fill;
3291168404Spjd
3292168404Spjd	mutex_exit(&db->db_mtx);
3293304136Savg
3294304136Savg	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
3295304136Savg	*db->db_blkptr = *bp;
3296304136Savg	rw_exit(&dn->dn_struct_rwlock);
3297168404Spjd}
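
/*
 * Editorial note (assumed, not in the original): for a DMU_OT_DNODE block
 * the fill count computed above is the number of allocated dnodes, e.g. a
 * 16K block holds 16K >> DNODE_SHIFT == 32 dnode_phys_t slots (with
 * DNODE_SHIFT == 9) and only those with dn_type != DMU_OT_NONE are
 * counted; for an indirect block it is the sum of the children's
 * BP_GET_FILL() values.
 */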
3298168404Spjd
3299304139Savg/* ARGSUSED */
3300260763Savg/*
3301304139Savg * This function gets called just prior to running through the compression
3302304139Savg * stage of the zio pipeline. If we're an indirect block comprised of only
3303304139Savg * holes, then we want this indirect to be compressed away to a hole. In
3304304139Savg * order to do that we must zero out any information about the holes that
3305304139Savg * this indirect points to before we try to compress it.
3306304139Savg */
3307304139Savgstatic void
3308304139Savgdbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
3309304139Savg{
3310304139Savg	dmu_buf_impl_t *db = vdb;
3311304139Savg	dnode_t *dn;
3312304139Savg	blkptr_t *bp;
3313304139Savg	uint64_t i;
3314304139Savg	int epbs;
3315304139Savg
3316304139Savg	ASSERT3U(db->db_level, >, 0);
3317304139Savg	DB_DNODE_ENTER(db);
3318304139Savg	dn = DB_DNODE(db);
3319304139Savg	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3320304139Savg
3321304139Savg	/* Determine if all our children are holes */
3322304139Savg	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
3323304139Savg		if (!BP_IS_HOLE(bp))
3324304139Savg			break;
3325304139Savg	}
3326304139Savg
3327304139Savg	/*
3328304139Savg	 * If all the children are holes, then zero them all out so that
3329304139Savg	 * this block may be compressed away to a hole.
3330304139Savg	 */
3331304139Savg	if (i == 1 << epbs) {
3332304139Savg		/* didn't find any non-holes */
3333304139Savg		bzero(db->db.db_data, db->db.db_size);
3334304139Savg	}
3335304139Savg	DB_DNODE_EXIT(db);
3336304139Savg}
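
/*
 * Editorial note (assumed, not in the original): with dn_indblkshift == 17
 * the loop above scans 1 << 10 == 1024 block pointers; only when all 1024
 * are holes is the buffer zeroed, after which the compression stage can
 * turn the entire indirect block into a hole.
 */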
3337304139Savg
3338304139Savg/*
3339260763Savg * The SPA will call this callback several times for each zio - once
3340260763Savg * for every physical child i/o (zio->io_phys_children times).  This
3341260763Savg * allows the DMU to monitor the progress of each logical i/o.  For example,
3342260763Savg * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
3343260763Savg * block.  There may be a long delay before all copies/fragments are completed,
3344260763Savg * so this callback allows us to retire dirty space gradually, as the physical
3345260763Savg * i/os complete.
3346260763Savg */
3347168404Spjd/* ARGSUSED */
3348168404Spjdstatic void
3349260763Savgdbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
3350260763Savg{
3351260763Savg	dmu_buf_impl_t *db = arg;
3352260763Savg	objset_t *os = db->db_objset;
3353260763Savg	dsl_pool_t *dp = dmu_objset_pool(os);
3354260763Savg	dbuf_dirty_record_t *dr;
3355260763Savg	int delta = 0;
3356260763Savg
3357260763Savg	dr = db->db_data_pending;
3358260763Savg	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
3359260763Savg
3360260763Savg	/*
3361260763Savg	 * The callback will be called io_phys_children times.  Retire one
3362260763Savg	 * portion of our dirty space each time we are called.  Any rounding
3363260763Savg	 * error will be cleaned up by dsl_pool_sync()'s call to
3364260763Savg	 * dsl_pool_undirty_space().
3365260763Savg	 */
3366260763Savg	delta = dr->dr_accounted / zio->io_phys_children;
3367260763Savg	dsl_pool_undirty_space(dp, delta, zio->io_txg);
3368260763Savg}
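
/*
 * Editorial note (assumed, not in the original): if dr_accounted is 384K
 * and the block was written as three physical children (e.g. three
 * copies), each callback above retires 384K / 3 == 128K of dirty space;
 * any remainder from the integer division is swept up by dsl_pool_sync()'s
 * call to dsl_pool_undirty_space().
 */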
3369260763Savg
3370260763Savg/* ARGSUSED */
3371260763Savgstatic void
3372168404Spjddbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
3373168404Spjd{
3374168404Spjd	dmu_buf_impl_t *db = vdb;
3375219089Spjd	blkptr_t *bp_orig = &zio->io_bp_orig;
3376263397Sdelphij	blkptr_t *bp = db->db_blkptr;
3377263397Sdelphij	objset_t *os = db->db_objset;
3378263397Sdelphij	dmu_tx_t *tx = os->os_synctx;
3379168404Spjd	dbuf_dirty_record_t **drp, *dr;
3380168404Spjd
3381240415Smm	ASSERT0(zio->io_error);
3382219089Spjd	ASSERT(db->db_blkptr == bp);
3383168404Spjd
3384243524Smm	/*
3385243524Smm	 * For nopwrites and rewrites we ensure that the bp matches our
3386243524Smm	 * original and bypass all the accounting.
3387243524Smm	 */
3388243524Smm	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
3389219089Spjd		ASSERT(BP_EQUAL(bp, bp_orig));
3390219089Spjd	} else {
3391263397Sdelphij		dsl_dataset_t *ds = os->os_dsl_dataset;
3392219089Spjd		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
3393219089Spjd		dsl_dataset_block_born(ds, bp, tx);
3394219089Spjd	}
3395219089Spjd
3396168404Spjd	mutex_enter(&db->db_mtx);
3397168404Spjd
3398219089Spjd	DBUF_VERIFY(db);
3399219089Spjd
3400168404Spjd	drp = &db->db_last_dirty;
3401185029Spjd	while ((dr = *drp) != db->db_data_pending)
3402185029Spjd		drp = &dr->dr_next;
3403185029Spjd	ASSERT(!list_link_active(&dr->dr_dirty_node));
3404219089Spjd	ASSERT(dr->dr_dbuf == db);
3405185029Spjd	ASSERT(dr->dr_next == NULL);
3406185029Spjd	*drp = dr->dr_next;
3407168404Spjd
3408219089Spjd#ifdef ZFS_DEBUG
3409219089Spjd	if (db->db_blkid == DMU_SPILL_BLKID) {
3410219089Spjd		dnode_t *dn;
3411219089Spjd
3412219089Spjd		DB_DNODE_ENTER(db);
3413219089Spjd		dn = DB_DNODE(db);
3414219089Spjd		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
3415219089Spjd		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
3416219089Spjd		    db->db_blkptr == &dn->dn_phys->dn_spill);
3417219089Spjd		DB_DNODE_EXIT(db);
3418219089Spjd	}
3419219089Spjd#endif
3420219089Spjd
3421168404Spjd	if (db->db_level == 0) {
3422219089Spjd		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3423168404Spjd		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
3424219089Spjd		if (db->db_state != DB_NOFILL) {
3425219089Spjd			if (dr->dt.dl.dr_data != db->db_buf)
3426307266Smav				arc_buf_destroy(dr->dt.dl.dr_data, db);
3427219089Spjd		}
3428168404Spjd	} else {
3429219089Spjd		dnode_t *dn;
3430168404Spjd
3431219089Spjd		DB_DNODE_ENTER(db);
3432219089Spjd		dn = DB_DNODE(db);
3433168404Spjd		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3434263397Sdelphij		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
3435168404Spjd		if (!BP_IS_HOLE(db->db_blkptr)) {
3436168404Spjd			int epbs =
3437168404Spjd			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3438263397Sdelphij			ASSERT3U(db->db_blkid, <=,
3439263397Sdelphij			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
3440168404Spjd			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
3441168404Spjd			    db->db.db_size);
3442168404Spjd		}
3443219089Spjd		DB_DNODE_EXIT(db);
3444185029Spjd		mutex_destroy(&dr->dt.di.dr_mtx);
3445169325Spjd		list_destroy(&dr->dt.di.dr_children);
3446168404Spjd	}
3447168404Spjd	kmem_free(dr, sizeof (dbuf_dirty_record_t));
3448168404Spjd
3449168404Spjd	cv_broadcast(&db->db_changed);
3450168404Spjd	ASSERT(db->db_dirtycnt > 0);
3451168404Spjd	db->db_dirtycnt -= 1;
3452168404Spjd	db->db_data_pending = NULL;
3453263397Sdelphij	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
3454219089Spjd}
3455219089Spjd
3456219089Spjdstatic void
3457219089Spjddbuf_write_nofill_ready(zio_t *zio)
3458219089Spjd{
3459219089Spjd	dbuf_write_ready(zio, NULL, zio->io_private);
3460219089Spjd}
3461219089Spjd
3462219089Spjdstatic void
3463219089Spjddbuf_write_nofill_done(zio_t *zio)
3464219089Spjd{
3465219089Spjd	dbuf_write_done(zio, NULL, zio->io_private);
3466219089Spjd}
3467219089Spjd
3468219089Spjdstatic void
3469219089Spjddbuf_write_override_ready(zio_t *zio)
3470219089Spjd{
3471219089Spjd	dbuf_dirty_record_t *dr = zio->io_private;
3472219089Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
3473219089Spjd
3474219089Spjd	dbuf_write_ready(zio, NULL, db);
3475219089Spjd}
3476219089Spjd
3477219089Spjdstatic void
3478219089Spjddbuf_write_override_done(zio_t *zio)
3479219089Spjd{
3480219089Spjd	dbuf_dirty_record_t *dr = zio->io_private;
3481219089Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
3482219089Spjd	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3483219089Spjd
3484219089Spjd	mutex_enter(&db->db_mtx);
3485219089Spjd	if (!BP_EQUAL(zio->io_bp, obp)) {
3486219089Spjd		if (!BP_IS_HOLE(obp))
3487219089Spjd			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3488219089Spjd		arc_release(dr->dt.dl.dr_data, db);
3489219089Spjd	}
3490168404Spjd	mutex_exit(&db->db_mtx);
3491168404Spjd
3492219089Spjd	dbuf_write_done(zio, NULL, db);
3493219089Spjd}
3494168404Spjd
3495251629Sdelphij/* Issue I/O to commit a dirty buffer to disk. */
3496219089Spjdstatic void
3497219089Spjddbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
3498219089Spjd{
3499219089Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
3500219089Spjd	dnode_t *dn;
3501219089Spjd	objset_t *os;
3502219089Spjd	dmu_buf_impl_t *parent = db->db_parent;
3503219089Spjd	uint64_t txg = tx->tx_txg;
3504268657Sdelphij	zbookmark_phys_t zb;
3505219089Spjd	zio_prop_t zp;
3506219089Spjd	zio_t *zio;
3507219089Spjd	int wp_flag = 0;
3508219089Spjd
3509304136Savg	ASSERT(dmu_tx_is_syncing(tx));
3510304136Savg
3511219089Spjd	DB_DNODE_ENTER(db);
3512219089Spjd	dn = DB_DNODE(db);
3513219089Spjd	os = dn->dn_objset;
3514219089Spjd
3515219089Spjd	if (db->db_state != DB_NOFILL) {
3516219089Spjd		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
3517219089Spjd			/*
3518219089Spjd			 * Private object buffers are released here rather
3519219089Spjd			 * than in dbuf_dirty() since they are only modified
3520219089Spjd			 * in the syncing context and we don't want the
3521219089Spjd			 * overhead of making multiple copies of the data.
3522219089Spjd			 */
3523219089Spjd			if (BP_IS_HOLE(db->db_blkptr)) {
3524219089Spjd				arc_buf_thaw(data);
3525219089Spjd			} else {
3526219089Spjd				dbuf_release_bp(db);
3527219089Spjd			}
3528219089Spjd		}
3529219089Spjd	}
3530219089Spjd
3531219089Spjd	if (parent != dn->dn_dbuf) {
3532251629Sdelphij		/* Our parent is an indirect block. */
3533251629Sdelphij		/* We have a dirty parent that has been scheduled for write. */
3534219089Spjd		ASSERT(parent && parent->db_data_pending);
3535251629Sdelphij		/* Our parent's buffer is one level closer to the dnode. */
3536219089Spjd		ASSERT(db->db_level == parent->db_level-1);
3537251629Sdelphij		/*
3538251629Sdelphij		 * We're about to modify our parent's db_data by modifying
3539251629Sdelphij		 * our block pointer, so the parent must be released.
3540251629Sdelphij		 */
3541219089Spjd		ASSERT(arc_released(parent->db_buf));
3542219089Spjd		zio = parent->db_data_pending->dr_zio;
3543219089Spjd	} else {
3544251629Sdelphij		/* Our parent is the dnode itself. */
3545219089Spjd		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
3546219089Spjd		    db->db_blkid != DMU_SPILL_BLKID) ||
3547219089Spjd		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
3548219089Spjd		if (db->db_blkid != DMU_SPILL_BLKID)
3549219089Spjd			ASSERT3P(db->db_blkptr, ==,
3550219089Spjd			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
3551219089Spjd		zio = dn->dn_zio;
3552219089Spjd	}
3553219089Spjd
3554219089Spjd	ASSERT(db->db_level == 0 || data == db->db_buf);
3555219089Spjd	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
3556219089Spjd	ASSERT(zio);
3557219089Spjd
3558219089Spjd	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
3559219089Spjd	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
3560219089Spjd	    db->db.db_object, db->db_level, db->db_blkid);
3561219089Spjd
3562219089Spjd	if (db->db_blkid == DMU_SPILL_BLKID)
3563219089Spjd		wp_flag = WP_SPILL;
3564219089Spjd	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
3565219089Spjd
3566219089Spjd	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
3567219089Spjd	DB_DNODE_EXIT(db);
3568219089Spjd
3569304136Savg	/*
3570304136Savg	 * We copy the blkptr now (rather than when we instantiate the dirty
3571304136Savg	 * record), because its value can change between open context and
3572304136Savg	 * syncing context. We do not need to hold dn_struct_rwlock to read
3573304136Savg	 * db_blkptr because we are in syncing context.
3574304136Savg	 */
3575304136Savg	dr->dr_bp_copy = *db->db_blkptr;
3576304136Savg
3577268649Sdelphij	if (db->db_level == 0 &&
3578268649Sdelphij	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
3579268649Sdelphij		/*
3580268649Sdelphij		 * The BP for this block has been provided by open context
3581268649Sdelphij		 * (by dmu_sync() or dmu_buf_write_embedded()).
3582268649Sdelphij		 */
3583268649Sdelphij		void *contents = (data != NULL) ? data->b_data : NULL;
3584268649Sdelphij
3585219089Spjd		dr->dr_zio = zio_write(zio, os->os_spa, txg,
3586304136Savg		    &dr->dr_bp_copy, contents, db->db.db_size, &zp,
3587304139Savg		    dbuf_write_override_ready, NULL, NULL,
3588304139Savg		    dbuf_write_override_done,
3589260763Savg		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3590219089Spjd		mutex_enter(&db->db_mtx);
3591219089Spjd		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
3592219089Spjd		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
3593243524Smm		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
3594219089Spjd		mutex_exit(&db->db_mtx);
3595219089Spjd	} else if (db->db_state == DB_NOFILL) {
3596255750Sdelphij		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
3597255750Sdelphij		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
3598219089Spjd		dr->dr_zio = zio_write(zio, os->os_spa, txg,
3599304136Savg		    &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
3600304139Savg		    dbuf_write_nofill_ready, NULL, NULL,
3601304139Savg		    dbuf_write_nofill_done, db,
3602219089Spjd		    ZIO_PRIORITY_ASYNC_WRITE,
3603219089Spjd		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
3604219089Spjd	} else {
3605219089Spjd		ASSERT(arc_released(data));
3606304139Savg
3607304139Savg		/*
3608304139Savg		 * For indirect blocks, we want to setup the children
3609304139Savg		 * ready callback so that we can properly handle an indirect
3610304139Savg		 * block that only contains holes.
3611304139Savg		 */
3612304139Savg		arc_done_func_t *children_ready_cb = NULL;
3613304139Savg		if (db->db_level != 0)
3614304139Savg			children_ready_cb = dbuf_write_children_ready;
3615304139Savg
3616219089Spjd		dr->dr_zio = arc_write(zio, os->os_spa, txg,
3617304136Savg		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
3618307266Smav		    &zp, dbuf_write_ready, children_ready_cb,
3619260763Savg		    dbuf_write_physdone, dbuf_write_done, db,
3620260763Savg		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3621219089Spjd	}
3622168404Spjd}
3623