/* dnode_sync.c revision 285202 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
21163953Srrs
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */
26163953Srrs
27163953Srrs#include <sys/zfs_context.h>
28163953Srrs#include <sys/dbuf.h>
29163953Srrs#include <sys/dnode.h>
30163953Srrs#include <sys/dmu.h>
31163953Srrs#include <sys/dmu_tx.h>
32163953Srrs#include <sys/dmu_objset.h>
33163953Srrs#include <sys/dsl_dataset.h>
34163953Srrs#include <sys/spa.h>
35163953Srrs#include <sys/range_tree.h>
36163953Srrs#include <sys/zfeature.h>
37166086Srrs
38163953Srrsstatic void
39163953Srrsdnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
40163953Srrs{
41163953Srrs	dmu_buf_impl_t *db;
42163953Srrs	int txgoff = tx->tx_txg & TXG_MASK;
43163953Srrs	int nblkptr = dn->dn_phys->dn_nblkptr;
44163953Srrs	int old_toplvl = dn->dn_phys->dn_nlevels - 1;
45163953Srrs	int new_level = dn->dn_next_nlevels[txgoff];
46163953Srrs	int i;
47163953Srrs
48163953Srrs	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
49163953Srrs
50163953Srrs	/* this dnode can't be paged out because it's dirty */
51163953Srrs	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
52163953Srrs	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
53163953Srrs	ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
54163953Srrs
55163953Srrs	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
56163953Srrs	ASSERT(db != NULL);
57163953Srrs
58163953Srrs	dn->dn_phys->dn_nlevels = new_level;
59163953Srrs	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
60163953Srrs	    dn->dn_object, dn->dn_phys->dn_nlevels);
61163953Srrs
62163953Srrs	/* check for existing blkptrs in the dnode */
63163953Srrs	for (i = 0; i < nblkptr; i++)
64163953Srrs		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
65163953Srrs			break;
66163953Srrs	if (i != nblkptr) {
67163953Srrs		/* transfer dnode's block pointers to new indirect block */
68163953Srrs		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
69163953Srrs		ASSERT(db->db.db_data);
70163953Srrs		ASSERT(arc_released(db->db_buf));
71163953Srrs		ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
72163953Srrs		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
73163953Srrs		    sizeof (blkptr_t) * nblkptr);
74163953Srrs		arc_buf_freeze(db->db_buf);
75163953Srrs	}
76163953Srrs
77163953Srrs	/* set dbuf's parent pointers to new indirect buf */
78163953Srrs	for (i = 0; i < nblkptr; i++) {
79163953Srrs		dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
80163953Srrs
81163953Srrs		if (child == NULL)
82163953Srrs			continue;
83163953Srrs#ifdef	DEBUG
84163953Srrs		DB_DNODE_ENTER(child);
85163953Srrs		ASSERT3P(DB_DNODE(child), ==, dn);
86163953Srrs		DB_DNODE_EXIT(child);
87163953Srrs#endif	/* DEBUG */
88163953Srrs		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
89163953Srrs			ASSERT(child->db_parent->db_level == db->db_level);
90163953Srrs			ASSERT(child->db_blkptr !=
91163953Srrs			    &dn->dn_phys->dn_blkptr[child->db_blkid]);
92163953Srrs			mutex_exit(&child->db_mtx);
93163953Srrs			continue;
94163953Srrs		}
95163953Srrs		ASSERT(child->db_parent == NULL ||
96163953Srrs		    child->db_parent == dn->dn_dbuf);
97163953Srrs
98163953Srrs		child->db_parent = db;
99163953Srrs		dbuf_add_ref(db, child);
100163953Srrs		if (db->db.db_data)
101163953Srrs			child->db_blkptr = (blkptr_t *)db->db.db_data + i;
102163953Srrs		else
103163953Srrs			child->db_blkptr = NULL;
104163953Srrs		dprintf_dbuf_bp(child, child->db_blkptr,
105163953Srrs		    "changed db_blkptr to new indirect %s", "");
106163953Srrs
107163953Srrs		mutex_exit(&child->db_mtx);
108163953Srrs	}
109163953Srrs
110163953Srrs	bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
111163953Srrs
112163953Srrs	dbuf_rele(db, FTAG);
113163953Srrs
114163953Srrs	rw_exit(&dn->dn_struct_rwlock);
115163953Srrs}
116163953Srrs
117163953Srrsstatic void
118163953Srrsfree_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
119163953Srrs{
120163953Srrs	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
121163953Srrs	uint64_t bytesfreed = 0;
122163953Srrs
123163953Srrs	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
124163953Srrs
125163953Srrs	for (int i = 0; i < num; i++, bp++) {
126163953Srrs		if (BP_IS_HOLE(bp))
127163953Srrs			continue;
128163953Srrs
129163953Srrs		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
130163953Srrs		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
131163953Srrs
132163953Srrs		/*
133163953Srrs		 * Save some useful information on the holes being
134163953Srrs		 * punched, including logical size, type, and indirection
135163953Srrs		 * level. Retaining birth time enables detection of when
136163953Srrs		 * holes are punched for reducing the number of free
137163953Srrs		 * records transmitted during a zfs send.
138163953Srrs		 */
139163953Srrs
140163953Srrs		uint64_t lsize = BP_GET_LSIZE(bp);
141163953Srrs		dmu_object_type_t type = BP_GET_TYPE(bp);
142163953Srrs		uint64_t lvl = BP_GET_LEVEL(bp);
143163953Srrs
144163953Srrs		bzero(bp, sizeof (blkptr_t));
145163953Srrs
146163953Srrs		if (spa_feature_is_active(dn->dn_objset->os_spa,
147163953Srrs		    SPA_FEATURE_HOLE_BIRTH)) {
148163953Srrs			BP_SET_LSIZE(bp, lsize);
149163953Srrs			BP_SET_TYPE(bp, type);
150163953Srrs			BP_SET_LEVEL(bp, lvl);
151163953Srrs			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
152163953Srrs		}
153163953Srrs	}
154163953Srrs	dnode_diduse_space(dn, -bytesfreed);
155163953Srrs}
156163953Srrs
#ifdef ZFS_DEBUG
/*
 * Debug-only sanity check run before the children of a level-1 indirect
 * block are freed: every level-0 child in the range must contain only
 * zeroes.
 *
 * For each child block that can be held (ENOENT is skipped):
 *   - if it has a dirty record for this txg, that record's data must be
 *     all zero;
 *   - if it has data, is not being filled and has no dirty records at
 *     all, its current data must be all zero.
 * Any non-zero word causes a panic.
 *
 *	db    - level-1 indirect dbuf whose children are checked
 *	start - first child block id (in level-0 block units)
 *	end   - last child block id, inclusive
 *	tx    - syncing transaction
 */
static void
free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	int off, num;
	int i, err, epbs;
	uint64_t txg = tx->tx_txg;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	/* Index of `start' within this indirect block. */
	off = start - (db->db_blkid << epbs);
	num = end - start + 1;

	/*
	 * off and num are signed; use the signed comparison so that a
	 * negative value actually trips the assertion (an unsigned
	 * comparison against zero can never fail).
	 */
	ASSERT3S(off, >=, 0);
	ASSERT3S(num, >=, 0);
	ASSERT3U(db->db_level, >, 0);
	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
	ASSERT(db->db_blkptr != NULL);

	for (i = off; i < off+num; i++) {
		uint64_t *buf;
		dmu_buf_impl_t *child;
		dbuf_dirty_record_t *dr;
		int j;

		ASSERT(db->db_level == 1);

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		err = dbuf_hold_impl(dn, db->db_level-1,
		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
		rw_exit(&dn->dn_struct_rwlock);
		if (err == ENOENT)
			continue;
		ASSERT(err == 0);
		ASSERT(child->db_level == 0);

		/* Skip dirty records that belong to later txgs. */
		dr = child->db_last_dirty;
		while (dr && dr->dr_txg > txg)
			dr = dr->dr_next;
		ASSERT(dr == NULL || dr->dr_txg == txg);

		/* data_old better be zeroed */
		if (dr) {
			buf = dr->dt.dl.dr_data->b_data;
			for (j = 0; j < child->db.db_size >> 3; j++) {
				if (buf[j] != 0) {
					panic("freed data not zero: "
					    "child=%p i=%d off=%d num=%d\n",
					    (void *)child, i, off, num);
				}
			}
		}

		/*
		 * db_data better be zeroed unless it's dirty in a
		 * future txg.
		 */
		mutex_enter(&child->db_mtx);
		buf = child->db.db_data;
		if (buf != NULL && child->db_state != DB_FILL &&
		    child->db_last_dirty == NULL) {
			for (j = 0; j < child->db.db_size >> 3; j++) {
				if (buf[j] != 0) {
					panic("freed data not zero: "
					    "child=%p i=%d off=%d num=%d\n",
					    (void *)child, i, off, num);
				}
			}
		}
		mutex_exit(&child->db_mtx);

		dbuf_rele(child, FTAG);
	}
	DB_DNODE_EXIT(db);
}
#endif
235163953Srrs
236163953Srrsstatic void
237163953Srrsfree_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
238163953Srrs    dmu_tx_t *tx)
239163953Srrs{
240163953Srrs	dnode_t *dn;
241163953Srrs	blkptr_t *bp;
242163953Srrs	dmu_buf_impl_t *subdb;
243163953Srrs	uint64_t start, end, dbstart, dbend, i;
244163953Srrs	int epbs, shift;
245163953Srrs
246163953Srrs	/*
247163953Srrs	 * There is a small possibility that this block will not be cached:
248163953Srrs	 *   1 - if level > 1 and there are no children with level <= 1
249163953Srrs	 *   2 - if this block was evicted since we read it from
250163953Srrs	 *	 dmu_tx_hold_free().
251163953Srrs	 */
252163953Srrs	if (db->db_state != DB_CACHED)
253163953Srrs		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
254163953Srrs
255163953Srrs	dbuf_release_bp(db);
256163953Srrs	bp = db->db.db_data;
257163953Srrs
258163953Srrs	DB_DNODE_ENTER(db);
259163953Srrs	dn = DB_DNODE(db);
260163953Srrs	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
261163953Srrs	shift = (db->db_level - 1) * epbs;
262163953Srrs	dbstart = db->db_blkid << epbs;
263163953Srrs	start = blkid >> shift;
264163953Srrs	if (dbstart < start) {
265163953Srrs		bp += start - dbstart;
266163953Srrs	} else {
267163953Srrs		start = dbstart;
268163953Srrs	}
269163953Srrs	dbend = ((db->db_blkid + 1) << epbs) - 1;
270163953Srrs	end = (blkid + nblks - 1) >> shift;
271163953Srrs	if (dbend <= end)
272163953Srrs		end = dbend;
273163953Srrs
274163953Srrs	ASSERT3U(start, <=, end);
275163953Srrs
276165647Srrs	if (db->db_level == 1) {
277163953Srrs		FREE_VERIFY(db, start, end, tx);
278163953Srrs		free_blocks(dn, bp, end-start+1, tx);
279163953Srrs	} else {
280165647Srrs		for (i = start; i <= end; i++, bp++) {
281163953Srrs			if (BP_IS_HOLE(bp))
282165220Srrs				continue;
283163953Srrs			rw_enter(&dn->dn_struct_rwlock, RW_READER);
284165220Srrs			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
285163953Srrs			    i, B_TRUE, FTAG, &subdb));
286163953Srrs			rw_exit(&dn->dn_struct_rwlock);
287163953Srrs			ASSERT3P(bp, ==, subdb->db_blkptr);
288163953Srrs
289163953Srrs			free_children(subdb, blkid, nblks, tx);
290163953Srrs			dbuf_rele(subdb, FTAG);
291163953Srrs		}
292163953Srrs	}
293163953Srrs
294163953Srrs	/* If this whole block is free, free ourself too. */
295163953Srrs	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
296163953Srrs		if (!BP_IS_HOLE(bp))
297163953Srrs			break;
298163953Srrs	}
299163953Srrs	if (i == 1 << epbs) {
300163953Srrs		/* didn't find any non-holes */
301163953Srrs		bzero(db->db.db_data, db->db.db_size);
302163953Srrs		free_blocks(dn, db->db_blkptr, 1, tx);
303163953Srrs	} else {
304163953Srrs		/*
305163953Srrs		 * Partial block free; must be marked dirty so that it
306163953Srrs		 * will be written out.
307163953Srrs		 */
308163953Srrs		ASSERT(db->db_dirtycnt > 0);
309163953Srrs	}
310163953Srrs
311163953Srrs	DB_DNODE_EXIT(db);
312163953Srrs	arc_buf_freeze(db->db_buf);
313163953Srrs}
314163953Srrs
315163953Srrs/*
316163953Srrs * Traverse the indicated range of the provided file
317163953Srrs * and "free" all the blocks contained there.
318163953Srrs */
319163953Srrsstatic void
320163953Srrsdnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
321163953Srrs    dmu_tx_t *tx)
322163953Srrs{
323163953Srrs	blkptr_t *bp = dn->dn_phys->dn_blkptr;
324163953Srrs	int dnlevel = dn->dn_phys->dn_nlevels;
325163953Srrs	boolean_t trunc = B_FALSE;
326163953Srrs
327163953Srrs	if (blkid > dn->dn_phys->dn_maxblkid)
328163953Srrs		return;
329163953Srrs
330163953Srrs	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
331163953Srrs	if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
332163953Srrs		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
333163953Srrs		trunc = B_TRUE;
334163953Srrs	}
335163953Srrs
336163953Srrs	/* There are no indirect blocks in the object */
337163953Srrs	if (dnlevel == 1) {
338163953Srrs		if (blkid >= dn->dn_phys->dn_nblkptr) {
339163953Srrs			/* this range was never made persistent */
340163953Srrs			return;
341163953Srrs		}
342163953Srrs		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
343163953Srrs		free_blocks(dn, bp + blkid, nblks, tx);
344163953Srrs	} else {
345163953Srrs		int shift = (dnlevel - 1) *
346163953Srrs		    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
347163953Srrs		int start = blkid >> shift;
348163953Srrs		int end = (blkid + nblks - 1) >> shift;
349163953Srrs		dmu_buf_impl_t *db;
350163953Srrs
351163953Srrs		ASSERT(start < dn->dn_phys->dn_nblkptr);
352163953Srrs		bp += start;
353163953Srrs		for (int i = start; i <= end; i++, bp++) {
354163953Srrs			if (BP_IS_HOLE(bp))
355163953Srrs				continue;
356163953Srrs			rw_enter(&dn->dn_struct_rwlock, RW_READER);
357163953Srrs			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
358163953Srrs			    TRUE, FTAG, &db));
359163953Srrs			rw_exit(&dn->dn_struct_rwlock);
360163953Srrs
361163953Srrs			free_children(db, blkid, nblks, tx);
362163953Srrs			dbuf_rele(db, FTAG);
363163953Srrs		}
364163953Srrs	}
365163953Srrs
366163953Srrs	if (trunc) {
367163953Srrs		dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
368163953Srrs
369163953Srrs		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
370163953Srrs		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
371163953Srrs		ASSERT(off < dn->dn_phys->dn_maxblkid ||
372163953Srrs		    dn->dn_phys->dn_maxblkid == 0 ||
373163953Srrs		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
374163953Srrs	}
375163953Srrs}
376163953Srrs
377163953Srrstypedef struct dnode_sync_free_range_arg {
378163953Srrs	dnode_t *dsfra_dnode;
379163953Srrs	dmu_tx_t *dsfra_tx;
380163953Srrs} dnode_sync_free_range_arg_t;
381163953Srrs
382163953Srrsstatic void
383163953Srrsdnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
384163953Srrs{
385163953Srrs	dnode_sync_free_range_arg_t *dsfra = arg;
386163953Srrs	dnode_t *dn = dsfra->dsfra_dnode;
387163953Srrs
388163953Srrs	mutex_exit(&dn->dn_mtx);
389163953Srrs	dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
390163953Srrs	mutex_enter(&dn->dn_mtx);
391163953Srrs}
392163953Srrs
393163953Srrs/*
394163953Srrs * Try to kick all the dnode's dbufs out of the cache...
395163953Srrs */
396163953Srrsvoid
397163953Srrsdnode_evict_dbufs(dnode_t *dn)
398163953Srrs{
399163953Srrs	int progress;
400163953Srrs	int pass = 0;
401163953Srrs
402163953Srrs	do {
403163953Srrs		dmu_buf_impl_t *db, *db_next;
404163953Srrs		int evicting = FALSE;
405163953Srrs
406163953Srrs		progress = FALSE;
407163953Srrs		mutex_enter(&dn->dn_dbufs_mtx);
408163953Srrs		for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
409163953Srrs			db_next = AVL_NEXT(&dn->dn_dbufs, db);
410163953Srrs#ifdef	DEBUG
411163953Srrs			DB_DNODE_ENTER(db);
412163953Srrs			ASSERT3P(DB_DNODE(db), ==, dn);
413163953Srrs			DB_DNODE_EXIT(db);
414163953Srrs#endif	/* DEBUG */
415163953Srrs
416163953Srrs			mutex_enter(&db->db_mtx);
417163953Srrs			if (db->db_state == DB_EVICTING) {
418163953Srrs				progress = TRUE;
419163953Srrs				evicting = TRUE;
420163953Srrs				mutex_exit(&db->db_mtx);
421163953Srrs			} else if (refcount_is_zero(&db->db_holds)) {
422163953Srrs				progress = TRUE;
423163953Srrs				dbuf_clear(db); /* exits db_mtx for us */
424163953Srrs			} else {
425163953Srrs				mutex_exit(&db->db_mtx);
426163953Srrs			}
427163953Srrs
428163953Srrs		}
429163953Srrs		/*
430163953Srrs		 * NB: we need to drop dn_dbufs_mtx between passes so
431163953Srrs		 * that any DB_EVICTING dbufs can make progress.
432163953Srrs		 * Ideally, we would have some cv we could wait on, but
433163953Srrs		 * since we don't, just wait a bit to give the other
434163953Srrs		 * thread a chance to run.
435163953Srrs		 */
436163953Srrs		mutex_exit(&dn->dn_dbufs_mtx);
437163953Srrs		if (evicting)
438163953Srrs			delay(1);
439163953Srrs		pass++;
440163953Srrs		ASSERT(pass < 100); /* sanity check */
441163953Srrs	} while (progress);
442163953Srrs
443163953Srrs	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
444163953Srrs	if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
445163953Srrs		mutex_enter(&dn->dn_bonus->db_mtx);
446163953Srrs		dbuf_evict(dn->dn_bonus);
447163953Srrs		dn->dn_bonus = NULL;
448163953Srrs	}
449163953Srrs	rw_exit(&dn->dn_struct_rwlock);
450163953Srrs}
451163953Srrs
452163953Srrsstatic void
453163953Srrsdnode_undirty_dbufs(list_t *list)
454163953Srrs{
455163953Srrs	dbuf_dirty_record_t *dr;
456163953Srrs
457163953Srrs	while (dr = list_head(list)) {
458163953Srrs		dmu_buf_impl_t *db = dr->dr_dbuf;
459163953Srrs		uint64_t txg = dr->dr_txg;
460163953Srrs
461163953Srrs		if (db->db_level != 0)
462163953Srrs			dnode_undirty_dbufs(&dr->dt.di.dr_children);
463163953Srrs
464163953Srrs		mutex_enter(&db->db_mtx);
465163953Srrs		/* XXX - use dbuf_undirty()? */
466163953Srrs		list_remove(list, dr);
467163953Srrs		ASSERT(db->db_last_dirty == dr);
468163953Srrs		db->db_last_dirty = NULL;
469163953Srrs		db->db_dirtycnt -= 1;
470163953Srrs		if (db->db_level == 0) {
471163953Srrs			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
472163953Srrs			    dr->dt.dl.dr_data == db->db_buf);
473163953Srrs			dbuf_unoverride(dr);
474163953Srrs		} else {
475163953Srrs			mutex_destroy(&dr->dt.di.dr_mtx);
476163953Srrs			list_destroy(&dr->dt.di.dr_children);
477163953Srrs		}
478163953Srrs		kmem_free(dr, sizeof (dbuf_dirty_record_t));
479163953Srrs		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
480163953Srrs	}
481163953Srrs}
482163953Srrs
483163953Srrsstatic void
484163953Srrsdnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
485163953Srrs{
486163953Srrs	int txgoff = tx->tx_txg & TXG_MASK;
487163953Srrs
488163953Srrs	ASSERT(dmu_tx_is_syncing(tx));
489163953Srrs
490163953Srrs	/*
491163953Srrs	 * Our contents should have been freed in dnode_sync() by the
492163953Srrs	 * free range record inserted by the caller of dnode_free().
493163953Srrs	 */
494163953Srrs	ASSERT0(DN_USED_BYTES(dn->dn_phys));
495163953Srrs	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
496163953Srrs
497163953Srrs	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
498163953Srrs	dnode_evict_dbufs(dn);
499163953Srrs	ASSERT(avl_is_empty(&dn->dn_dbufs));
500163953Srrs	ASSERT3P(dn->dn_bonus, ==, NULL);
501163953Srrs
502165647Srrs	/*
503165647Srrs	 * XXX - It would be nice to assert this, but we may still
504163953Srrs	 * have residual holds from async evictions from the arc...
505163953Srrs	 *
506163953Srrs	 * zfs_obj_to_path() also depends on this being
507163953Srrs	 * commented out.
508163953Srrs	 *
509163953Srrs	 * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
510163953Srrs	 */
511163953Srrs
512163953Srrs	/* Undirty next bits */
513163953Srrs	dn->dn_next_nlevels[txgoff] = 0;
514163953Srrs	dn->dn_next_indblkshift[txgoff] = 0;
515163953Srrs	dn->dn_next_blksz[txgoff] = 0;
516163953Srrs
517163953Srrs	/* ASSERT(blkptrs are zero); */
518163953Srrs	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
519163953Srrs	ASSERT(dn->dn_type != DMU_OT_NONE);
520163953Srrs
521163953Srrs	ASSERT(dn->dn_free_txg > 0);
522163953Srrs	if (dn->dn_allocated_txg != dn->dn_free_txg)
523163953Srrs		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
524163953Srrs	bzero(dn->dn_phys, sizeof (dnode_phys_t));
525163953Srrs
526163953Srrs	mutex_enter(&dn->dn_mtx);
527163953Srrs	dn->dn_type = DMU_OT_NONE;
528163953Srrs	dn->dn_maxblkid = 0;
529163953Srrs	dn->dn_allocated_txg = 0;
530163953Srrs	dn->dn_free_txg = 0;
531163953Srrs	dn->dn_have_spill = B_FALSE;
532163953Srrs	mutex_exit(&dn->dn_mtx);
533163953Srrs
534163953Srrs	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
535163953Srrs
536163953Srrs	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
537163953Srrs	/*
538163953Srrs	 * Now that we've released our hold, the dnode may
539163953Srrs	 * be evicted, so we musn't access it.
540163953Srrs	 */
541163953Srrs}
542163953Srrs
543163953Srrs/*
544163953Srrs * Write out the dnode's dirty buffers.
545163953Srrs */
546163953Srrsvoid
547163953Srrsdnode_sync(dnode_t *dn, dmu_tx_t *tx)
548163953Srrs{
549163953Srrs	dnode_phys_t *dnp = dn->dn_phys;
550163953Srrs	int txgoff = tx->tx_txg & TXG_MASK;
551163953Srrs	list_t *list = &dn->dn_dirty_records[txgoff];
552163953Srrs	static const dnode_phys_t zerodn = { 0 };
553163953Srrs	boolean_t kill_spill = B_FALSE;
554163953Srrs
555163953Srrs	ASSERT(dmu_tx_is_syncing(tx));
556163953Srrs	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
557163953Srrs	ASSERT(dnp->dn_type != DMU_OT_NONE ||
558163953Srrs	    bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
559163953Srrs	DNODE_VERIFY(dn);
560163953Srrs
561163953Srrs	ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
562163953Srrs
563163953Srrs	if (dmu_objset_userused_enabled(dn->dn_objset) &&
564163953Srrs	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
565163953Srrs		mutex_enter(&dn->dn_mtx);
566163953Srrs		dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
567163953Srrs		dn->dn_oldflags = dn->dn_phys->dn_flags;
568163953Srrs		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
569163953Srrs		mutex_exit(&dn->dn_mtx);
570163953Srrs		dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
571163953Srrs	} else {
572163953Srrs		/* Once we account for it, we should always account for it. */
573163953Srrs		ASSERT(!(dn->dn_phys->dn_flags &
574163953Srrs		    DNODE_FLAG_USERUSED_ACCOUNTED));
575163953Srrs	}
576163953Srrs
577163953Srrs	mutex_enter(&dn->dn_mtx);
578163953Srrs	if (dn->dn_allocated_txg == tx->tx_txg) {
579163953Srrs		/* The dnode is newly allocated or reallocated */
580163953Srrs		if (dnp->dn_type == DMU_OT_NONE) {
581163953Srrs			/* this is a first alloc, not a realloc */
582163953Srrs			dnp->dn_nlevels = 1;
583163953Srrs			dnp->dn_nblkptr = dn->dn_nblkptr;
584163953Srrs		}
585163953Srrs
586163953Srrs		dnp->dn_type = dn->dn_type;
587163953Srrs		dnp->dn_bonustype = dn->dn_bonustype;
588163953Srrs		dnp->dn_bonuslen = dn->dn_bonuslen;
589163953Srrs	}
590163953Srrs	ASSERT(dnp->dn_nlevels > 1 ||
591163953Srrs	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
592163953Srrs	    BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
593163953Srrs	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
594163953Srrs	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
595163953Srrs	ASSERT(dnp->dn_nlevels < 2 ||
596163953Srrs	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
597163953Srrs	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
598163953Srrs
599163953Srrs	if (dn->dn_next_type[txgoff] != 0) {
600163953Srrs		dnp->dn_type = dn->dn_type;
601163953Srrs		dn->dn_next_type[txgoff] = 0;
602163953Srrs	}
603163953Srrs
604163953Srrs	if (dn->dn_next_blksz[txgoff] != 0) {
605163953Srrs		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
606163953Srrs		    SPA_MINBLOCKSIZE) == 0);
607163953Srrs		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
608163953Srrs		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
609163953Srrs		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
610163953Srrs		    dnp->dn_datablkszsec ||
611163953Srrs		    range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
612163953Srrs		dnp->dn_datablkszsec =
613163953Srrs		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
614163953Srrs		dn->dn_next_blksz[txgoff] = 0;
615163953Srrs	}
616163953Srrs
617163953Srrs	if (dn->dn_next_bonuslen[txgoff] != 0) {
618163953Srrs		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
619163953Srrs			dnp->dn_bonuslen = 0;
620163953Srrs		else
621163953Srrs			dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
622163953Srrs		ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
623163953Srrs		dn->dn_next_bonuslen[txgoff] = 0;
624163953Srrs	}
625163953Srrs
626163953Srrs	if (dn->dn_next_bonustype[txgoff] != 0) {
627163953Srrs		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
628163953Srrs		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
629163953Srrs		dn->dn_next_bonustype[txgoff] = 0;
630163953Srrs	}
631163953Srrs
632163953Srrs	boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
633163953Srrs	    dn->dn_free_txg <= tx->tx_txg;
634163953Srrs
635163953Srrs	/*
636163953Srrs	 * Remove the spill block if we have been explicitly asked to
637163953Srrs	 * remove it, or if the object is being removed.
638163953Srrs	 */
639163953Srrs	if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
640165220Srrs		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
641165220Srrs			kill_spill = B_TRUE;
642165220Srrs		dn->dn_rm_spillblk[txgoff] = 0;
643165220Srrs	}
644165220Srrs
645165220Srrs	if (dn->dn_next_indblkshift[txgoff] != 0) {
646165220Srrs		ASSERT(dnp->dn_nlevels == 1);
647165220Srrs		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
648165220Srrs		dn->dn_next_indblkshift[txgoff] = 0;
649165220Srrs	}
650165220Srrs
651165220Srrs	/*
652165220Srrs	 * Just take the live (open-context) values for checksum and compress.
653163953Srrs	 * Strictly speaking it's a future leak, but nothing bad happens if we
654163953Srrs	 * start using the new checksum or compress algorithm a little early.
655163953Srrs	 */
656163953Srrs	dnp->dn_checksum = dn->dn_checksum;
657163953Srrs	dnp->dn_compress = dn->dn_compress;
658163953Srrs
659163953Srrs	mutex_exit(&dn->dn_mtx);
660163953Srrs
661163953Srrs	if (kill_spill) {
662163953Srrs		free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
663163953Srrs		mutex_enter(&dn->dn_mtx);
664163953Srrs		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
665163953Srrs		mutex_exit(&dn->dn_mtx);
666163953Srrs	}
667163953Srrs
668163953Srrs	/* process all the "freed" ranges in the file */
669163953Srrs	if (dn->dn_free_ranges[txgoff] != NULL) {
670163953Srrs		dnode_sync_free_range_arg_t dsfra;
671163953Srrs		dsfra.dsfra_dnode = dn;
672163953Srrs		dsfra.dsfra_tx = tx;
673163953Srrs		mutex_enter(&dn->dn_mtx);
674163953Srrs		range_tree_vacate(dn->dn_free_ranges[txgoff],
675163953Srrs		    dnode_sync_free_range, &dsfra);
676163953Srrs		range_tree_destroy(dn->dn_free_ranges[txgoff]);
677163953Srrs		dn->dn_free_ranges[txgoff] = NULL;
678163953Srrs		mutex_exit(&dn->dn_mtx);
679163953Srrs	}
680163953Srrs
681163953Srrs	if (freeing_dnode) {
682163953Srrs		dnode_sync_free(dn, tx);
683163953Srrs		return;
684163953Srrs	}
685163953Srrs
686163953Srrs	if (dn->dn_next_nlevels[txgoff]) {
687163953Srrs		dnode_increase_indirection(dn, tx);
688163953Srrs		dn->dn_next_nlevels[txgoff] = 0;
689163953Srrs	}
690163953Srrs
691163953Srrs	if (dn->dn_next_nblkptr[txgoff]) {
692163953Srrs		/* this should only happen on a realloc */
693163953Srrs		ASSERT(dn->dn_allocated_txg == tx->tx_txg);
694163953Srrs		if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
695163953Srrs			/* zero the new blkptrs we are gaining */
696163953Srrs			bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
697163953Srrs			    sizeof (blkptr_t) *
698163953Srrs			    (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
699163953Srrs#ifdef ZFS_DEBUG
700163953Srrs		} else {
701163953Srrs			int i;
702163953Srrs			ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
703163953Srrs			/* the blkptrs we are losing better be unallocated */
704163953Srrs			for (i = dn->dn_next_nblkptr[txgoff];
705163953Srrs			    i < dnp->dn_nblkptr; i++)
706163953Srrs				ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
707163953Srrs#endif
708163953Srrs		}
709163953Srrs		mutex_enter(&dn->dn_mtx);
710163953Srrs		dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
711163953Srrs		dn->dn_next_nblkptr[txgoff] = 0;
712163953Srrs		mutex_exit(&dn->dn_mtx);
713163953Srrs	}
714163953Srrs
715163953Srrs	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
716163953Srrs
717163953Srrs	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
718165220Srrs		ASSERT3P(list_head(list), ==, NULL);
719163953Srrs		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
720163953Srrs	}
721163953Srrs
722163953Srrs	/*
723163953Srrs	 * Although we have dropped our reference to the dnode, it
724163953Srrs	 * can't be evicted until its written, and we haven't yet
725163953Srrs	 * initiated the IO for the dnode's dbuf.
726163953Srrs	 */
727163953Srrs}
728163953Srrs