dmu.c revision 226620
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23168404Spjd */
24168404Spjd
25168404Spjd#include <sys/dmu.h>
26168404Spjd#include <sys/dmu_impl.h>
27168404Spjd#include <sys/dmu_tx.h>
28168404Spjd#include <sys/dbuf.h>
29168404Spjd#include <sys/dnode.h>
30168404Spjd#include <sys/zfs_context.h>
31168404Spjd#include <sys/dmu_objset.h>
32168404Spjd#include <sys/dmu_traverse.h>
33168404Spjd#include <sys/dsl_dataset.h>
34168404Spjd#include <sys/dsl_dir.h>
35168404Spjd#include <sys/dsl_pool.h>
36168404Spjd#include <sys/dsl_synctask.h>
37168404Spjd#include <sys/dsl_prop.h>
38168404Spjd#include <sys/dmu_zfetch.h>
39168404Spjd#include <sys/zfs_ioctl.h>
40168404Spjd#include <sys/zap.h>
41168404Spjd#include <sys/zio_checksum.h>
42219089Spjd#include <sys/sa.h>
43219089Spjd#ifdef _KERNEL
44185029Spjd#include <sys/zfs_znode.h>
45219089Spjd#endif
46168404Spjd
47168404Spjdconst dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
48168404Spjd	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
49168404Spjd	{	zap_byteswap,		TRUE,	"object directory"	},
50168404Spjd	{	byteswap_uint64_array,	TRUE,	"object array"		},
51168404Spjd	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
52168404Spjd	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
53219089Spjd	{	byteswap_uint64_array,	TRUE,	"bpobj"			},
54219089Spjd	{	byteswap_uint64_array,	TRUE,	"bpobj header"		},
55168404Spjd	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
56168404Spjd	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
57168404Spjd	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
58168404Spjd	{	dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
59168404Spjd	{	dmu_objset_byteswap,	TRUE,	"DMU objset"		},
60168404Spjd	{	byteswap_uint64_array,	TRUE,	"DSL directory"		},
61168404Spjd	{	zap_byteswap,		TRUE,	"DSL directory child map"},
62168404Spjd	{	zap_byteswap,		TRUE,	"DSL dataset snap map"	},
63168404Spjd	{	zap_byteswap,		TRUE,	"DSL props"		},
64168404Spjd	{	byteswap_uint64_array,	TRUE,	"DSL dataset"		},
65168404Spjd	{	zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
66185029Spjd	{	zfs_oldacl_byteswap,	TRUE,	"ZFS V0 ACL"		},
67168404Spjd	{	byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
68168404Spjd	{	zap_byteswap,		TRUE,	"ZFS directory"		},
69168404Spjd	{	zap_byteswap,		TRUE,	"ZFS master node"	},
70168404Spjd	{	zap_byteswap,		TRUE,	"ZFS delete queue"	},
71168404Spjd	{	byteswap_uint8_array,	FALSE,	"zvol object"		},
72168404Spjd	{	zap_byteswap,		TRUE,	"zvol prop"		},
73168404Spjd	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
74168404Spjd	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
75168404Spjd	{	zap_byteswap,		TRUE,	"other ZAP"		},
76168404Spjd	{	zap_byteswap,		TRUE,	"persistent error log"	},
77168404Spjd	{	byteswap_uint8_array,	TRUE,	"SPA history"		},
78168404Spjd	{	byteswap_uint64_array,	TRUE,	"SPA history offsets"	},
79185029Spjd	{	zap_byteswap,		TRUE,	"Pool properties"	},
80185029Spjd	{	zap_byteswap,		TRUE,	"DSL permissions"	},
81185029Spjd	{	zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
82185029Spjd	{	byteswap_uint8_array,	TRUE,	"ZFS SYSACL"		},
83185029Spjd	{	byteswap_uint8_array,	TRUE,	"FUID table"		},
84185029Spjd	{	byteswap_uint64_array,	TRUE,	"FUID table size"	},
85185029Spjd	{	zap_byteswap,		TRUE,	"DSL dataset next clones"},
86219089Spjd	{	zap_byteswap,		TRUE,	"scan work queue"	},
87209962Smm	{	zap_byteswap,		TRUE,	"ZFS user/group used"	},
88209962Smm	{	zap_byteswap,		TRUE,	"ZFS user/group quota"	},
89219089Spjd	{	zap_byteswap,		TRUE,	"snapshot refcount tags"},
90219089Spjd	{	zap_byteswap,		TRUE,	"DDT ZAP algorithm"	},
91219089Spjd	{	zap_byteswap,		TRUE,	"DDT statistics"	},
92219089Spjd	{	byteswap_uint8_array,	TRUE,	"System attributes"	},
93219089Spjd	{	zap_byteswap,		TRUE,	"SA master node"	},
94219089Spjd	{	zap_byteswap,		TRUE,	"SA attr registration"	},
95219089Spjd	{	zap_byteswap,		TRUE,	"SA attr layouts"	},
96219089Spjd	{	zap_byteswap,		TRUE,	"scan translations"	},
97219089Spjd	{	byteswap_uint8_array,	FALSE,	"deduplicated block"	},
98219089Spjd	{	zap_byteswap,		TRUE,	"DSL deadlist map"	},
99219089Spjd	{	byteswap_uint64_array,	TRUE,	"DSL deadlist map hdr"	},
100219089Spjd	{	zap_byteswap,		TRUE,	"DSL dir clones"	},
101219089Spjd	{	byteswap_uint64_array,	TRUE,	"bpobj subobj"		},
102168404Spjd};
103168404Spjd
104168404Spjdint
105168404Spjddmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
106219089Spjd    void *tag, dmu_buf_t **dbp, int flags)
107168404Spjd{
108168404Spjd	dnode_t *dn;
109168404Spjd	uint64_t blkid;
110168404Spjd	dmu_buf_impl_t *db;
111168404Spjd	int err;
112219089Spjd	int db_flags = DB_RF_CANFAIL;
113168404Spjd
114219089Spjd	if (flags & DMU_READ_NO_PREFETCH)
115219089Spjd		db_flags |= DB_RF_NOPREFETCH;
116219089Spjd
117219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
118168404Spjd	if (err)
119168404Spjd		return (err);
120168404Spjd	blkid = dbuf_whichblock(dn, offset);
121168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
122168404Spjd	db = dbuf_hold(dn, blkid, tag);
123168404Spjd	rw_exit(&dn->dn_struct_rwlock);
124168404Spjd	if (db == NULL) {
125168404Spjd		err = EIO;
126168404Spjd	} else {
127219089Spjd		err = dbuf_read(db, NULL, db_flags);
128168404Spjd		if (err) {
129168404Spjd			dbuf_rele(db, tag);
130168404Spjd			db = NULL;
131168404Spjd		}
132168404Spjd	}
133168404Spjd
134168404Spjd	dnode_rele(dn, FTAG);
135219089Spjd	*dbp = &db->db; /* NULL db plus first field offset is NULL */
136168404Spjd	return (err);
137168404Spjd}
138168404Spjd
139168404Spjdint
140168404Spjddmu_bonus_max(void)
141168404Spjd{
142168404Spjd	return (DN_MAX_BONUSLEN);
143168404Spjd}
144168404Spjd
145185029Spjdint
146219089Spjddmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
147185029Spjd{
148219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
149219089Spjd	dnode_t *dn;
150219089Spjd	int error;
151185029Spjd
152219089Spjd	DB_DNODE_ENTER(db);
153219089Spjd	dn = DB_DNODE(db);
154219089Spjd
155219089Spjd	if (dn->dn_bonus != db) {
156219089Spjd		error = EINVAL;
157219089Spjd	} else if (newsize < 0 || newsize > db_fake->db_size) {
158219089Spjd		error = EINVAL;
159219089Spjd	} else {
160219089Spjd		dnode_setbonuslen(dn, newsize, tx);
161219089Spjd		error = 0;
162219089Spjd	}
163219089Spjd
164219089Spjd	DB_DNODE_EXIT(db);
165219089Spjd	return (error);
166185029Spjd}
167185029Spjd
168219089Spjdint
169219089Spjddmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
170219089Spjd{
171219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
172219089Spjd	dnode_t *dn;
173219089Spjd	int error;
174219089Spjd
175219089Spjd	DB_DNODE_ENTER(db);
176219089Spjd	dn = DB_DNODE(db);
177219089Spjd
178219089Spjd	if (type > DMU_OT_NUMTYPES) {
179219089Spjd		error = EINVAL;
180219089Spjd	} else if (dn->dn_bonus != db) {
181219089Spjd		error = EINVAL;
182219089Spjd	} else {
183219089Spjd		dnode_setbonus_type(dn, type, tx);
184219089Spjd		error = 0;
185219089Spjd	}
186219089Spjd
187219089Spjd	DB_DNODE_EXIT(db);
188219089Spjd	return (error);
189219089Spjd}
190219089Spjd
191219089Spjddmu_object_type_t
192219089Spjddmu_get_bonustype(dmu_buf_t *db_fake)
193219089Spjd{
194219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
195219089Spjd	dnode_t *dn;
196219089Spjd	dmu_object_type_t type;
197219089Spjd
198219089Spjd	DB_DNODE_ENTER(db);
199219089Spjd	dn = DB_DNODE(db);
200219089Spjd	type = dn->dn_bonustype;
201219089Spjd	DB_DNODE_EXIT(db);
202219089Spjd
203219089Spjd	return (type);
204219089Spjd}
205219089Spjd
206219089Spjdint
207219089Spjddmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
208219089Spjd{
209219089Spjd	dnode_t *dn;
210219089Spjd	int error;
211219089Spjd
212219089Spjd	error = dnode_hold(os, object, FTAG, &dn);
213219089Spjd	dbuf_rm_spill(dn, tx);
214219089Spjd	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
215219089Spjd	dnode_rm_spill(dn, tx);
216219089Spjd	rw_exit(&dn->dn_struct_rwlock);
217219089Spjd	dnode_rele(dn, FTAG);
218219089Spjd	return (error);
219219089Spjd}
220219089Spjd
221168404Spjd/*
222168404Spjd * returns ENOENT, EIO, or 0.
223168404Spjd */
224168404Spjdint
225168404Spjddmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
226168404Spjd{
227168404Spjd	dnode_t *dn;
228168404Spjd	dmu_buf_impl_t *db;
229185029Spjd	int error;
230168404Spjd
231219089Spjd	error = dnode_hold(os, object, FTAG, &dn);
232185029Spjd	if (error)
233185029Spjd		return (error);
234168404Spjd
235168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
236168404Spjd	if (dn->dn_bonus == NULL) {
237168404Spjd		rw_exit(&dn->dn_struct_rwlock);
238168404Spjd		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
239168404Spjd		if (dn->dn_bonus == NULL)
240185029Spjd			dbuf_create_bonus(dn);
241168404Spjd	}
242168404Spjd	db = dn->dn_bonus;
243185029Spjd
244185029Spjd	/* as long as the bonus buf is held, the dnode will be held */
245219089Spjd	if (refcount_add(&db->db_holds, tag) == 1) {
246185029Spjd		VERIFY(dnode_add_ref(dn, db));
247219089Spjd		(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
248219089Spjd	}
249185029Spjd
250219089Spjd	/*
251219089Spjd	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
252219089Spjd	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
253219089Spjd	 * a dnode hold for every dbuf.
254219089Spjd	 */
255219089Spjd	rw_exit(&dn->dn_struct_rwlock);
256219089Spjd
257168404Spjd	dnode_rele(dn, FTAG);
258168404Spjd
259219089Spjd	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
260168404Spjd
261168404Spjd	*dbp = &db->db;
262168404Spjd	return (0);
263168404Spjd}
264168404Spjd
265168404Spjd/*
266219089Spjd * returns ENOENT, EIO, or 0.
267219089Spjd *
268219089Spjd * This interface will allocate a blank spill dbuf when a spill blk
269219089Spjd * doesn't already exist on the dnode.
270219089Spjd *
271219089Spjd * if you only want to find an already existing spill db, then
272219089Spjd * dmu_spill_hold_existing() should be used.
273219089Spjd */
274219089Spjdint
275219089Spjddmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
276219089Spjd{
277219089Spjd	dmu_buf_impl_t *db = NULL;
278219089Spjd	int err;
279219089Spjd
280219089Spjd	if ((flags & DB_RF_HAVESTRUCT) == 0)
281219089Spjd		rw_enter(&dn->dn_struct_rwlock, RW_READER);
282219089Spjd
283219089Spjd	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
284219089Spjd
285219089Spjd	if ((flags & DB_RF_HAVESTRUCT) == 0)
286219089Spjd		rw_exit(&dn->dn_struct_rwlock);
287219089Spjd
288219089Spjd	ASSERT(db != NULL);
289219089Spjd	err = dbuf_read(db, NULL, flags);
290219089Spjd	if (err == 0)
291219089Spjd		*dbp = &db->db;
292219089Spjd	else
293219089Spjd		dbuf_rele(db, tag);
294219089Spjd	return (err);
295219089Spjd}
296219089Spjd
297219089Spjdint
298219089Spjddmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
299219089Spjd{
300219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
301219089Spjd	dnode_t *dn;
302219089Spjd	int err;
303219089Spjd
304219089Spjd	DB_DNODE_ENTER(db);
305219089Spjd	dn = DB_DNODE(db);
306219089Spjd
307219089Spjd	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
308219089Spjd		err = EINVAL;
309219089Spjd	} else {
310219089Spjd		rw_enter(&dn->dn_struct_rwlock, RW_READER);
311219089Spjd
312219089Spjd		if (!dn->dn_have_spill) {
313219089Spjd			err = ENOENT;
314219089Spjd		} else {
315219089Spjd			err = dmu_spill_hold_by_dnode(dn,
316219089Spjd			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
317219089Spjd		}
318219089Spjd
319219089Spjd		rw_exit(&dn->dn_struct_rwlock);
320219089Spjd	}
321219089Spjd
322219089Spjd	DB_DNODE_EXIT(db);
323219089Spjd	return (err);
324219089Spjd}
325219089Spjd
326219089Spjdint
327219089Spjddmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
328219089Spjd{
329219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
330219089Spjd	dnode_t *dn;
331219089Spjd	int err;
332219089Spjd
333219089Spjd	DB_DNODE_ENTER(db);
334219089Spjd	dn = DB_DNODE(db);
335219089Spjd	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
336219089Spjd	DB_DNODE_EXIT(db);
337219089Spjd
338219089Spjd	return (err);
339219089Spjd}
340219089Spjd
341219089Spjd/*
342168404Spjd * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
343168404Spjd * to take a held dnode rather than <os, object> -- the lookup is wasteful,
344168404Spjd * and can induce severe lock contention when writing to several files
345168404Spjd * whose dnodes are in the same block.
346168404Spjd */
347168404Spjdstatic int
348209962Smmdmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
349209962Smm    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
350168404Spjd{
351185029Spjd	dsl_pool_t *dp = NULL;
352168404Spjd	dmu_buf_t **dbp;
353168404Spjd	uint64_t blkid, nblks, i;
354209962Smm	uint32_t dbuf_flags;
355168404Spjd	int err;
356168404Spjd	zio_t *zio;
357185029Spjd	hrtime_t start;
358168404Spjd
359168404Spjd	ASSERT(length <= DMU_MAX_ACCESS);
360168404Spjd
361214378Smm	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
362209962Smm	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
363209962Smm		dbuf_flags |= DB_RF_NOPREFETCH;
364168404Spjd
365168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
366168404Spjd	if (dn->dn_datablkshift) {
367168404Spjd		int blkshift = dn->dn_datablkshift;
368168404Spjd		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
369168404Spjd		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
370168404Spjd	} else {
371168404Spjd		if (offset + length > dn->dn_datablksz) {
372168404Spjd			zfs_panic_recover("zfs: accessing past end of object "
373168404Spjd			    "%llx/%llx (size=%u access=%llu+%llu)",
374168404Spjd			    (longlong_t)dn->dn_objset->
375168404Spjd			    os_dsl_dataset->ds_object,
376168404Spjd			    (longlong_t)dn->dn_object, dn->dn_datablksz,
377168404Spjd			    (longlong_t)offset, (longlong_t)length);
378214378Smm			rw_exit(&dn->dn_struct_rwlock);
379168404Spjd			return (EIO);
380168404Spjd		}
381168404Spjd		nblks = 1;
382168404Spjd	}
383168404Spjd	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
384168404Spjd
385185029Spjd	if (dn->dn_objset->os_dsl_dataset)
386185029Spjd		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
387185029Spjd	if (dp && dsl_pool_sync_context(dp))
388185029Spjd		start = gethrtime();
389185029Spjd	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
390168404Spjd	blkid = dbuf_whichblock(dn, offset);
391168404Spjd	for (i = 0; i < nblks; i++) {
392168404Spjd		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
393168404Spjd		if (db == NULL) {
394168404Spjd			rw_exit(&dn->dn_struct_rwlock);
395168404Spjd			dmu_buf_rele_array(dbp, nblks, tag);
396168404Spjd			zio_nowait(zio);
397168404Spjd			return (EIO);
398168404Spjd		}
399168404Spjd		/* initiate async i/o */
400226620Spjd		if (read)
401209962Smm			(void) dbuf_read(db, zio, dbuf_flags);
402226620Spjd#ifdef _KERNEL
403226620Spjd		else
404226620Spjd			curthread->td_ru.ru_oublock++;
405226620Spjd#endif
406168404Spjd		dbp[i] = &db->db;
407168404Spjd	}
408168404Spjd	rw_exit(&dn->dn_struct_rwlock);
409168404Spjd
410168404Spjd	/* wait for async i/o */
411168404Spjd	err = zio_wait(zio);
412185029Spjd	/* track read overhead when we are in sync context */
413185029Spjd	if (dp && dsl_pool_sync_context(dp))
414185029Spjd		dp->dp_read_overhead += gethrtime() - start;
415168404Spjd	if (err) {
416168404Spjd		dmu_buf_rele_array(dbp, nblks, tag);
417168404Spjd		return (err);
418168404Spjd	}
419168404Spjd
420168404Spjd	/* wait for other io to complete */
421168404Spjd	if (read) {
422168404Spjd		for (i = 0; i < nblks; i++) {
423168404Spjd			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
424168404Spjd			mutex_enter(&db->db_mtx);
425168404Spjd			while (db->db_state == DB_READ ||
426168404Spjd			    db->db_state == DB_FILL)
427168404Spjd				cv_wait(&db->db_changed, &db->db_mtx);
428168404Spjd			if (db->db_state == DB_UNCACHED)
429168404Spjd				err = EIO;
430168404Spjd			mutex_exit(&db->db_mtx);
431168404Spjd			if (err) {
432168404Spjd				dmu_buf_rele_array(dbp, nblks, tag);
433168404Spjd				return (err);
434168404Spjd			}
435168404Spjd		}
436168404Spjd	}
437168404Spjd
438168404Spjd	*numbufsp = nblks;
439168404Spjd	*dbpp = dbp;
440168404Spjd	return (0);
441168404Spjd}
442168404Spjd
443168404Spjdstatic int
444168404Spjddmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
445168404Spjd    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
446168404Spjd{
447168404Spjd	dnode_t *dn;
448168404Spjd	int err;
449168404Spjd
450219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
451168404Spjd	if (err)
452168404Spjd		return (err);
453168404Spjd
454168404Spjd	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
455209962Smm	    numbufsp, dbpp, DMU_READ_PREFETCH);
456168404Spjd
457168404Spjd	dnode_rele(dn, FTAG);
458168404Spjd
459168404Spjd	return (err);
460168404Spjd}
461168404Spjd
462168404Spjdint
463219089Spjddmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
464168404Spjd    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
465168404Spjd{
466219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
467219089Spjd	dnode_t *dn;
468168404Spjd	int err;
469168404Spjd
470219089Spjd	DB_DNODE_ENTER(db);
471219089Spjd	dn = DB_DNODE(db);
472168404Spjd	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
473209962Smm	    numbufsp, dbpp, DMU_READ_PREFETCH);
474219089Spjd	DB_DNODE_EXIT(db);
475168404Spjd
476168404Spjd	return (err);
477168404Spjd}
478168404Spjd
479168404Spjdvoid
480168404Spjddmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
481168404Spjd{
482168404Spjd	int i;
483168404Spjd	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
484168404Spjd
485168404Spjd	if (numbufs == 0)
486168404Spjd		return;
487168404Spjd
488168404Spjd	for (i = 0; i < numbufs; i++) {
489168404Spjd		if (dbp[i])
490168404Spjd			dbuf_rele(dbp[i], tag);
491168404Spjd	}
492168404Spjd
493168404Spjd	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
494168404Spjd}
495168404Spjd
496168404Spjdvoid
497168404Spjddmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
498168404Spjd{
499168404Spjd	dnode_t *dn;
500168404Spjd	uint64_t blkid;
501168404Spjd	int nblks, i, err;
502168404Spjd
503194043Skmacy	if (zfs_prefetch_disable)
504168404Spjd		return;
505168404Spjd
506168404Spjd	if (len == 0) {  /* they're interested in the bonus buffer */
507219089Spjd		dn = DMU_META_DNODE(os);
508168404Spjd
509168404Spjd		if (object == 0 || object >= DN_MAX_OBJECT)
510168404Spjd			return;
511168404Spjd
512168404Spjd		rw_enter(&dn->dn_struct_rwlock, RW_READER);
513168404Spjd		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
514168404Spjd		dbuf_prefetch(dn, blkid);
515168404Spjd		rw_exit(&dn->dn_struct_rwlock);
516168404Spjd		return;
517168404Spjd	}
518168404Spjd
519168404Spjd	/*
520168404Spjd	 * XXX - Note, if the dnode for the requested object is not
521168404Spjd	 * already cached, we will do a *synchronous* read in the
522168404Spjd	 * dnode_hold() call.  The same is true for any indirects.
523168404Spjd	 */
524219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
525168404Spjd	if (err != 0)
526168404Spjd		return;
527168404Spjd
528168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
529168404Spjd	if (dn->dn_datablkshift) {
530168404Spjd		int blkshift = dn->dn_datablkshift;
531168404Spjd		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
532168404Spjd		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
533168404Spjd	} else {
534168404Spjd		nblks = (offset < dn->dn_datablksz);
535168404Spjd	}
536168404Spjd
537168404Spjd	if (nblks != 0) {
538168404Spjd		blkid = dbuf_whichblock(dn, offset);
539168404Spjd		for (i = 0; i < nblks; i++)
540168404Spjd			dbuf_prefetch(dn, blkid+i);
541168404Spjd	}
542168404Spjd
543168404Spjd	rw_exit(&dn->dn_struct_rwlock);
544168404Spjd
545168404Spjd	dnode_rele(dn, FTAG);
546168404Spjd}
547168404Spjd
548208775Smm/*
549208775Smm * Get the next "chunk" of file data to free.  We traverse the file from
550208775Smm * the end so that the file gets shorter over time (if we crashes in the
551208775Smm * middle, this will leave us in a better state).  We find allocated file
552208775Smm * data by simply searching the allocated level 1 indirects.
553208775Smm */
554185029Spjdstatic int
555208775Smmget_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
556185029Spjd{
557208775Smm	uint64_t len = *start - limit;
558208775Smm	uint64_t blkcnt = 0;
559208775Smm	uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
560208775Smm	uint64_t iblkrange =
561185029Spjd	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
562185029Spjd
563208775Smm	ASSERT(limit <= *start);
564185029Spjd
565208775Smm	if (len <= iblkrange * maxblks) {
566208775Smm		*start = limit;
567185029Spjd		return (0);
568185029Spjd	}
569208775Smm	ASSERT(ISP2(iblkrange));
570185029Spjd
571208775Smm	while (*start > limit && blkcnt < maxblks) {
572185029Spjd		int err;
573185029Spjd
574208775Smm		/* find next allocated L1 indirect */
575185029Spjd		err = dnode_next_offset(dn,
576208775Smm		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
577185029Spjd
578208775Smm		/* if there are no more, then we are done */
579208775Smm		if (err == ESRCH) {
580208775Smm			*start = limit;
581185029Spjd			return (0);
582208775Smm		} else if (err) {
583208775Smm			return (err);
584185029Spjd		}
585208775Smm		blkcnt += 1;
586185029Spjd
587208775Smm		/* reset offset to end of "next" block back */
588208775Smm		*start = P2ALIGN(*start, iblkrange);
589208775Smm		if (*start <= limit)
590208775Smm			*start = limit;
591208775Smm		else
592208775Smm			*start -= 1;
593185029Spjd	}
594185029Spjd	return (0);
595185029Spjd}
596185029Spjd
597185029Spjdstatic int
598185029Spjddmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
599185029Spjd    uint64_t length, boolean_t free_dnode)
600185029Spjd{
601185029Spjd	dmu_tx_t *tx;
602185029Spjd	uint64_t object_size, start, end, len;
603185029Spjd	boolean_t trunc = (length == DMU_OBJECT_END);
604185029Spjd	int align, err;
605185029Spjd
606185029Spjd	align = 1 << dn->dn_datablkshift;
607185029Spjd	ASSERT(align > 0);
608185029Spjd	object_size = align == 1 ? dn->dn_datablksz :
609185029Spjd	    (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
610185029Spjd
611209962Smm	end = offset + length;
612209962Smm	if (trunc || end > object_size)
613185029Spjd		end = object_size;
614185029Spjd	if (end <= offset)
615185029Spjd		return (0);
616185029Spjd	length = end - offset;
617185029Spjd
618185029Spjd	while (length) {
619185029Spjd		start = end;
620209962Smm		/* assert(offset <= start) */
621185029Spjd		err = get_next_chunk(dn, &start, offset);
622185029Spjd		if (err)
623185029Spjd			return (err);
624185029Spjd		len = trunc ? DMU_OBJECT_END : end - start;
625185029Spjd
626185029Spjd		tx = dmu_tx_create(os);
627185029Spjd		dmu_tx_hold_free(tx, dn->dn_object, start, len);
628185029Spjd		err = dmu_tx_assign(tx, TXG_WAIT);
629185029Spjd		if (err) {
630185029Spjd			dmu_tx_abort(tx);
631185029Spjd			return (err);
632185029Spjd		}
633185029Spjd
634185029Spjd		dnode_free_range(dn, start, trunc ? -1 : len, tx);
635185029Spjd
636185029Spjd		if (start == 0 && free_dnode) {
637185029Spjd			ASSERT(trunc);
638185029Spjd			dnode_free(dn, tx);
639185029Spjd		}
640185029Spjd
641185029Spjd		length -= end - start;
642185029Spjd
643185029Spjd		dmu_tx_commit(tx);
644185029Spjd		end = start;
645185029Spjd	}
646185029Spjd	return (0);
647185029Spjd}
648185029Spjd
649168404Spjdint
650185029Spjddmu_free_long_range(objset_t *os, uint64_t object,
651185029Spjd    uint64_t offset, uint64_t length)
652185029Spjd{
653185029Spjd	dnode_t *dn;
654185029Spjd	int err;
655185029Spjd
656219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
657185029Spjd	if (err != 0)
658185029Spjd		return (err);
659185029Spjd	err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
660185029Spjd	dnode_rele(dn, FTAG);
661185029Spjd	return (err);
662185029Spjd}
663185029Spjd
664185029Spjdint
665185029Spjddmu_free_object(objset_t *os, uint64_t object)
666185029Spjd{
667185029Spjd	dnode_t *dn;
668185029Spjd	dmu_tx_t *tx;
669185029Spjd	int err;
670185029Spjd
671219089Spjd	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
672185029Spjd	    FTAG, &dn);
673185029Spjd	if (err != 0)
674185029Spjd		return (err);
675185029Spjd	if (dn->dn_nlevels == 1) {
676185029Spjd		tx = dmu_tx_create(os);
677185029Spjd		dmu_tx_hold_bonus(tx, object);
678185029Spjd		dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
679185029Spjd		err = dmu_tx_assign(tx, TXG_WAIT);
680185029Spjd		if (err == 0) {
681185029Spjd			dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
682185029Spjd			dnode_free(dn, tx);
683185029Spjd			dmu_tx_commit(tx);
684185029Spjd		} else {
685185029Spjd			dmu_tx_abort(tx);
686185029Spjd		}
687185029Spjd	} else {
688185029Spjd		err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
689185029Spjd	}
690185029Spjd	dnode_rele(dn, FTAG);
691185029Spjd	return (err);
692185029Spjd}
693185029Spjd
694185029Spjdint
695168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
696168404Spjd    uint64_t size, dmu_tx_t *tx)
697168404Spjd{
698168404Spjd	dnode_t *dn;
699219089Spjd	int err = dnode_hold(os, object, FTAG, &dn);
700168404Spjd	if (err)
701168404Spjd		return (err);
702168404Spjd	ASSERT(offset < UINT64_MAX);
703168404Spjd	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
704168404Spjd	dnode_free_range(dn, offset, size, tx);
705168404Spjd	dnode_rele(dn, FTAG);
706168404Spjd	return (0);
707168404Spjd}
708168404Spjd
709168404Spjdint
710168404Spjddmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
711209962Smm    void *buf, uint32_t flags)
712168404Spjd{
713168404Spjd	dnode_t *dn;
714168404Spjd	dmu_buf_t **dbp;
715214378Smm	int numbufs, err;
716168404Spjd
717219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
718168404Spjd	if (err)
719168404Spjd		return (err);
720168404Spjd
721168404Spjd	/*
722168404Spjd	 * Deal with odd block sizes, where there can't be data past the first
723168404Spjd	 * block.  If we ever do the tail block optimization, we will need to
724168404Spjd	 * handle that here as well.
725168404Spjd	 */
726214378Smm	if (dn->dn_maxblkid == 0) {
727168404Spjd		int newsz = offset > dn->dn_datablksz ? 0 :
728168404Spjd		    MIN(size, dn->dn_datablksz - offset);
729168404Spjd		bzero((char *)buf + newsz, size - newsz);
730168404Spjd		size = newsz;
731168404Spjd	}
732168404Spjd
733168404Spjd	while (size > 0) {
734168404Spjd		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
735214378Smm		int i;
736168404Spjd
737168404Spjd		/*
738168404Spjd		 * NB: we could do this block-at-a-time, but it's nice
739168404Spjd		 * to be reading in parallel.
740168404Spjd		 */
741168404Spjd		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
742209962Smm		    TRUE, FTAG, &numbufs, &dbp, flags);
743168404Spjd		if (err)
744185029Spjd			break;
745168404Spjd
746168404Spjd		for (i = 0; i < numbufs; i++) {
747168404Spjd			int tocpy;
748168404Spjd			int bufoff;
749168404Spjd			dmu_buf_t *db = dbp[i];
750168404Spjd
751168404Spjd			ASSERT(size > 0);
752168404Spjd
753168404Spjd			bufoff = offset - db->db_offset;
754168404Spjd			tocpy = (int)MIN(db->db_size - bufoff, size);
755168404Spjd
756168404Spjd			bcopy((char *)db->db_data + bufoff, buf, tocpy);
757168404Spjd
758168404Spjd			offset += tocpy;
759168404Spjd			size -= tocpy;
760168404Spjd			buf = (char *)buf + tocpy;
761168404Spjd		}
762168404Spjd		dmu_buf_rele_array(dbp, numbufs, FTAG);
763168404Spjd	}
764168404Spjd	dnode_rele(dn, FTAG);
765185029Spjd	return (err);
766168404Spjd}
767168404Spjd
768168404Spjdvoid
769168404Spjddmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
770168404Spjd    const void *buf, dmu_tx_t *tx)
771168404Spjd{
772168404Spjd	dmu_buf_t **dbp;
773168404Spjd	int numbufs, i;
774168404Spjd
775168404Spjd	if (size == 0)
776168404Spjd		return;
777168404Spjd
778168404Spjd	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
779168404Spjd	    FALSE, FTAG, &numbufs, &dbp));
780168404Spjd
781168404Spjd	for (i = 0; i < numbufs; i++) {
782168404Spjd		int tocpy;
783168404Spjd		int bufoff;
784168404Spjd		dmu_buf_t *db = dbp[i];
785168404Spjd
786168404Spjd		ASSERT(size > 0);
787168404Spjd
788168404Spjd		bufoff = offset - db->db_offset;
789168404Spjd		tocpy = (int)MIN(db->db_size - bufoff, size);
790168404Spjd
791168404Spjd		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
792168404Spjd
793168404Spjd		if (tocpy == db->db_size)
794168404Spjd			dmu_buf_will_fill(db, tx);
795168404Spjd		else
796168404Spjd			dmu_buf_will_dirty(db, tx);
797168404Spjd
798168404Spjd		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
799168404Spjd
800168404Spjd		if (tocpy == db->db_size)
801168404Spjd			dmu_buf_fill_done(db, tx);
802168404Spjd
803168404Spjd		offset += tocpy;
804168404Spjd		size -= tocpy;
805168404Spjd		buf = (char *)buf + tocpy;
806168404Spjd	}
807168404Spjd	dmu_buf_rele_array(dbp, numbufs, FTAG);
808168404Spjd}
809168404Spjd
810219089Spjdvoid
811219089Spjddmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
812219089Spjd    dmu_tx_t *tx)
813219089Spjd{
814219089Spjd	dmu_buf_t **dbp;
815219089Spjd	int numbufs, i;
816219089Spjd
817219089Spjd	if (size == 0)
818219089Spjd		return;
819219089Spjd
820219089Spjd	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
821219089Spjd	    FALSE, FTAG, &numbufs, &dbp));
822219089Spjd
823219089Spjd	for (i = 0; i < numbufs; i++) {
824219089Spjd		dmu_buf_t *db = dbp[i];
825219089Spjd
826219089Spjd		dmu_buf_will_not_fill(db, tx);
827219089Spjd	}
828219089Spjd	dmu_buf_rele_array(dbp, numbufs, FTAG);
829219089Spjd}
830219089Spjd
831219089Spjd/*
832219089Spjd * DMU support for xuio
833219089Spjd */
834219089Spjdkstat_t *xuio_ksp = NULL;
835219089Spjd
836219089Spjdint
837219089Spjddmu_xuio_init(xuio_t *xuio, int nblk)
838219089Spjd{
839219089Spjd	dmu_xuio_t *priv;
840219089Spjd	uio_t *uio = &xuio->xu_uio;
841219089Spjd
842219089Spjd	uio->uio_iovcnt = nblk;
843219089Spjd	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
844219089Spjd
845219089Spjd	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
846219089Spjd	priv->cnt = nblk;
847219089Spjd	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
848219089Spjd	priv->iovp = uio->uio_iov;
849219089Spjd	XUIO_XUZC_PRIV(xuio) = priv;
850219089Spjd
851219089Spjd	if (XUIO_XUZC_RW(xuio) == UIO_READ)
852219089Spjd		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
853219089Spjd	else
854219089Spjd		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
855219089Spjd
856219089Spjd	return (0);
857219089Spjd}
858219089Spjd
859219089Spjdvoid
860219089Spjddmu_xuio_fini(xuio_t *xuio)
861219089Spjd{
862219089Spjd	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
863219089Spjd	int nblk = priv->cnt;
864219089Spjd
865219089Spjd	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
866219089Spjd	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
867219089Spjd	kmem_free(priv, sizeof (dmu_xuio_t));
868219089Spjd
869219089Spjd	if (XUIO_XUZC_RW(xuio) == UIO_READ)
870219089Spjd		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
871219089Spjd	else
872219089Spjd		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
873219089Spjd}
874219089Spjd
875219089Spjd/*
876219089Spjd * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
877219089Spjd * and increase priv->next by 1.
878219089Spjd */
879219089Spjdint
880219089Spjddmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
881219089Spjd{
882219089Spjd	struct iovec *iov;
883219089Spjd	uio_t *uio = &xuio->xu_uio;
884219089Spjd	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
885219089Spjd	int i = priv->next++;
886219089Spjd
887219089Spjd	ASSERT(i < priv->cnt);
888219089Spjd	ASSERT(off + n <= arc_buf_size(abuf));
889219089Spjd	iov = uio->uio_iov + i;
890219089Spjd	iov->iov_base = (char *)abuf->b_data + off;
891219089Spjd	iov->iov_len = n;
892219089Spjd	priv->bufs[i] = abuf;
893219089Spjd	return (0);
894219089Spjd}
895219089Spjd
896219089Spjdint
897219089Spjddmu_xuio_cnt(xuio_t *xuio)
898219089Spjd{
899219089Spjd	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
900219089Spjd	return (priv->cnt);
901219089Spjd}
902219089Spjd
903219089Spjdarc_buf_t *
904219089Spjddmu_xuio_arcbuf(xuio_t *xuio, int i)
905219089Spjd{
906219089Spjd	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
907219089Spjd
908219089Spjd	ASSERT(i < priv->cnt);
909219089Spjd	return (priv->bufs[i]);
910219089Spjd}
911219089Spjd
912219089Spjdvoid
913219089Spjddmu_xuio_clear(xuio_t *xuio, int i)
914219089Spjd{
915219089Spjd	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
916219089Spjd
917219089Spjd	ASSERT(i < priv->cnt);
918219089Spjd	priv->bufs[i] = NULL;
919219089Spjd}
920219089Spjd
921219089Spjdstatic void
922219089Spjdxuio_stat_init(void)
923219089Spjd{
924219089Spjd	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
925219089Spjd	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
926219089Spjd	    KSTAT_FLAG_VIRTUAL);
927219089Spjd	if (xuio_ksp != NULL) {
928219089Spjd		xuio_ksp->ks_data = &xuio_stats;
929219089Spjd		kstat_install(xuio_ksp);
930219089Spjd	}
931219089Spjd}
932219089Spjd
933219089Spjdstatic void
934219089Spjdxuio_stat_fini(void)
935219089Spjd{
936219089Spjd	if (xuio_ksp != NULL) {
937219089Spjd		kstat_delete(xuio_ksp);
938219089Spjd		xuio_ksp = NULL;
939219089Spjd	}
940219089Spjd}
941219089Spjd
942219089Spjdvoid
943219089Spjdxuio_stat_wbuf_copied()
944219089Spjd{
945219089Spjd	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
946219089Spjd}
947219089Spjd
948219089Spjdvoid
949219089Spjdxuio_stat_wbuf_nocopy()
950219089Spjd{
951219089Spjd	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
952219089Spjd}
953219089Spjd
954168404Spjd#ifdef _KERNEL
955168404Spjdint
956168404Spjddmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
957168404Spjd{
958168404Spjd	dmu_buf_t **dbp;
959168404Spjd	int numbufs, i, err;
960219089Spjd	xuio_t *xuio = NULL;
961168404Spjd
962168404Spjd	/*
963168404Spjd	 * NB: we could do this block-at-a-time, but it's nice
964168404Spjd	 * to be reading in parallel.
965168404Spjd	 */
966168404Spjd	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
967168404Spjd	    &numbufs, &dbp);
968168404Spjd	if (err)
969168404Spjd		return (err);
970168404Spjd
971219089Spjd#ifdef UIO_XUIO
972219089Spjd	if (uio->uio_extflg == UIO_XUIO)
973219089Spjd		xuio = (xuio_t *)uio;
974219089Spjd#endif
975219089Spjd
976168404Spjd	for (i = 0; i < numbufs; i++) {
977168404Spjd		int tocpy;
978168404Spjd		int bufoff;
979168404Spjd		dmu_buf_t *db = dbp[i];
980168404Spjd
981168404Spjd		ASSERT(size > 0);
982168404Spjd
983168404Spjd		bufoff = uio->uio_loffset - db->db_offset;
984168404Spjd		tocpy = (int)MIN(db->db_size - bufoff, size);
985168404Spjd
986219089Spjd		if (xuio) {
987219089Spjd			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
988219089Spjd			arc_buf_t *dbuf_abuf = dbi->db_buf;
989219089Spjd			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
990219089Spjd			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
991219089Spjd			if (!err) {
992219089Spjd				uio->uio_resid -= tocpy;
993219089Spjd				uio->uio_loffset += tocpy;
994219089Spjd			}
995219089Spjd
996219089Spjd			if (abuf == dbuf_abuf)
997219089Spjd				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
998219089Spjd			else
999219089Spjd				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
1000219089Spjd		} else {
1001219089Spjd			err = uiomove((char *)db->db_data + bufoff, tocpy,
1002219089Spjd			    UIO_READ, uio);
1003219089Spjd		}
1004168404Spjd		if (err)
1005168404Spjd			break;
1006168404Spjd
1007168404Spjd		size -= tocpy;
1008168404Spjd	}
1009168404Spjd	dmu_buf_rele_array(dbp, numbufs, FTAG);
1010168404Spjd
1011168404Spjd	return (err);
1012168404Spjd}
1013168404Spjd
1014219089Spjdstatic int
1015219089Spjddmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
1016168404Spjd{
1017168404Spjd	dmu_buf_t **dbp;
1018219089Spjd	int numbufs;
1019168404Spjd	int err = 0;
1020219089Spjd	int i;
1021168404Spjd
1022219089Spjd	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1023219089Spjd	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
1024168404Spjd	if (err)
1025168404Spjd		return (err);
1026168404Spjd
1027168404Spjd	for (i = 0; i < numbufs; i++) {
1028168404Spjd		int tocpy;
1029168404Spjd		int bufoff;
1030168404Spjd		dmu_buf_t *db = dbp[i];
1031168404Spjd
1032168404Spjd		ASSERT(size > 0);
1033168404Spjd
1034168404Spjd		bufoff = uio->uio_loffset - db->db_offset;
1035168404Spjd		tocpy = (int)MIN(db->db_size - bufoff, size);
1036168404Spjd
1037168404Spjd		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1038168404Spjd
1039168404Spjd		if (tocpy == db->db_size)
1040168404Spjd			dmu_buf_will_fill(db, tx);
1041168404Spjd		else
1042168404Spjd			dmu_buf_will_dirty(db, tx);
1043168404Spjd
1044168404Spjd		/*
1045168404Spjd		 * XXX uiomove could block forever (eg. nfs-backed
1046168404Spjd		 * pages).  There needs to be a uiolockdown() function
1047168404Spjd		 * to lock the pages in memory, so that uiomove won't
1048168404Spjd		 * block.
1049168404Spjd		 */
1050168404Spjd		err = uiomove((char *)db->db_data + bufoff, tocpy,
1051168404Spjd		    UIO_WRITE, uio);
1052168404Spjd
1053168404Spjd		if (tocpy == db->db_size)
1054168404Spjd			dmu_buf_fill_done(db, tx);
1055168404Spjd
1056168404Spjd		if (err)
1057168404Spjd			break;
1058168404Spjd
1059168404Spjd		size -= tocpy;
1060168404Spjd	}
1061219089Spjd
1062168404Spjd	dmu_buf_rele_array(dbp, numbufs, FTAG);
1063168404Spjd	return (err);
1064168404Spjd}
1065168404Spjd
1066168404Spjdint
1067219089Spjddmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
1068219089Spjd    dmu_tx_t *tx)
1069219089Spjd{
1070219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1071219089Spjd	dnode_t *dn;
1072219089Spjd	int err;
1073219089Spjd
1074219089Spjd	if (size == 0)
1075219089Spjd		return (0);
1076219089Spjd
1077219089Spjd	DB_DNODE_ENTER(db);
1078219089Spjd	dn = DB_DNODE(db);
1079219089Spjd	err = dmu_write_uio_dnode(dn, uio, size, tx);
1080219089Spjd	DB_DNODE_EXIT(db);
1081219089Spjd
1082219089Spjd	return (err);
1083219089Spjd}
1084219089Spjd
1085219089Spjdint
1086219089Spjddmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
1087219089Spjd    dmu_tx_t *tx)
1088219089Spjd{
1089219089Spjd	dnode_t *dn;
1090219089Spjd	int err;
1091219089Spjd
1092219089Spjd	if (size == 0)
1093219089Spjd		return (0);
1094219089Spjd
1095219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
1096219089Spjd	if (err)
1097219089Spjd		return (err);
1098219089Spjd
1099219089Spjd	err = dmu_write_uio_dnode(dn, uio, size, tx);
1100219089Spjd
1101219089Spjd	dnode_rele(dn, FTAG);
1102219089Spjd
1103219089Spjd	return (err);
1104219089Spjd}
1105219089Spjd
1106219089Spjd#ifdef sun
1107219089Spjdint
1108168404Spjddmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1109168404Spjd    page_t *pp, dmu_tx_t *tx)
1110168404Spjd{
1111168404Spjd	dmu_buf_t **dbp;
1112168404Spjd	int numbufs, i;
1113168404Spjd	int err;
1114168404Spjd
1115168404Spjd	if (size == 0)
1116168404Spjd		return (0);
1117168404Spjd
1118168404Spjd	err = dmu_buf_hold_array(os, object, offset, size,
1119168404Spjd	    FALSE, FTAG, &numbufs, &dbp);
1120168404Spjd	if (err)
1121168404Spjd		return (err);
1122168404Spjd
1123168404Spjd	for (i = 0; i < numbufs; i++) {
1124168404Spjd		int tocpy, copied, thiscpy;
1125168404Spjd		int bufoff;
1126168404Spjd		dmu_buf_t *db = dbp[i];
1127168404Spjd		caddr_t va;
1128168404Spjd
1129168404Spjd		ASSERT(size > 0);
1130168404Spjd		ASSERT3U(db->db_size, >=, PAGESIZE);
1131168404Spjd
1132168404Spjd		bufoff = offset - db->db_offset;
1133168404Spjd		tocpy = (int)MIN(db->db_size - bufoff, size);
1134168404Spjd
1135168404Spjd		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1136168404Spjd
1137168404Spjd		if (tocpy == db->db_size)
1138168404Spjd			dmu_buf_will_fill(db, tx);
1139168404Spjd		else
1140168404Spjd			dmu_buf_will_dirty(db, tx);
1141168404Spjd
1142168404Spjd		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
1143168404Spjd			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
1144168404Spjd			thiscpy = MIN(PAGESIZE, tocpy - copied);
1145185029Spjd			va = zfs_map_page(pp, S_READ);
1146168404Spjd			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
1147185029Spjd			zfs_unmap_page(pp, va);
1148168404Spjd			pp = pp->p_next;
1149168404Spjd			bufoff += PAGESIZE;
1150168404Spjd		}
1151168404Spjd
1152168404Spjd		if (tocpy == db->db_size)
1153168404Spjd			dmu_buf_fill_done(db, tx);
1154168404Spjd
1155168404Spjd		offset += tocpy;
1156168404Spjd		size -= tocpy;
1157168404Spjd	}
1158168404Spjd	dmu_buf_rele_array(dbp, numbufs, FTAG);
1159168404Spjd	return (err);
1160168404Spjd}
1161219089Spjd#endif	/* sun */
1162219089Spjd#endif
1163168404Spjd
1164209962Smm/*
1165209962Smm * Allocate a loaned anonymous arc buffer.
1166209962Smm */
1167209962Smmarc_buf_t *
1168209962Smmdmu_request_arcbuf(dmu_buf_t *handle, int size)
1169209962Smm{
1170219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1171219089Spjd	spa_t *spa;
1172209962Smm
1173219089Spjd	DB_GET_SPA(&spa, db);
1174219089Spjd	return (arc_loan_buf(spa, size));
1175209962Smm}
1176209962Smm
1177209962Smm/*
1178209962Smm * Free a loaned arc buffer.
1179209962Smm */
1180209962Smmvoid
1181209962Smmdmu_return_arcbuf(arc_buf_t *buf)
1182209962Smm{
1183209962Smm	arc_return_buf(buf, FTAG);
1184209962Smm	VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
1185209962Smm}
1186209962Smm
1187209962Smm/*
1188209962Smm * When possible directly assign passed loaned arc buffer to a dbuf.
1189209962Smm * If this is not possible copy the contents of passed arc buf via
1190209962Smm * dmu_write().
1191209962Smm */
1192209962Smmvoid
1193209962Smmdmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1194209962Smm    dmu_tx_t *tx)
1195209962Smm{
1196219089Spjd	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
1197219089Spjd	dnode_t *dn;
1198209962Smm	dmu_buf_impl_t *db;
1199209962Smm	uint32_t blksz = (uint32_t)arc_buf_size(buf);
1200209962Smm	uint64_t blkid;
1201209962Smm
1202219089Spjd	DB_DNODE_ENTER(dbuf);
1203219089Spjd	dn = DB_DNODE(dbuf);
1204209962Smm	rw_enter(&dn->dn_struct_rwlock, RW_READER);
1205209962Smm	blkid = dbuf_whichblock(dn, offset);
1206209962Smm	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1207209962Smm	rw_exit(&dn->dn_struct_rwlock);
1208219089Spjd	DB_DNODE_EXIT(dbuf);
1209209962Smm
1210209962Smm	if (offset == db->db.db_offset && blksz == db->db.db_size) {
1211209962Smm		dbuf_assign_arcbuf(db, buf, tx);
1212209962Smm		dbuf_rele(db, FTAG);
1213209962Smm	} else {
1214219089Spjd		objset_t *os;
1215219089Spjd		uint64_t object;
1216219089Spjd
1217219089Spjd		DB_DNODE_ENTER(dbuf);
1218219089Spjd		dn = DB_DNODE(dbuf);
1219219089Spjd		os = dn->dn_objset;
1220219089Spjd		object = dn->dn_object;
1221219089Spjd		DB_DNODE_EXIT(dbuf);
1222219089Spjd
1223209962Smm		dbuf_rele(db, FTAG);
1224219089Spjd		dmu_write(os, object, offset, blksz, buf->b_data, tx);
1225209962Smm		dmu_return_arcbuf(buf);
1226219089Spjd		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1227209962Smm	}
1228209962Smm}
1229209962Smm
1230168404Spjdtypedef struct {
1231219089Spjd	dbuf_dirty_record_t	*dsa_dr;
1232219089Spjd	dmu_sync_cb_t		*dsa_done;
1233219089Spjd	zgd_t			*dsa_zgd;
1234219089Spjd	dmu_tx_t		*dsa_tx;
1235168404Spjd} dmu_sync_arg_t;
1236168404Spjd
1237168404Spjd/* ARGSUSED */
1238168404Spjdstatic void
1239185029Spjddmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1240185029Spjd{
1241219089Spjd	dmu_sync_arg_t *dsa = varg;
1242219089Spjd	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1243185029Spjd	blkptr_t *bp = zio->io_bp;
1244185029Spjd
1245219089Spjd	if (zio->io_error == 0) {
1246219089Spjd		if (BP_IS_HOLE(bp)) {
1247219089Spjd			/*
1248219089Spjd			 * A block of zeros may compress to a hole, but the
1249219089Spjd			 * block size still needs to be known for replay.
1250219089Spjd			 */
1251219089Spjd			BP_SET_LSIZE(bp, db->db_size);
1252219089Spjd		} else {
1253219089Spjd			ASSERT(BP_GET_LEVEL(bp) == 0);
1254219089Spjd			bp->blk_fill = 1;
1255219089Spjd		}
1256185029Spjd	}
1257185029Spjd}
1258185029Spjd
1259219089Spjdstatic void
1260219089Spjddmu_sync_late_arrival_ready(zio_t *zio)
1261219089Spjd{
1262219089Spjd	dmu_sync_ready(zio, NULL, zio->io_private);
1263219089Spjd}
1264219089Spjd
1265185029Spjd/* ARGSUSED */
1266185029Spjdstatic void
1267168404Spjddmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1268168404Spjd{
1269219089Spjd	dmu_sync_arg_t *dsa = varg;
1270219089Spjd	dbuf_dirty_record_t *dr = dsa->dsa_dr;
1271168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
1272168404Spjd
1273168404Spjd	mutex_enter(&db->db_mtx);
1274168404Spjd	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1275219089Spjd	if (zio->io_error == 0) {
1276219089Spjd		dr->dt.dl.dr_overridden_by = *zio->io_bp;
1277219089Spjd		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1278219089Spjd		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1279219089Spjd		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
1280219089Spjd			BP_ZERO(&dr->dt.dl.dr_overridden_by);
1281219089Spjd	} else {
1282219089Spjd		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1283219089Spjd	}
1284168404Spjd	cv_broadcast(&db->db_changed);
1285168404Spjd	mutex_exit(&db->db_mtx);
1286168404Spjd
1287219089Spjd	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1288168404Spjd
1289219089Spjd	kmem_free(dsa, sizeof (*dsa));
1290168404Spjd}
1291168404Spjd
1292219089Spjdstatic void
1293219089Spjddmu_sync_late_arrival_done(zio_t *zio)
1294219089Spjd{
1295219089Spjd	blkptr_t *bp = zio->io_bp;
1296219089Spjd	dmu_sync_arg_t *dsa = zio->io_private;
1297219089Spjd
1298219089Spjd	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1299219089Spjd		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1300219089Spjd		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1301219089Spjd		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1302219089Spjd	}
1303219089Spjd
1304219089Spjd	dmu_tx_commit(dsa->dsa_tx);
1305219089Spjd
1306219089Spjd	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1307219089Spjd
1308219089Spjd	kmem_free(dsa, sizeof (*dsa));
1309219089Spjd}
1310219089Spjd
1311219089Spjdstatic int
1312219089Spjddmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1313219089Spjd    zio_prop_t *zp, zbookmark_t *zb)
1314219089Spjd{
1315219089Spjd	dmu_sync_arg_t *dsa;
1316219089Spjd	dmu_tx_t *tx;
1317219089Spjd
1318219089Spjd	tx = dmu_tx_create(os);
1319219089Spjd	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1320219089Spjd	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1321219089Spjd		dmu_tx_abort(tx);
1322219089Spjd		return (EIO);	/* Make zl_get_data do txg_waited_synced() */
1323219089Spjd	}
1324219089Spjd
1325219089Spjd	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1326219089Spjd	dsa->dsa_dr = NULL;
1327219089Spjd	dsa->dsa_done = done;
1328219089Spjd	dsa->dsa_zgd = zgd;
1329219089Spjd	dsa->dsa_tx = tx;
1330219089Spjd
1331219089Spjd	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1332219089Spjd	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1333219089Spjd	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
1334219089Spjd	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1335219089Spjd
1336219089Spjd	return (0);
1337219089Spjd}
1338219089Spjd
1339168404Spjd/*
1340168404Spjd * Intent log support: sync the block associated with db to disk.
1341168404Spjd * N.B. and XXX: the caller is responsible for making sure that the
1342168404Spjd * data isn't changing while dmu_sync() is writing it.
1343168404Spjd *
1344168404Spjd * Return values:
1345168404Spjd *
1346168404Spjd *	EEXIST: this txg has already been synced, so there's nothing to to.
1347168404Spjd *		The caller should not log the write.
1348168404Spjd *
1349168404Spjd *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1350168404Spjd *		The caller should not log the write.
1351168404Spjd *
1352168404Spjd *	EALREADY: this block is already in the process of being synced.
1353168404Spjd *		The caller should track its progress (somehow).
1354168404Spjd *
1355219089Spjd *	EIO: could not do the I/O.
1356219089Spjd *		The caller should do a txg_wait_synced().
1357168404Spjd *
1358219089Spjd *	0: the I/O has been initiated.
1359219089Spjd *		The caller should log this blkptr in the done callback.
1360219089Spjd *		It is possible that the I/O will fail, in which case
1361219089Spjd *		the error will be reported to the done callback and
1362219089Spjd *		propagated to pio from zio_done().
1363168404Spjd */
1364168404Spjdint
1365219089Spjddmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1366168404Spjd{
1367219089Spjd	blkptr_t *bp = zgd->zgd_bp;
1368219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1369219089Spjd	objset_t *os = db->db_objset;
1370219089Spjd	dsl_dataset_t *ds = os->os_dsl_dataset;
1371168404Spjd	dbuf_dirty_record_t *dr;
1372219089Spjd	dmu_sync_arg_t *dsa;
1373168404Spjd	zbookmark_t zb;
1374219089Spjd	zio_prop_t zp;
1375219089Spjd	dnode_t *dn;
1376168404Spjd
1377219089Spjd	ASSERT(pio != NULL);
1378168404Spjd	ASSERT(BP_IS_HOLE(bp));
1379168404Spjd	ASSERT(txg != 0);
1380168404Spjd
1381219089Spjd	SET_BOOKMARK(&zb, ds->ds_object,
1382219089Spjd	    db->db.db_object, db->db_level, db->db_blkid);
1383168404Spjd
1384219089Spjd	DB_DNODE_ENTER(db);
1385219089Spjd	dn = DB_DNODE(db);
1386219089Spjd	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
1387219089Spjd	DB_DNODE_EXIT(db);
1388219089Spjd
1389168404Spjd	/*
1390219089Spjd	 * If we're frozen (running ziltest), we always need to generate a bp.
1391168404Spjd	 */
1392219089Spjd	if (txg > spa_freeze_txg(os->os_spa))
1393219089Spjd		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1394168404Spjd
1395168404Spjd	/*
1396219089Spjd	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1397219089Spjd	 * and us.  If we determine that this txg is not yet syncing,
1398219089Spjd	 * but it begins to sync a moment later, that's OK because the
1399219089Spjd	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1400168404Spjd	 */
1401219089Spjd	mutex_enter(&db->db_mtx);
1402219089Spjd
1403219089Spjd	if (txg <= spa_last_synced_txg(os->os_spa)) {
1404168404Spjd		/*
1405219089Spjd		 * This txg has already synced.  There's nothing to do.
1406168404Spjd		 */
1407219089Spjd		mutex_exit(&db->db_mtx);
1408168404Spjd		return (EEXIST);
1409168404Spjd	}
1410168404Spjd
1411219089Spjd	if (txg <= spa_syncing_txg(os->os_spa)) {
1412219089Spjd		/*
1413219089Spjd		 * This txg is currently syncing, so we can't mess with
1414219089Spjd		 * the dirty record anymore; just write a new log block.
1415219089Spjd		 */
1416219089Spjd		mutex_exit(&db->db_mtx);
1417219089Spjd		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1418168404Spjd	}
1419168404Spjd
1420168404Spjd	dr = db->db_last_dirty;
1421219089Spjd	while (dr && dr->dr_txg != txg)
1422168404Spjd		dr = dr->dr_next;
1423219089Spjd
1424219089Spjd	if (dr == NULL) {
1425168404Spjd		/*
1426219089Spjd		 * There's no dr for this dbuf, so it must have been freed.
1427168404Spjd		 * There's no need to log writes to freed blocks, so we're done.
1428168404Spjd		 */
1429168404Spjd		mutex_exit(&db->db_mtx);
1430168404Spjd		return (ENOENT);
1431168404Spjd	}
1432168404Spjd
1433168404Spjd	ASSERT(dr->dr_txg == txg);
1434219089Spjd	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1435219089Spjd	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1436168404Spjd		/*
1437219089Spjd		 * We have already issued a sync write for this buffer,
1438219089Spjd		 * or this buffer has already been synced.  It could not
1439219089Spjd		 * have been dirtied since, or we would have cleared the state.
1440168404Spjd		 */
1441168404Spjd		mutex_exit(&db->db_mtx);
1442168404Spjd		return (EALREADY);
1443168404Spjd	}
1444168404Spjd
1445219089Spjd	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1446168404Spjd	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1447168404Spjd	mutex_exit(&db->db_mtx);
1448168404Spjd
1449219089Spjd	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1450219089Spjd	dsa->dsa_dr = dr;
1451219089Spjd	dsa->dsa_done = done;
1452219089Spjd	dsa->dsa_zgd = zgd;
1453219089Spjd	dsa->dsa_tx = NULL;
1454168404Spjd
1455219089Spjd	zio_nowait(arc_write(pio, os->os_spa, txg,
1456219089Spjd	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
1457219089Spjd	    dmu_sync_ready, dmu_sync_done, dsa,
1458219089Spjd	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
1459185029Spjd
1460219089Spjd	return (0);
1461168404Spjd}
1462168404Spjd
1463168404Spjdint
1464168404Spjddmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1465168404Spjd	dmu_tx_t *tx)
1466168404Spjd{
1467168404Spjd	dnode_t *dn;
1468168404Spjd	int err;
1469168404Spjd
1470219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
1471168404Spjd	if (err)
1472168404Spjd		return (err);
1473168404Spjd	err = dnode_set_blksz(dn, size, ibs, tx);
1474168404Spjd	dnode_rele(dn, FTAG);
1475168404Spjd	return (err);
1476168404Spjd}
1477168404Spjd
1478168404Spjdvoid
1479168404Spjddmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
1480168404Spjd	dmu_tx_t *tx)
1481168404Spjd{
1482168404Spjd	dnode_t *dn;
1483168404Spjd
1484168404Spjd	/* XXX assumes dnode_hold will not get an i/o error */
1485219089Spjd	(void) dnode_hold(os, object, FTAG, &dn);
1486168404Spjd	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
1487168404Spjd	dn->dn_checksum = checksum;
1488168404Spjd	dnode_setdirty(dn, tx);
1489168404Spjd	dnode_rele(dn, FTAG);
1490168404Spjd}
1491168404Spjd
1492168404Spjdvoid
1493168404Spjddmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1494168404Spjd	dmu_tx_t *tx)
1495168404Spjd{
1496168404Spjd	dnode_t *dn;
1497168404Spjd
1498168404Spjd	/* XXX assumes dnode_hold will not get an i/o error */
1499219089Spjd	(void) dnode_hold(os, object, FTAG, &dn);
1500168404Spjd	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
1501168404Spjd	dn->dn_compress = compress;
1502168404Spjd	dnode_setdirty(dn, tx);
1503168404Spjd	dnode_rele(dn, FTAG);
1504168404Spjd}
1505168404Spjd
1506219089Spjdint zfs_mdcomp_disable = 0;
1507219089SpjdTUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
1508219089SpjdSYSCTL_DECL(_vfs_zfs);
1509219089SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
1510219089Spjd    &zfs_mdcomp_disable, 0, "Disable metadata compression");
1511219089Spjd
1512219089Spjdvoid
1513219089Spjddmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
1514219089Spjd{
1515219089Spjd	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1516219089Spjd	boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
1517219089Spjd	    (wp & WP_SPILL));
1518219089Spjd	enum zio_checksum checksum = os->os_checksum;
1519219089Spjd	enum zio_compress compress = os->os_compress;
1520219089Spjd	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1521219089Spjd	boolean_t dedup;
1522219089Spjd	boolean_t dedup_verify = os->os_dedup_verify;
1523219089Spjd	int copies = os->os_copies;
1524219089Spjd
1525219089Spjd	/*
1526219089Spjd	 * Determine checksum setting.
1527219089Spjd	 */
1528219089Spjd	if (ismd) {
1529219089Spjd		/*
1530219089Spjd		 * Metadata always gets checksummed.  If the data
1531219089Spjd		 * checksum is multi-bit correctable, and it's not a
1532219089Spjd		 * ZBT-style checksum, then it's suitable for metadata
1533219089Spjd		 * as well.  Otherwise, the metadata checksum defaults
1534219089Spjd		 * to fletcher4.
1535219089Spjd		 */
1536219089Spjd		if (zio_checksum_table[checksum].ci_correctable < 1 ||
1537219089Spjd		    zio_checksum_table[checksum].ci_eck)
1538219089Spjd			checksum = ZIO_CHECKSUM_FLETCHER_4;
1539219089Spjd	} else {
1540219089Spjd		checksum = zio_checksum_select(dn->dn_checksum, checksum);
1541219089Spjd	}
1542219089Spjd
1543219089Spjd	/*
1544219089Spjd	 * Determine compression setting.
1545219089Spjd	 */
1546219089Spjd	if (ismd) {
1547219089Spjd		/*
1548219089Spjd		 * XXX -- we should design a compression algorithm
1549219089Spjd		 * that specializes in arrays of bps.
1550219089Spjd		 */
1551219089Spjd		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
1552219089Spjd		    ZIO_COMPRESS_LZJB;
1553219089Spjd	} else {
1554219089Spjd		compress = zio_compress_select(dn->dn_compress, compress);
1555219089Spjd	}
1556219089Spjd
1557219089Spjd	/*
1558219089Spjd	 * Determine dedup setting.  If we are in dmu_sync(), we won't
1559219089Spjd	 * actually dedup now because that's all done in syncing context;
1560219089Spjd	 * but we do want to use the dedup checkum.  If the checksum is not
1561219089Spjd	 * strong enough to ensure unique signatures, force dedup_verify.
1562219089Spjd	 */
1563219089Spjd	dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
1564219089Spjd	if (dedup) {
1565219089Spjd		checksum = dedup_checksum;
1566219089Spjd		if (!zio_checksum_table[checksum].ci_dedup)
1567219089Spjd			dedup_verify = 1;
1568219089Spjd	}
1569219089Spjd
1570219089Spjd	if (wp & WP_DMU_SYNC)
1571219089Spjd		dedup = 0;
1572219089Spjd
1573219089Spjd	if (wp & WP_NOFILL) {
1574219089Spjd		ASSERT(!ismd && level == 0);
1575219089Spjd		checksum = ZIO_CHECKSUM_OFF;
1576219089Spjd		compress = ZIO_COMPRESS_OFF;
1577219089Spjd		dedup = B_FALSE;
1578219089Spjd	}
1579219089Spjd
1580219089Spjd	zp->zp_checksum = checksum;
1581219089Spjd	zp->zp_compress = compress;
1582219089Spjd	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
1583219089Spjd	zp->zp_level = level;
1584219089Spjd	zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
1585219089Spjd	zp->zp_dedup = dedup;
1586219089Spjd	zp->zp_dedup_verify = dedup && dedup_verify;
1587219089Spjd}
1588219089Spjd
1589168404Spjdint
1590168404Spjddmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
1591168404Spjd{
1592168404Spjd	dnode_t *dn;
1593168404Spjd	int i, err;
1594168404Spjd
1595219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
1596168404Spjd	if (err)
1597168404Spjd		return (err);
1598168404Spjd	/*
1599168404Spjd	 * Sync any current changes before
1600168404Spjd	 * we go trundling through the block pointers.
1601168404Spjd	 */
1602168404Spjd	for (i = 0; i < TXG_SIZE; i++) {
1603168404Spjd		if (list_link_active(&dn->dn_dirty_link[i]))
1604168404Spjd			break;
1605168404Spjd	}
1606168404Spjd	if (i != TXG_SIZE) {
1607168404Spjd		dnode_rele(dn, FTAG);
1608168404Spjd		txg_wait_synced(dmu_objset_pool(os), 0);
1609219089Spjd		err = dnode_hold(os, object, FTAG, &dn);
1610168404Spjd		if (err)
1611168404Spjd			return (err);
1612168404Spjd	}
1613168404Spjd
1614185029Spjd	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
1615168404Spjd	dnode_rele(dn, FTAG);
1616168404Spjd
1617168404Spjd	return (err);
1618168404Spjd}
1619168404Spjd
1620168404Spjdvoid
1621168404Spjddmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
1622168404Spjd{
1623219089Spjd	dnode_phys_t *dnp;
1624219089Spjd
1625168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
1626168404Spjd	mutex_enter(&dn->dn_mtx);
1627168404Spjd
1628219089Spjd	dnp = dn->dn_phys;
1629219089Spjd
1630168404Spjd	doi->doi_data_block_size = dn->dn_datablksz;
1631168404Spjd	doi->doi_metadata_block_size = dn->dn_indblkshift ?
1632168404Spjd	    1ULL << dn->dn_indblkshift : 0;
1633219089Spjd	doi->doi_type = dn->dn_type;
1634219089Spjd	doi->doi_bonus_type = dn->dn_bonustype;
1635219089Spjd	doi->doi_bonus_size = dn->dn_bonuslen;
1636168404Spjd	doi->doi_indirection = dn->dn_nlevels;
1637168404Spjd	doi->doi_checksum = dn->dn_checksum;
1638168404Spjd	doi->doi_compress = dn->dn_compress;
1639219089Spjd	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
1640219089Spjd	doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
1641219089Spjd	doi->doi_fill_count = 0;
1642219089Spjd	for (int i = 0; i < dnp->dn_nblkptr; i++)
1643219089Spjd		doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
1644168404Spjd
1645168404Spjd	mutex_exit(&dn->dn_mtx);
1646168404Spjd	rw_exit(&dn->dn_struct_rwlock);
1647168404Spjd}
1648168404Spjd
1649168404Spjd/*
1650168404Spjd * Get information on a DMU object.
1651168404Spjd * If doi is NULL, just indicates whether the object exists.
1652168404Spjd */
1653168404Spjdint
1654168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
1655168404Spjd{
1656168404Spjd	dnode_t *dn;
1657219089Spjd	int err = dnode_hold(os, object, FTAG, &dn);
1658168404Spjd
1659168404Spjd	if (err)
1660168404Spjd		return (err);
1661168404Spjd
1662168404Spjd	if (doi != NULL)
1663168404Spjd		dmu_object_info_from_dnode(dn, doi);
1664168404Spjd
1665168404Spjd	dnode_rele(dn, FTAG);
1666168404Spjd	return (0);
1667168404Spjd}
1668168404Spjd
1669168404Spjd/*
1670168404Spjd * As above, but faster; can be used when you have a held dbuf in hand.
1671168404Spjd */
1672168404Spjdvoid
1673219089Spjddmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
1674168404Spjd{
1675219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1676219089Spjd
1677219089Spjd	DB_DNODE_ENTER(db);
1678219089Spjd	dmu_object_info_from_dnode(DB_DNODE(db), doi);
1679219089Spjd	DB_DNODE_EXIT(db);
1680168404Spjd}
1681168404Spjd
1682168404Spjd/*
1683168404Spjd * Faster still when you only care about the size.
1684168404Spjd * This is specifically optimized for zfs_getattr().
1685168404Spjd */
1686168404Spjdvoid
1687219089Spjddmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
1688219089Spjd    u_longlong_t *nblk512)
1689168404Spjd{
1690219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1691219089Spjd	dnode_t *dn;
1692168404Spjd
1693219089Spjd	DB_DNODE_ENTER(db);
1694219089Spjd	dn = DB_DNODE(db);
1695219089Spjd
1696168404Spjd	*blksize = dn->dn_datablksz;
1697168404Spjd	/* add 1 for dnode space */
1698168404Spjd	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
1699168404Spjd	    SPA_MINBLOCKSHIFT) + 1;
1700219089Spjd	DB_DNODE_EXIT(db);
1701168404Spjd}
1702168404Spjd
/*
 * Byte-swap, in place, every 64-bit element of the buffer.
 *
 * vbuf:	start of the buffer.
 * size:	length of the buffer in bytes; must be a multiple of 8
 *		(asserted).
 *
 * The index is a size_t so it matches the type of the element count and
 * cannot overflow for large buffers.
 */
void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
1715168404Spjd
/*
 * Byte-swap, in place, every 32-bit element of the buffer.
 *
 * vbuf:	start of the buffer.
 * size:	length of the buffer in bytes; must be a multiple of 4
 *		(asserted).
 *
 * The index is a size_t so it matches the type of the element count and
 * cannot overflow for large buffers.
 */
void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	size_t i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}
1728168404Spjd
/*
 * Byte-swap, in place, every 16-bit element of the buffer.
 *
 * vbuf:	start of the buffer.
 * size:	length of the buffer in bytes; must be a multiple of 2
 *		(asserted).
 *
 * The index is a size_t so it matches the type of the element count and
 * cannot overflow for large buffers.
 */
void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	size_t i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}
1741168404Spjd
/*
 * Byte-swap routine for arrays of single bytes.  Intentionally empty: both
 * arguments are ignored and the buffer is left untouched.
 */
/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
	/* Nothing to do. */
}
1747168404Spjd
/*
 * Initialize the DMU and the subsystems set up alongside it.  The calls
 * are made in a fixed order; dmu_fini() tears them down in exactly the
 * reverse order.
 */
void
dmu_init(void)
{
	/* Debug messages, SA cache and xuio statistics. */
	zfs_dbgmsg_init();
	sa_cache_init();
	xuio_stat_init();

	/* Objsets, dnodes, dbufs and prefetch. */
	dmu_objset_init();
	dnode_init();
	dbuf_init();
	zfetch_init();

	/* ARC and L2ARC. */
	arc_init();
	l2arc_init();
}
1761168404Spjd
/*
 * Tear down everything dmu_init() set up, in exactly the reverse order of
 * initialization.
 */
void
dmu_fini(void)
{
	/* L2ARC and ARC. */
	l2arc_fini();
	arc_fini();

	/* Prefetch, dbufs, dnodes and objsets. */
	zfetch_fini();
	dbuf_fini();
	dnode_fini();
	dmu_objset_fini();

	/* xuio statistics, SA cache and debug messages. */
	xuio_stat_fini();
	sa_cache_fini();
	zfs_dbgmsg_fini();
}
1775