dmu.c revision 321549
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 */
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/sa.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/racct.h>
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif

/*
 * Enable/disable nopwrite feature.
 */
int zfs_nopwrite_enabled = 1;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
    &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");

/*
 * Tunable to control the percentage of dirtied blocks from frees allowed
 * into one TXG.  After this threshold is crossed, additional dirty blocks
 * from frees wait until the next TXG.  A value of zero will disable this
 * throttle.
 */
uint32_t zfs_per_txg_dirty_frees_percent = 30;
SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
    &zfs_per_txg_dirty_frees_percent, 0,
    "Percentage of dirtied blocks from frees in one txg");

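/*
 * Illustrative arithmetic (added commentary, not original source): with
 * the default zfs_per_txg_dirty_frees_percent of 30 and, for example,
 * zfs_dirty_data_max of 4 GiB, dmu_free_long_range_impl() below computes
 * a threshold of 30 * 4 GiB / 100 = ~1.2 GiB; once the dirty data from
 * frees summed across open TXGs reaches that, further frees wait for the
 * next TXG to open.
 */
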
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
};

const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
	{	byteswap_uint8_array,	"uint8"		},
	{	byteswap_uint16_array,	"uint16"	},
	{	byteswap_uint32_array,	"uint32"	},
	{	byteswap_uint64_array,	"uint64"	},
	{	zap_byteswap,		"zap"		},
	{	dnode_buf_byteswap,	"dnode"		},
	{	dmu_objset_byteswap,	"objset"	},
	{	zfs_znode_byteswap,	"znode"		},
	{	zfs_oldacl_byteswap,	"oldacl"	},
	{	zfs_acl_byteswap,	"acl"		}
};

int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	uint64_t blkid;
	dmu_buf_impl_t *db;

	blkid = dbuf_whichblock(dn, 0, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, tag);
	rw_exit(&dn->dn_struct_rwlock);

	if (db == NULL) {
		*dbp = NULL;
		return (SET_ERROR(EIO));
	}

	*dbp = &db->db;
	return (0);
}

int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	uint64_t blkid;
	dmu_buf_impl_t *db;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	blkid = dbuf_whichblock(dn, 0, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, tag);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);

	if (db == NULL) {
		*dbp = NULL;
		return (SET_ERROR(EIO));
	}

	*dbp = &db->db;
	return (err);
}

int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
	int err;
	int db_flags = DB_RF_CANFAIL;

	if (flags & DMU_READ_NO_PREFETCH)
		db_flags |= DB_RF_NOPREFETCH;

	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
	if (err == 0) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
		err = dbuf_read(db, NULL, db_flags);
		if (err != 0) {
			dbuf_rele(db, tag);
			*dbp = NULL;
		}
	}

	return (err);
}

int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
	int err;
	int db_flags = DB_RF_CANFAIL;

	if (flags & DMU_READ_NO_PREFETCH)
		db_flags |= DB_RF_NOPREFETCH;

	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
	if (err == 0) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
		err = dbuf_read(db, NULL, db_flags);
		if (err != 0) {
			dbuf_rele(db, tag);
			*dbp = NULL;
		}
	}

	return (err);
}

int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}

int
dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (dn->dn_bonus != db) {
		error = SET_ERROR(EINVAL);
	} else if (newsize < 0 || newsize > db_fake->db_size) {
		error = SET_ERROR(EINVAL);
	} else {
		dnode_setbonuslen(dn, newsize, tx);
		error = 0;
	}

	DB_DNODE_EXIT(db);
	return (error);
}

int
dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (!DMU_OT_IS_VALID(type)) {
		error = SET_ERROR(EINVAL);
	} else if (dn->dn_bonus != db) {
		error = SET_ERROR(EINVAL);
	} else {
		dnode_setbonus_type(dn, type, tx);
		error = 0;
	}

	DB_DNODE_EXIT(db);
	return (error);
}

dmu_object_type_t
dmu_get_bonustype(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	type = dn->dn_bonustype;
	DB_DNODE_EXIT(db);

	return (type);
}

int
dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error != 0)
		return (error);
	dbuf_rm_spill(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dnode_rm_spill(dn, tx);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
	return (error);
}

/*
 * returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	dmu_buf_impl_t *db;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error)
		return (error);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;

	/* as long as the bonus buf is held, the dnode will be held */
	if (refcount_add(&db->db_holds, tag) == 1) {
		VERIFY(dnode_add_ref(dn, db));
		atomic_inc_32(&dn->dn_dbufs_count);
	}

	/*
	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
	 * a dnode hold for every dbuf.
	 */
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));

	*dbp = &db->db;
	return (0);
}

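/*
 * Illustrative usage sketch (added commentary, not original source): a
 * typical caller holds the bonus buffer, reads from db->db_data, and
 * releases it with dmu_buf_rele().  The "my_read_bonus" wrapper below is
 * a hypothetical example, not part of the DMU API:
 *
 *	static int
 *	my_read_bonus(objset_t *os, uint64_t object, void *buf, int len)
 *	{
 *		dmu_buf_t *db;
 *		int err;
 *
 *		err = dmu_bonus_hold(os, object, FTAG, &db);
 *		if (err != 0)
 *			return (err);
 *		bcopy(db->db_data, buf, MIN(len, db->db_size));
 *		dmu_buf_rele(db, FTAG);
 *		return (0);
 *	}
 */
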
/*
 * returns ENOENT, EIO, or 0.
 *
 * This interface will allocate a blank spill dbuf when a spill blk
 * doesn't already exist on the dnode.
 *
 * if you only want to find an already existing spill db, then
 * dmu_spill_hold_existing() should be used.
 */
int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = NULL;
	int err;

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_exit(&dn->dn_struct_rwlock);

	ASSERT(db != NULL);
	err = dbuf_read(db, NULL, flags);
	if (err == 0)
		*dbp = &db->db;
	else
		dbuf_rele(db, tag);
	return (err);
}

int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
		err = SET_ERROR(EINVAL);
	} else {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

		if (!dn->dn_have_spill) {
			err = SET_ERROR(ENOENT);
		} else {
			err = dmu_spill_hold_by_dnode(dn,
			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
		}

		rw_exit(&dn->dn_struct_rwlock);
	}

	DB_DNODE_EXIT(db);
	return (err);
}

int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
	DB_DNODE_EXIT(db);

	return (err);
}

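/*
 * Illustrative sketch (added commentary, not original source): the two
 * spill-hold interfaces above differ only in allocation behavior.
 * Starting from a held bonus buffer "bonus":
 *
 *	dmu_buf_t *spill;
 *	int err;
 *
 *	err = dmu_spill_hold_existing(bonus, FTAG, &spill);
 *		(fails with ENOENT when no spill block exists yet)
 *	err = dmu_spill_hold_by_bonus(bonus, FTAG, &spill);
 *		(allocates a blank spill dbuf when none exists)
 *	if (err == 0)
 *		dmu_buf_rele(spill, FTAG);
 */
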
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t dbuf_flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	/*
	 * Note: We directly notify the prefetch code of this read, so that
	 * we can tell it about the multi-block read.  dbuf_read() only knows
	 * about the one block it is accessing.
	 */
	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
	    DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
	} else {
		if (offset + length > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
			    (longlong_t)offset, (longlong_t)length);
			rw_exit(&dn->dn_struct_rwlock);
			return (SET_ERROR(EIO));
		}
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

#if defined(_KERNEL) && defined(RACCT)
	if (racct_enable && !read) {
		PROC_LOCK(curproc);
		racct_add_force(curproc, RACCT_WRITEBPS, length);
		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
		PROC_UNLOCK(curproc);
	}
#endif

	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	blkid = dbuf_whichblock(dn, 0, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
		if (db == NULL) {
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			zio_nowait(zio);
			return (SET_ERROR(EIO));
		}

		/* initiate async i/o */
		if (read)
			(void) dbuf_read(db, zio, dbuf_flags);
#ifdef _KERNEL
		else
			curthread->td_ru.ru_oublock++;
#endif
		dbp[i] = &db->db;
	}

	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
		    read && DNODE_IS_CACHEABLE(dn));
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}

static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp, DMU_READ_PREFETCH);

	dnode_rele(dn, FTAG);

	return (err);
}

int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
    uint64_t length, boolean_t read, void *tag, int *numbufsp,
    dmu_buf_t ***dbpp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp, DMU_READ_PREFETCH);
	DB_DNODE_EXIT(db);

	return (err);
}

void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
	int i;
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	for (i = 0; i < numbufs; i++) {
		if (dbp[i])
			dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

/*
 * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
 * indirect blocks prefetched will be those that point to the blocks containing
 * the data starting at offset, and continuing to offset + len.
 *
 * Note that if the indirect blocks above the blocks being prefetched are not
 * in cache, they will be asynchronously read in.
 */
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
    uint64_t len, zio_priority_t pri)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, err;

	if (len == 0) {  /* they're interested in the bonus buffer */
		dn = DMU_META_DNODE(os);

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, level,
		    object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, level, blkid, pri, 0);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	/*
	 * offset + len - 1 is the last byte we want to prefetch for, and offset
	 * is the first.  Then dbuf_whichblock(dn, level, offset + len - 1) is
	 * the last block we want to prefetch, and dbuf_whichblock(dn, level,
	 * offset) is the first.  Then the number we need to prefetch is the
	 * last - first + 1.
	 */
	if (level > 0 || dn->dn_datablkshift != 0) {
		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
		    dbuf_whichblock(dn, level, offset) + 1;
	} else {
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, level, offset);
		for (int i = 0; i < nblks; i++)
			dbuf_prefetch(dn, level, blkid + i, pri, 0);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}

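/*
 * Worked example for the block math above (added commentary, not original
 * source): assuming a 128K (1 << 17 byte) data block size, the call
 *
 *	dmu_prefetch(os, object, 0, 100 << 10, 300 << 10,
 *	    ZIO_PRIORITY_ASYNC_READ);
 *
 * yields dbuf_whichblock(dn, 0, offset) = 0 and
 * dbuf_whichblock(dn, 0, offset + len - 1) = 3, so nblks = 3 - 0 + 1 = 4
 * level-0 blocks are prefetched.
 */
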
/*
 * Get the next "chunk" of file data to free.  We traverse the file from
 * the end so that the file gets shorter over time (if we crash in the
 * middle, this will leave us in a better state).  We find allocated file
 * data by simply searching the allocated level 1 indirects.
 *
 * On input, *start should be the first offset that does not need to be
 * freed (e.g. "offset + length").  On return, *start will be the first
 * offset that should be freed.
 */
static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{
	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
	/* bytes of data covered by a level-1 indirect block */
	uint64_t iblkrange =
	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);

	ASSERT3U(minimum, <=, *start);

	if (*start - minimum <= iblkrange * maxblks) {
		*start = minimum;
		return (0);
	}
	ASSERT(ISP2(iblkrange));

	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
		int err;

		/*
		 * dnode_next_offset(BACKWARDS) will find an allocated L1
		 * indirect block at or before the input offset.  We must
		 * decrement *start so that it is at the end of the region
		 * to search.
		 */
		(*start)--;
		err = dnode_next_offset(dn,
		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);

		/* if there are no indirect blocks before start, we are done */
		if (err == ESRCH) {
			*start = minimum;
			break;
		} else if (err != 0) {
			return (err);
		}

		/* set start to the beginning of this L1 indirect */
		*start = P2ALIGN(*start, iblkrange);
	}
	if (*start < minimum)
		*start = minimum;
	return (0);
}

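/*
 * Worked example for iblkrange (added commentary, not original source):
 * with a 128K data block size, dn_indblkshift of 17, and SPA_BLKPTRSHIFT
 * of 7 (128-byte block pointers), one L1 indirect holds
 * EPB(17, 7) = 1 << 10 = 1024 block pointers, so iblkrange =
 * 128K * 1024 = 128M of file data per level-1 indirect block.
 */
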
static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
    uint64_t length)
{
	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
	int err;
	uint64_t dirty_frees_threshold;
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (offset >= object_size)
		return (0);

	if (zfs_per_txg_dirty_frees_percent <= 100)
		dirty_frees_threshold =
		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
	else
		dirty_frees_threshold = zfs_dirty_data_max / 4;

	if (length == DMU_OBJECT_END || offset + length > object_size)
		length = object_size - offset;

	while (length != 0) {
		uint64_t chunk_end, chunk_begin, chunk_len;
		uint64_t long_free_dirty_all_txgs = 0;
		dmu_tx_t *tx;

		chunk_end = chunk_begin = offset + length;

		/* move chunk_begin backwards to the beginning of this chunk */
		err = get_next_chunk(dn, &chunk_begin, offset);
		if (err)
			return (err);
		ASSERT3U(chunk_begin, >=, offset);
		ASSERT3U(chunk_begin, <=, chunk_end);

		chunk_len = chunk_end - chunk_begin;

		mutex_enter(&dp->dp_lock);
		for (int t = 0; t < TXG_SIZE; t++) {
			long_free_dirty_all_txgs +=
			    dp->dp_long_free_dirty_pertxg[t];
		}
		mutex_exit(&dp->dp_lock);

		/*
		 * To avoid filling up a TXG with just frees, wait for
		 * the next TXG to open before freeing more chunks if
		 * we have reached the threshold of frees.
		 */
		if (dirty_frees_threshold != 0 &&
		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
			txg_wait_open(dp, 0);
			continue;
		}

		tx = dmu_tx_create(os);
		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);

		/*
		 * Mark this transaction as typically resulting in a net
		 * reduction in space used.
		 */
		dmu_tx_mark_netfree(tx);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}

		mutex_enter(&dp->dp_lock);
		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
		    chunk_len;
		mutex_exit(&dp->dp_lock);
		DTRACE_PROBE3(free__long__range,
		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
		    uint64_t, dmu_tx_get_txg(tx));
		dnode_free_range(dn, chunk_begin, chunk_len, tx);
		dmu_tx_commit(tx);

		length -= chunk_len;
	}
	return (0);
}

int
dmu_free_long_range(objset_t *os, uint64_t object,
    uint64_t offset, uint64_t length)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);
	err = dmu_free_long_range_impl(os, dn, offset, length);

	/*
	 * It is important to zero out the maxblkid when freeing the entire
	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
	 * will take the fast path, and (b) dnode_reallocate() can verify
	 * that the entire file has been freed.
	 */
	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
		dn->dn_maxblkid = 0;

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_free_long_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx;
	int err;

	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
	if (err != 0)
		return (err);

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, object);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err == 0) {
		err = dmu_object_free(os, object, tx);
		dmu_tx_commit(tx);
	} else {
		dmu_tx_abort(tx);
	}

	return (err);
}

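/*
 * Illustrative usage (added commentary, not original source): to punch a
 * hole in a file versus destroying an object outright:
 *
 *	err = dmu_free_long_range(os, object, off, len);
 *	err = dmu_free_long_object(os, object);
 *
 * Both create and assign their own transactions (TXG_WAIT) and are
 * throttled by zfs_per_txg_dirty_frees_percent, so the caller passes no
 * dmu_tx_t; dmu_free_range() below, by contrast, operates within a
 * caller-supplied transaction.
 */
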
860168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
861168404Spjd    uint64_t size, dmu_tx_t *tx)
862168404Spjd{
863168404Spjd	dnode_t *dn;
864219089Spjd	int err = dnode_hold(os, object, FTAG, &dn);
865168404Spjd	if (err)
866168404Spjd		return (err);
867168404Spjd	ASSERT(offset < UINT64_MAX);
868168404Spjd	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
869168404Spjd	dnode_free_range(dn, offset, size, tx);
870168404Spjd	dnode_rele(dn, FTAG);
871168404Spjd	return (0);
872168404Spjd}
873168404Spjd
static int
dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dmu_buf_t **dbp;
	int numbufs, err = 0;

	/*
	 * Deal with odd block sizes, where there can't be data past the first
	 * block.  If we ever do the tail block optimization, we will need to
	 * handle that here as well.
	 */
	if (dn->dn_maxblkid == 0) {
		int newsz = offset > dn->dn_datablksz ? 0 :
		    MIN(size, dn->dn_datablksz - offset);
		bzero((char *)buf + newsz, size - newsz);
		size = newsz;
	}

	while (size > 0) {
		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
		int i;

		/*
		 * NB: we could do this block-at-a-time, but it's nice
		 * to be reading in parallel.
		 */
		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
		    TRUE, FTAG, &numbufs, &dbp, flags);
		if (err)
			break;

		for (i = 0; i < numbufs; i++) {
			int tocpy;
			int bufoff;
			dmu_buf_t *db = dbp[i];

			ASSERT(size > 0);

			bufoff = offset - db->db_offset;
			tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);

			offset += tocpy;
			size -= tocpy;
			buf = (char *)buf + tocpy;
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);
	}
	return (err);
}

int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	err = dmu_read_impl(dn, offset, size, buf, flags);
	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
    uint32_t flags)
{
	return (dmu_read_impl(dn, offset, size, buf, flags));
}

static void
dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	int i;

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
}

void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
		return;

	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

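/*
 * Illustrative read/write round trip (added commentary, not original
 * source); the transaction pattern follows the one used by
 * dmu_free_long_object() above:
 *
 *	char buf[512];
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, 0, sizeof (buf));
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 *	if (err == 0) {
 *		dmu_write(os, object, 0, sizeof (buf), buf, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 *	err = dmu_read(os, object, 0, sizeof (buf), buf, DMU_READ_PREFETCH);
 */
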
void
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
		return;

	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	if (size == 0)
		return;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
		dmu_buf_t *db = dbp[i];

		dmu_buf_will_not_fill(db, tx);
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
    int compressed_size, int byteorder, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
	VERIFY0(dmu_buf_hold_noread(os, object, offset,
	    FTAG, &db));

	dmu_buf_write_embedded(db,
	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
	    uncompressed_size, compressed_size, byteorder, tx);

	dmu_buf_rele(db, FTAG);
}

/*
 * DMU support for xuio
 */
kstat_t *xuio_ksp = NULL;

int
dmu_xuio_init(xuio_t *xuio, int nblk)
{
	dmu_xuio_t *priv;
	uio_t *uio = &xuio->xu_uio;

	uio->uio_iovcnt = nblk;
	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);

	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
	priv->cnt = nblk;
	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
	priv->iovp = uio->uio_iov;
	XUIO_XUZC_PRIV(xuio) = priv;

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);

	return (0);
}

void
dmu_xuio_fini(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int nblk = priv->cnt;

	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
	kmem_free(priv, sizeof (dmu_xuio_t));

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
}

/*
 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
 * and increase priv->next by 1.
 */
int
dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
{
	struct iovec *iov;
	uio_t *uio = &xuio->xu_uio;
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int i = priv->next++;

	ASSERT(i < priv->cnt);
	ASSERT(off + n <= arc_buf_lsize(abuf));
	iov = uio->uio_iov + i;
	iov->iov_base = (char *)abuf->b_data + off;
	iov->iov_len = n;
	priv->bufs[i] = abuf;
	return (0);
}

int
dmu_xuio_cnt(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	return (priv->cnt);
}

arc_buf_t *
dmu_xuio_arcbuf(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	return (priv->bufs[i]);
}

void
dmu_xuio_clear(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	priv->bufs[i] = NULL;
}

static void
xuio_stat_init(void)
{
	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (xuio_ksp != NULL) {
		xuio_ksp->ks_data = &xuio_stats;
		kstat_install(xuio_ksp);
	}
}

static void
xuio_stat_fini(void)
{
	if (xuio_ksp != NULL) {
		kstat_delete(xuio_ksp);
		xuio_ksp = NULL;
	}
}

void
xuio_stat_wbuf_copied(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}

void
xuio_stat_wbuf_nocopy(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}

#ifdef _KERNEL
static int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;
	xuio_t *xuio = NULL;

	/*
	 * NB: we could do this block-at-a-time, but it's nice
	 * to be reading in parallel.
	 */
	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    TRUE, FTAG, &numbufs, &dbp, 0);
	if (err)
		return (err);

#ifdef UIO_XUIO
	if (uio->uio_extflg == UIO_XUIO)
		xuio = (xuio_t *)uio;
#endif

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		if (xuio) {
			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
			arc_buf_t *dbuf_abuf = dbi->db_buf;
			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
			if (!err) {
				uio->uio_resid -= tocpy;
				uio->uio_loffset += tocpy;
			}

			if (abuf == dbuf_abuf)
				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
			else
				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
		} else {
#ifdef illumos
			err = uiomove((char *)db->db_data + bufoff, tocpy,
			    UIO_READ, uio);
#else
			err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
			    tocpy, uio);
#endif
		}
		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}

/*
 * Read 'size' bytes into the uio buffer.
 * From object zdb->db_object.
 * Starting at offset uio->uio_loffset.
 *
 * If the caller already has a dbuf in the target object
 * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
 * because we don't have to find the dnode_t for the object.
 */
int
dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_read_uio_dnode(dn, uio, size);
	DB_DNODE_EXIT(db);

	return (err);
}

/*
 * Read 'size' bytes into the uio buffer.
 * From the specified object.
 * Starting at offset uio->uio_loffset.
 */
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_read_uio_dnode(dn, uio, size);

	dnode_rele(dn, FTAG);

	return (err);
}

static int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;
	int err = 0;
	int i;

	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

#ifdef illumos
		/*
		 * XXX uiomove could block forever (eg. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);
#else
		err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
		    uio);
#endif

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		size -= tocpy;
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

/*
 * Write 'size' bytes from the uio buffer.
 * To object zdb->db_object.
 * Starting at offset uio->uio_loffset.
 *
 * If the caller already has a dbuf in the target object
 * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
 * because we don't have to find the dnode_t for the object.
 */
int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_write_uio_dnode(dn, uio, size, tx);
	DB_DNODE_EXIT(db);

	return (err);
}

/*
 * Write 'size' bytes from the uio buffer.
 * To the specified object.
 * Starting at offset uio->uio_loffset.
 */
int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_write_uio_dnode(dn, uio, size, tx);

	dnode_rele(dn, FTAG);

	return (err);
}

#ifdef illumos
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = zfs_map_page(pp, S_READ);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			zfs_unmap_page(pp, va);
			pp = pp->p_next;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

1463277300Ssmh#else	/* !illumos */
1464258745Savg
1465258745Savgint
1466258745Savgdmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1467258745Savg    vm_page_t *ma, dmu_tx_t *tx)
1468258745Savg{
1469258745Savg	dmu_buf_t **dbp;
1470258745Savg	struct sf_buf *sf;
1471258745Savg	int numbufs, i;
1472258745Savg	int err;
1473258745Savg
1474258745Savg	if (size == 0)
1475258745Savg		return (0);
1476258745Savg
1477258745Savg	err = dmu_buf_hold_array(os, object, offset, size,
1478258745Savg	    FALSE, FTAG, &numbufs, &dbp);
1479258745Savg	if (err)
1480258745Savg		return (err);
1481258745Savg
1482258745Savg	for (i = 0; i < numbufs; i++) {
1483258745Savg		int tocpy, copied, thiscpy;
1484258745Savg		int bufoff;
1485258745Savg		dmu_buf_t *db = dbp[i];
1486258745Savg		caddr_t va;
1487258745Savg
1488258745Savg		ASSERT(size > 0);
1489258745Savg		ASSERT3U(db->db_size, >=, PAGESIZE);
1490258745Savg
1491258745Savg		bufoff = offset - db->db_offset;
1492258745Savg		tocpy = (int)MIN(db->db_size - bufoff, size);
1493258745Savg
1494258745Savg		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1495258745Savg
1496258745Savg		if (tocpy == db->db_size)
1497258745Savg			dmu_buf_will_fill(db, tx);
1498258745Savg		else
1499258745Savg			dmu_buf_will_dirty(db, tx);
1500258745Savg
1501258745Savg		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
1502258745Savg			ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
1503258745Savg			thiscpy = MIN(PAGESIZE, tocpy - copied);
1504258745Savg			va = zfs_map_page(*ma, &sf);
1505258745Savg			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
1506258745Savg			zfs_unmap_page(sf);
1507258745Savg			ma += 1;
1508258745Savg			bufoff += PAGESIZE;
1509258745Savg		}
1510258745Savg
1511258745Savg		if (tocpy == db->db_size)
1512258745Savg			dmu_buf_fill_done(db, tx);
1513258745Savg
1514258745Savg		offset += tocpy;
1515258745Savg		size -= tocpy;
1516258745Savg	}
1517258745Savg	dmu_buf_rele_array(dbp, numbufs, FTAG);
1518258745Savg	return (err);
1519258745Savg}
1520277300Ssmh#endif	/* illumos */
1521277300Ssmh#endif	/* _KERNEL */
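
/*
 * Illustrative sketch (not part of this file): a hypothetical kernel
 * caller would wrap dmu_write_pages() in an assigned transaction that
 * holds the written range, with "ma" naming an assumed array of busied
 * pages covering [offset, offset + size):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 *	if (err != 0) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	err = dmu_write_pages(os, object, offset, size, ma, tx);
 *	dmu_tx_commit(tx);
 */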
1522168404Spjd
1523209962Smm/*
1524209962Smm * Allocate a loaned anonymous arc buffer.
1525209962Smm */
1526209962Smmarc_buf_t *
1527209962Smmdmu_request_arcbuf(dmu_buf_t *handle, int size)
1528209962Smm{
1529219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1530209962Smm
1531321535Smav	return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
1532209962Smm}
1533209962Smm
1534209962Smm/*
1535209962Smm * Free a loaned arc buffer.
1536209962Smm */
1537209962Smmvoid
1538209962Smmdmu_return_arcbuf(arc_buf_t *buf)
1539209962Smm{
1540209962Smm	arc_return_buf(buf, FTAG);
1541307265Smav	arc_buf_destroy(buf, FTAG);
1542209962Smm}
1543209962Smm
1544209962Smm/*
1545209962Smm * When possible, directly assign the passed loaned arc buffer to a dbuf.
1546209962Smm * If this is not possible, copy the contents of the passed arc buf via
1547209962Smm * dmu_write().
1548209962Smm */
1549209962Smmvoid
1550209962Smmdmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1551209962Smm    dmu_tx_t *tx)
1552209962Smm{
1553219089Spjd	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
1554219089Spjd	dnode_t *dn;
1555209962Smm	dmu_buf_impl_t *db;
1556321535Smav	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
1557209962Smm	uint64_t blkid;
1558209962Smm
1559219089Spjd	DB_DNODE_ENTER(dbuf);
1560219089Spjd	dn = DB_DNODE(dbuf);
1561209962Smm	rw_enter(&dn->dn_struct_rwlock, RW_READER);
1562286705Smav	blkid = dbuf_whichblock(dn, 0, offset);
1563209962Smm	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1564209962Smm	rw_exit(&dn->dn_struct_rwlock);
1565219089Spjd	DB_DNODE_EXIT(dbuf);
1566209962Smm
1567272601Sdelphij	/*
1568272601Sdelphij	 * We can only assign if the offset is aligned, the arc buf is the
1569321535Smav	 * same size as the dbuf, and the dbuf is not metadata.
1570272601Sdelphij	 */
1571321535Smav	if (offset == db->db.db_offset && blksz == db->db.db_size) {
1572294625Strasz#ifdef _KERNEL
1573294625Strasz		curthread->td_ru.ru_oublock++;
1574297633Strasz#ifdef RACCT
1575297633Strasz		if (racct_enable) {
1576297633Strasz			PROC_LOCK(curproc);
1577297633Strasz			racct_add_force(curproc, RACCT_WRITEBPS, blksz);
1578297633Strasz			racct_add_force(curproc, RACCT_WRITEIOPS, 1);
1579297633Strasz			PROC_UNLOCK(curproc);
1580297633Strasz		}
1581297633Strasz#endif /* RACCT */
1582297633Strasz#endif /* _KERNEL */
1583209962Smm		dbuf_assign_arcbuf(db, buf, tx);
1584209962Smm		dbuf_rele(db, FTAG);
1585209962Smm	} else {
1586219089Spjd		objset_t *os;
1587219089Spjd		uint64_t object;
1588219089Spjd
1589321535Smav		/* compressed bufs must always be assignable to their dbuf */
1590321535Smav		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
1591321535Smav		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
1592321535Smav
1593219089Spjd		DB_DNODE_ENTER(dbuf);
1594219089Spjd		dn = DB_DNODE(dbuf);
1595219089Spjd		os = dn->dn_objset;
1596219089Spjd		object = dn->dn_object;
1597219089Spjd		DB_DNODE_EXIT(dbuf);
1598219089Spjd
1599209962Smm		dbuf_rele(db, FTAG);
1600219089Spjd		dmu_write(os, object, offset, blksz, buf->b_data, tx);
1601209962Smm		dmu_return_arcbuf(buf);
1602219089Spjd		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1603209962Smm	}
1604209962Smm}
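
/*
 * Illustrative sketch of the loaned-buffer write path built from the
 * three functions above (names such as "db", "src", "blksz", "err",
 * and "tx" are assumptions, not part of this file): loan a buffer
 * sized to the block, fill it, and either assign it or, on an error
 * path, return it so it is not leaked.
 *
 *	arc_buf_t *abuf = dmu_request_arcbuf(db, blksz);
 *	bcopy(src, abuf->b_data, blksz);
 *	if (err == 0)
 *		dmu_assign_arcbuf(db, offset, abuf, tx);
 *	else
 *		dmu_return_arcbuf(abuf);
 */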
1605209962Smm
1606168404Spjdtypedef struct {
1607219089Spjd	dbuf_dirty_record_t	*dsa_dr;
1608219089Spjd	dmu_sync_cb_t		*dsa_done;
1609219089Spjd	zgd_t			*dsa_zgd;
1610219089Spjd	dmu_tx_t		*dsa_tx;
1611168404Spjd} dmu_sync_arg_t;
1612168404Spjd
1613168404Spjd/* ARGSUSED */
1614168404Spjdstatic void
1615185029Spjddmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1616185029Spjd{
1617219089Spjd	dmu_sync_arg_t *dsa = varg;
1618219089Spjd	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1619185029Spjd	blkptr_t *bp = zio->io_bp;
1620185029Spjd
1621219089Spjd	if (zio->io_error == 0) {
1622219089Spjd		if (BP_IS_HOLE(bp)) {
1623219089Spjd			/*
1624219089Spjd			 * A block of zeros may compress to a hole, but the
1625219089Spjd			 * block size still needs to be known for replay.
1626219089Spjd			 */
1627219089Spjd			BP_SET_LSIZE(bp, db->db_size);
1628268075Sdelphij		} else if (!BP_IS_EMBEDDED(bp)) {
1629219089Spjd			ASSERT(BP_GET_LEVEL(bp) == 0);
1630219089Spjd			bp->blk_fill = 1;
1631219089Spjd		}
1632185029Spjd	}
1633185029Spjd}
1634185029Spjd
1635219089Spjdstatic void
1636219089Spjddmu_sync_late_arrival_ready(zio_t *zio)
1637219089Spjd{
1638219089Spjd	dmu_sync_ready(zio, NULL, zio->io_private);
1639219089Spjd}
1640219089Spjd
1641185029Spjd/* ARGSUSED */
1642185029Spjdstatic void
1643168404Spjddmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1644168404Spjd{
1645219089Spjd	dmu_sync_arg_t *dsa = varg;
1646219089Spjd	dbuf_dirty_record_t *dr = dsa->dsa_dr;
1647168404Spjd	dmu_buf_impl_t *db = dr->dr_dbuf;
1648168404Spjd
1649168404Spjd	mutex_enter(&db->db_mtx);
1650168404Spjd	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1651219089Spjd	if (zio->io_error == 0) {
1652243524Smm		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
1653243524Smm		if (dr->dt.dl.dr_nopwrite) {
1654243524Smm			blkptr_t *bp = zio->io_bp;
1655243524Smm			blkptr_t *bp_orig = &zio->io_bp_orig;
1656243524Smm			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
1657243524Smm
1658243524Smm			ASSERT(BP_EQUAL(bp, bp_orig));
1659243524Smm			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
1660289422Smav			ASSERT(zio_checksum_table[chksum].ci_flags &
1661289422Smav			    ZCHECKSUM_FLAG_NOPWRITE);
1662243524Smm		}
1663219089Spjd		dr->dt.dl.dr_overridden_by = *zio->io_bp;
1664219089Spjd		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1665219089Spjd		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1666286677Smav
1667286677Smav		/*
1668286677Smav		 * Old style holes are filled with all zeros, whereas
1669286677Smav		 * new-style holes maintain their lsize, type, level,
1670286677Smav		 * and birth time (see zio_write_compress). While we
1671286677Smav		 * need to reset the BP_SET_LSIZE() call that happened
1672286677Smav		 * in dmu_sync_ready for old style holes, we do *not*
1673286677Smav		 * want to wipe out the information contained in new
1674286677Smav		 * style holes. Thus, only zero out the block pointer if
1675286677Smav		 * it's an old style hole.
1676286677Smav		 */
1677286677Smav		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
1678286677Smav		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
1679219089Spjd			BP_ZERO(&dr->dt.dl.dr_overridden_by);
1680219089Spjd	} else {
1681219089Spjd		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1682219089Spjd	}
1683168404Spjd	cv_broadcast(&db->db_changed);
1684168404Spjd	mutex_exit(&db->db_mtx);
1685168404Spjd
1686219089Spjd	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1687168404Spjd
1688219089Spjd	kmem_free(dsa, sizeof (*dsa));
1689168404Spjd}
1690168404Spjd
1691219089Spjdstatic void
1692219089Spjddmu_sync_late_arrival_done(zio_t *zio)
1693219089Spjd{
1694219089Spjd	blkptr_t *bp = zio->io_bp;
1695219089Spjd	dmu_sync_arg_t *dsa = zio->io_private;
1696243524Smm	blkptr_t *bp_orig = &zio->io_bp_orig;
1697219089Spjd
1698219089Spjd	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1699243524Smm		/*
1700243524Smm		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
1701243524Smm		 * then there is nothing to do here. Otherwise, free the
1702243524Smm		 * newly allocated block in this txg.
1703243524Smm		 */
1704243524Smm		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
1705243524Smm			ASSERT(BP_EQUAL(bp, bp_orig));
1706243524Smm		} else {
1707243524Smm			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
1708243524Smm			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1709243524Smm			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1710243524Smm			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1711243524Smm		}
1712219089Spjd	}
1713219089Spjd
1714219089Spjd	dmu_tx_commit(dsa->dsa_tx);
1715219089Spjd
1716219089Spjd	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1717219089Spjd
1718219089Spjd	kmem_free(dsa, sizeof (*dsa));
1719219089Spjd}
1720219089Spjd
1721219089Spjdstatic int
1722219089Spjddmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1723268123Sdelphij    zio_prop_t *zp, zbookmark_phys_t *zb)
1724219089Spjd{
1725219089Spjd	dmu_sync_arg_t *dsa;
1726219089Spjd	dmu_tx_t *tx;
1727219089Spjd
1728219089Spjd	tx = dmu_tx_create(os);
1729219089Spjd	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1730219089Spjd	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1731219089Spjd		dmu_tx_abort(tx);
1732249195Smm		/* Make zl_get_data do txg_wait_synced() */
1733249195Smm		return (SET_ERROR(EIO));
1734219089Spjd	}
1735219089Spjd
1736219089Spjd	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1737219089Spjd	dsa->dsa_dr = NULL;
1738219089Spjd	dsa->dsa_done = done;
1739219089Spjd	dsa->dsa_zgd = zgd;
1740219089Spjd	dsa->dsa_tx = tx;
1741219089Spjd
1742321535Smav	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1743321535Smav	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size,
1744304138Savg	    zp, dmu_sync_late_arrival_ready, NULL,
1745304138Savg	    NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
1746304138Savg	    ZIO_FLAG_CANFAIL, zb));
1747219089Spjd
1748219089Spjd	return (0);
1749219089Spjd}
1750219089Spjd
1751168404Spjd/*
1752168404Spjd * Intent log support: sync the block associated with db to disk.
1753168404Spjd * N.B. and XXX: the caller is responsible for making sure that the
1754168404Spjd * data isn't changing while dmu_sync() is writing it.
1755168404Spjd *
1756168404Spjd * Return values:
1757168404Spjd *
1758243524Smm *	EEXIST: this txg has already been synced, so there's nothing to do.
1759168404Spjd *		The caller should not log the write.
1760168404Spjd *
1761168404Spjd *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1762168404Spjd *		The caller should not log the write.
1763168404Spjd *
1764168404Spjd *	EALREADY: this block is already in the process of being synced.
1765168404Spjd *		The caller should track its progress (somehow).
1766168404Spjd *
1767219089Spjd *	EIO: could not do the I/O.
1768219089Spjd *		The caller should do a txg_wait_synced().
1769168404Spjd *
1770219089Spjd *	0: the I/O has been initiated.
1771219089Spjd *		The caller should log this blkptr in the done callback.
1772219089Spjd *		It is possible that the I/O will fail, in which case
1773219089Spjd *		the error will be reported to the done callback and
1774219089Spjd *		propagated to pio from zio_done().
1775168404Spjd */
1776168404Spjdint
1777219089Spjddmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1778168404Spjd{
1779219089Spjd	blkptr_t *bp = zgd->zgd_bp;
1780219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1781219089Spjd	objset_t *os = db->db_objset;
1782219089Spjd	dsl_dataset_t *ds = os->os_dsl_dataset;
1783168404Spjd	dbuf_dirty_record_t *dr;
1784219089Spjd	dmu_sync_arg_t *dsa;
1785268123Sdelphij	zbookmark_phys_t zb;
1786219089Spjd	zio_prop_t zp;
1787219089Spjd	dnode_t *dn;
1788168404Spjd
1789219089Spjd	ASSERT(pio != NULL);
1790168404Spjd	ASSERT(txg != 0);
1791168404Spjd
1792219089Spjd	SET_BOOKMARK(&zb, ds->ds_object,
1793219089Spjd	    db->db.db_object, db->db_level, db->db_blkid);
1794168404Spjd
1795219089Spjd	DB_DNODE_ENTER(db);
1796219089Spjd	dn = DB_DNODE(db);
1797321535Smav	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC,
1798321535Smav	    ZIO_COMPRESS_INHERIT, &zp);
1799219089Spjd	DB_DNODE_EXIT(db);
1800219089Spjd
1801168404Spjd	/*
1802219089Spjd	 * If we're frozen (running ziltest), we always need to generate a bp.
1803168404Spjd	 */
1804219089Spjd	if (txg > spa_freeze_txg(os->os_spa))
1805219089Spjd		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1806168404Spjd
1807168404Spjd	/*
1808219089Spjd	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1809219089Spjd	 * and us.  If we determine that this txg is not yet syncing,
1810219089Spjd	 * but it begins to sync a moment later, that's OK because the
1811219089Spjd	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1812168404Spjd	 */
1813219089Spjd	mutex_enter(&db->db_mtx);
1814219089Spjd
1815219089Spjd	if (txg <= spa_last_synced_txg(os->os_spa)) {
1816168404Spjd		/*
1817219089Spjd		 * This txg has already synced.  There's nothing to do.
1818168404Spjd		 */
1819219089Spjd		mutex_exit(&db->db_mtx);
1820249195Smm		return (SET_ERROR(EEXIST));
1821168404Spjd	}
1822168404Spjd
1823219089Spjd	if (txg <= spa_syncing_txg(os->os_spa)) {
1824219089Spjd		/*
1825219089Spjd		 * This txg is currently syncing, so we can't mess with
1826219089Spjd		 * the dirty record anymore; just write a new log block.
1827219089Spjd		 */
1828219089Spjd		mutex_exit(&db->db_mtx);
1829219089Spjd		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1830168404Spjd	}
1831168404Spjd
1832168404Spjd	dr = db->db_last_dirty;
1833219089Spjd	while (dr && dr->dr_txg != txg)
1834168404Spjd		dr = dr->dr_next;
1835219089Spjd
1836219089Spjd	if (dr == NULL) {
1837168404Spjd		/*
1838219089Spjd		 * There's no dr for this dbuf, so it must have been freed.
1839168404Spjd		 * There's no need to log writes to freed blocks, so we're done.
1840168404Spjd		 */
1841168404Spjd		mutex_exit(&db->db_mtx);
1842249195Smm		return (SET_ERROR(ENOENT));
1843168404Spjd	}
1844168404Spjd
1845243524Smm	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
1846243524Smm
1847243524Smm	/*
1848286589Smav	 * Assume the on-disk data is X, the current syncing data (in
1849286589Smav	 * txg - 1) is Y, and the current in-memory data is Z (currently
1850286589Smav	 * in dmu_sync).
1851286589Smav	 *
1852286589Smav	 * We usually want to perform a nopwrite if X and Z are the
1853286589Smav	 * same.  However, if Y is different (i.e. the BP is going to
1854286589Smav	 * change before this write takes effect), then a nopwrite will
1855286589Smav	 * be incorrect - we would override with X, which could have
1856286589Smav	 * been freed when Y was written.
1857286589Smav	 *
1858286589Smav	 * (Note that this is not a concern when we are nop-writing from
1859286589Smav	 * syncing context, because X and Y must be identical, because
1860286589Smav	 * all previous txgs have been synced.)
1861286589Smav	 *
1862286589Smav	 * Therefore, we disable nopwrite if the current BP could change
1863286589Smav	 * before this TXG.  There are two ways it could change: by
1864286589Smav	 * being dirty (dr_next is non-NULL), or by being freed
1865286589Smav	 * (dnode_block_freed()).  This behavior is verified by
1866286589Smav	 * zio_done(), which VERIFYs that the override BP is identical
1867286589Smav	 * to the on-disk BP.
1868243524Smm	 */
1869286589Smav	DB_DNODE_ENTER(db);
1870286589Smav	dn = DB_DNODE(db);
1871286589Smav	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
1872243524Smm		zp.zp_nopwrite = B_FALSE;
1873286589Smav	DB_DNODE_EXIT(db);
1874243524Smm
1875168404Spjd	ASSERT(dr->dr_txg == txg);
1876219089Spjd	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1877219089Spjd	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1878168404Spjd		/*
1879219089Spjd		 * We have already issued a sync write for this buffer,
1880219089Spjd		 * or this buffer has already been synced.  It could not
1881219089Spjd		 * have been dirtied since, or we would have cleared the state.
1882168404Spjd		 */
1883168404Spjd		mutex_exit(&db->db_mtx);
1884249195Smm		return (SET_ERROR(EALREADY));
1885168404Spjd	}
1886168404Spjd
1887219089Spjd	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1888168404Spjd	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1889168404Spjd	mutex_exit(&db->db_mtx);
1890168404Spjd
1891219089Spjd	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1892219089Spjd	dsa->dsa_dr = dr;
1893219089Spjd	dsa->dsa_done = done;
1894219089Spjd	dsa->dsa_zgd = zgd;
1895219089Spjd	dsa->dsa_tx = NULL;
1896168404Spjd
1897219089Spjd	zio_nowait(arc_write(pio, os->os_spa, txg,
1898251478Sdelphij	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1899307265Smav	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
1900304138Savg	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
1901185029Spjd
1902219089Spjd	return (0);
1903168404Spjd}
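
/*
 * Illustrative sketch (not from this file) of how an intent-log
 * zl_get_data callback might act on the return values documented
 * above; "zio", "txg", "done_cb", "zgd", and "os" are assumed to be
 * in scope.  On 0 the done callback logs the resulting blkptr; on
 * EIO the caller falls back to waiting for the txg to sync; EEXIST
 * and ENOENT mean there is nothing to log, and EALREADY callers must
 * track the in-flight write themselves.
 *
 *	error = dmu_sync(zio, txg, done_cb, zgd);
 *	if (error == 0)
 *		return (0);
 *	if (error == EIO)
 *		txg_wait_synced(dmu_objset_pool(os), txg);
 *	return (error);
 */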
1904168404Spjd
1905168404Spjdint
1906168404Spjddmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1907289562Smav    dmu_tx_t *tx)
1908168404Spjd{
1909168404Spjd	dnode_t *dn;
1910168404Spjd	int err;
1911168404Spjd
1912219089Spjd	err = dnode_hold(os, object, FTAG, &dn);
1913168404Spjd	if (err)
1914168404Spjd		return (err);
1915168404Spjd	err = dnode_set_blksz(dn, size, ibs, tx);
1916168404Spjd	dnode_rele(dn, FTAG);
1917168404Spjd	return (err);
1918168404Spjd}
1919168404Spjd
1920168404Spjdvoid
1921168404Spjddmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
1922289562Smav    dmu_tx_t *tx)
1923168404Spjd{
1924168404Spjd	dnode_t *dn;
1925168404Spjd
1926268075Sdelphij	/*
1927268075Sdelphij	 * Send streams include each object's checksum function.  This
1928268075Sdelphij	 * check ensures that the receiving system can understand the
1929268075Sdelphij	 * checksum function transmitted.
1930268075Sdelphij	 */
1931268075Sdelphij	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
1932268075Sdelphij
1933268075Sdelphij	VERIFY0(dnode_hold(os, object, FTAG, &dn));
1934268075Sdelphij	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
1935168404Spjd	dn->dn_checksum = checksum;
1936168404Spjd	dnode_setdirty(dn, tx);
1937168404Spjd	dnode_rele(dn, FTAG);
1938168404Spjd}
1939168404Spjd
1940168404Spjdvoid
1941168404Spjddmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1942289562Smav    dmu_tx_t *tx)
1943168404Spjd{
1944168404Spjd	dnode_t *dn;
1945168404Spjd
1946268075Sdelphij	/*
1947268075Sdelphij	 * Send streams include each object's compression function.  This
1948268075Sdelphij	 * check ensures that the receiving system can understand the
1949268075Sdelphij	 * compression function transmitted.
1950268075Sdelphij	 */
1951268075Sdelphij	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
1952268075Sdelphij
1953268075Sdelphij	VERIFY0(dnode_hold(os, object, FTAG, &dn));
1954168404Spjd	dn->dn_compress = compress;
1955168404Spjd	dnode_setdirty(dn, tx);
1956168404Spjd	dnode_rele(dn, FTAG);
1957168404Spjd}
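
/*
 * Illustrative only: both setters above dirty the dnode through the
 * supplied transaction, so the caller needs an assigned tx; e.g.
 * (the tx here is an assumption):
 *
 *	dmu_object_set_checksum(os, object, ZIO_CHECKSUM_FLETCHER_4, tx);
 *	dmu_object_set_compress(os, object, ZIO_COMPRESS_LZ4, tx);
 */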
1958168404Spjd
1959219089Spjdint zfs_mdcomp_disable = 0;
1960267992ShselaskySYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
1961219089Spjd    &zfs_mdcomp_disable, 0, "Disable metadata compression");
1962219089Spjd
1963266771Sdelphij/*
1964266771Sdelphij * When the "redundant_metadata" property is set to "most", only indirect
1965266771Sdelphij * blocks of this level and higher will have an additional ditto block.
1966266771Sdelphij */
1967266771Sdelphijint zfs_redundant_metadata_most_ditto_level = 2;
1968266771Sdelphij
1969219089Spjdvoid
1970321535Smavdmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
1971321535Smav    enum zio_compress override_compress, zio_prop_t *zp)
1972219089Spjd{
1973219089Spjd	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1974236884Smm	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1975219089Spjd	    (wp & WP_SPILL));
1976219089Spjd	enum zio_checksum checksum = os->os_checksum;
1977219089Spjd	enum zio_compress compress = os->os_compress;
1978219089Spjd	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1979243524Smm	boolean_t dedup = B_FALSE;
1980243524Smm	boolean_t nopwrite = B_FALSE;
1981219089Spjd	boolean_t dedup_verify = os->os_dedup_verify;
1982219089Spjd	int copies = os->os_copies;
1983321535Smav	boolean_t lz4_ac = spa_feature_is_active(os->os_spa,
1984321535Smav	    SPA_FEATURE_LZ4_COMPRESS);
1985219089Spjd
1986321535Smav	IMPLY(override_compress == ZIO_COMPRESS_LZ4, lz4_ac);
1987321535Smav
1988219089Spjd	/*
1989243524Smm	 * We maintain different write policies for each of the following
1990243524Smm	 * types of data:
1991243524Smm	 *	 1. metadata
1992243524Smm	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
1993243524Smm	 *	 3. all other level 0 blocks
1994219089Spjd	 */
1995219089Spjd	if (ismd) {
1996268126Sdelphij		if (zfs_mdcomp_disable) {
1997268126Sdelphij			compress = ZIO_COMPRESS_EMPTY;
1998268126Sdelphij		} else {
1999286547Smav			/*
2000286547Smav			 * XXX -- we should design a compression algorithm
2001286547Smav			 * that specializes in arrays of bps.
2002286547Smav			 */
2003286547Smav			compress = zio_compress_select(os->os_spa,
2004286547Smav			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
2005268126Sdelphij		}
2006268126Sdelphij
2007243524Smm		/*
2008219089Spjd		 * Metadata always gets checksummed.  If the data
2009219089Spjd		 * checksum is multi-bit correctable, and it's not a
2010219089Spjd		 * ZBT-style checksum, then it's suitable for metadata
2011219089Spjd		 * as well.  Otherwise, the metadata checksum defaults
2012219089Spjd		 * to fletcher4.
2013219089Spjd		 */
2014289422Smav		if (!(zio_checksum_table[checksum].ci_flags &
2015289422Smav		    ZCHECKSUM_FLAG_METADATA) ||
2016289422Smav		    (zio_checksum_table[checksum].ci_flags &
2017289422Smav		    ZCHECKSUM_FLAG_EMBEDDED))
2018219089Spjd			checksum = ZIO_CHECKSUM_FLETCHER_4;
2019266771Sdelphij
2020266771Sdelphij		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
2021266771Sdelphij		    (os->os_redundant_metadata ==
2022266771Sdelphij		    ZFS_REDUNDANT_METADATA_MOST &&
2023266771Sdelphij		    (level >= zfs_redundant_metadata_most_ditto_level ||
2024266771Sdelphij		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
2025266771Sdelphij			copies++;
2026243524Smm	} else if (wp & WP_NOFILL) {
2027243524Smm		ASSERT(level == 0);
2028219089Spjd
2029219089Spjd		/*
2030243524Smm		 * If we're writing preallocated blocks, we aren't actually
2031243524Smm		 * writing them, so don't set any policy properties.  These
2032243524Smm		 * blocks are currently only used by an external subsystem
2033243524Smm		 * outside of zfs (i.e. dump) and not written by the zio
2034243524Smm		 * pipeline.
2035219089Spjd		 */
2036243524Smm		compress = ZIO_COMPRESS_OFF;
2037255750Sdelphij		checksum = ZIO_CHECKSUM_NOPARITY;
2038219089Spjd	} else {
2039286547Smav		compress = zio_compress_select(os->os_spa, dn->dn_compress,
2040286547Smav		    compress);
2041219089Spjd
2042243524Smm		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
2043243524Smm		    zio_checksum_select(dn->dn_checksum, checksum) :
2044243524Smm		    dedup_checksum;
2045219089Spjd
2046243524Smm		/*
2047243524Smm		 * Determine dedup setting.  If we are in dmu_sync(),
2048243524Smm		 * we won't actually dedup now because that's all
2049243524Smm		 * done in syncing context; but we do want to use the
2050243524Smm		 * dedup checksum.  If the checksum is not strong
2051243524Smm		 * enough to ensure unique signatures, force
2052243524Smm		 * dedup_verify.
2053243524Smm		 */
2054243524Smm		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
2055243524Smm			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
2056289422Smav			if (!(zio_checksum_table[checksum].ci_flags &
2057289422Smav			    ZCHECKSUM_FLAG_DEDUP))
2058243524Smm				dedup_verify = B_TRUE;
2059243524Smm		}
2060219089Spjd
2061243524Smm		/*
2062289422Smav		 * Enable nopwrite if we have a secure enough checksum
2063289422Smav		 * algorithm (see comment in zio_nop_write) and
2064289422Smav		 * compression is enabled.  We don't enable nopwrite if
2065289422Smav		 * dedup is enabled as the two features are mutually
2066289422Smav		 * exclusive.
2067243524Smm		 */
2068289422Smav		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
2069289422Smav		    ZCHECKSUM_FLAG_NOPWRITE) &&
2070243524Smm		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
2071219089Spjd	}
2072219089Spjd
2073219089Spjd	zp->zp_checksum = checksum;
2074321535Smav
2075321535Smav	/*
2076321535Smav	 * If we're writing a pre-compressed buffer, the compression type we use
2077321535Smav	 * must match the data. If it hasn't been compressed yet, then we should
2078321535Smav	 * use the value dictated by the policies above.
2079321535Smav	 */
2080321535Smav	zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT
2081321535Smav	    ? override_compress : compress;
2082321535Smav	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
2083321535Smav
2084219089Spjd	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
2085219089Spjd	zp->zp_level = level;
2086266771Sdelphij	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
2087219089Spjd	zp->zp_dedup = dedup;
2088219089Spjd	zp->zp_dedup_verify = dedup && dedup_verify;
2089243524Smm	zp->zp_nopwrite = nopwrite;
2090219089Spjd}
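
/*
 * Illustrative only, mirroring the call in dmu_sync() above (with
 * "os", "dn", and "db" assumed held): a caller with no pre-compressed
 * data passes ZIO_COMPRESS_INHERIT so the dataset's own properties
 * dictate the compression choice.
 *
 *	zio_prop_t zp;
 *	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC,
 *	    ZIO_COMPRESS_INHERIT, &zp);
 */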
2091219089Spjd
2092168404Spjdint
2093168404Spjddmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
2094168404Spjd{
2095168404Spjd	dnode_t *dn;
2096287103Savg	int err;
2097168404Spjd
2098168404Spjd	/*
2099168404Spjd	 * Sync any current changes before
2100168404Spjd	 * we go trundling through the block pointers.
2101168404Spjd	 */
2102287103Savg	err = dmu_object_wait_synced(os, object);
2103287103Savg	if (err) {
2104287103Savg		return (err);
2105168404Spjd	}
2106287103Savg
2107287103Savg	err = dnode_hold(os, object, FTAG, &dn);
2108287103Savg	if (err) {
2109287103Savg		return (err);
2110168404Spjd	}
2111168404Spjd
2112185029Spjd	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
2113168404Spjd	dnode_rele(dn, FTAG);
2114168404Spjd
2115168404Spjd	return (err);
2116168404Spjd}
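
/*
 * Illustrative only: a minimal SEEK_HOLE-style probe ("start" is an
 * assumed caller-supplied position).  On success *off is advanced to
 * the located hole; dnode_next_offset() reports ESRCH when nothing
 * qualifies at or beyond the starting offset.
 *
 *	uint64_t off = start;
 *	error = dmu_offset_next(os, object, B_TRUE, &off);
 */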
2117168404Spjd
2118287103Savg/*
2119287103Savg * Given a ZFS object, if it contains any dirty dnodes,
2120287103Savg * this function flushes all dirty blocks to disk. This
2121287103Savg * ensures the DMU object info is updated. A more efficient
2122287103Savg * future version might just find the TXG with the maximum
2123287103Savg * ID and wait for that to be synced.
2124287103Savg */
2125287103Savgint
2126289562Smavdmu_object_wait_synced(objset_t *os, uint64_t object)
2127289562Smav{
2128287103Savg	dnode_t *dn;
2129287103Savg	int error, i;
2130287103Savg
2131287103Savg	error = dnode_hold(os, object, FTAG, &dn);
2132287103Savg	if (error) {
2133287103Savg		return (error);
2134287103Savg	}
2135287103Savg
2136287103Savg	for (i = 0; i < TXG_SIZE; i++) {
2137287103Savg		if (list_link_active(&dn->dn_dirty_link[i])) {
2138287103Savg			break;
2139287103Savg		}
2140287103Savg	}
2141287103Savg	dnode_rele(dn, FTAG);
2142287103Savg	if (i != TXG_SIZE) {
2143287103Savg		txg_wait_synced(dmu_objset_pool(os), 0);
2144287103Savg	}
2145287103Savg
2146287103Savg	return (0);
2147287103Savg}
2148287103Savg
2149168404Spjdvoid
2150168404Spjddmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
2151168404Spjd{
2152219089Spjd	dnode_phys_t *dnp;
2153219089Spjd
2154168404Spjd	rw_enter(&dn->dn_struct_rwlock, RW_READER);
2155168404Spjd	mutex_enter(&dn->dn_mtx);
2156168404Spjd
2157219089Spjd	dnp = dn->dn_phys;
2158219089Spjd
2159168404Spjd	doi->doi_data_block_size = dn->dn_datablksz;
2160168404Spjd	doi->doi_metadata_block_size = dn->dn_indblkshift ?
2161168404Spjd	    1ULL << dn->dn_indblkshift : 0;
2162219089Spjd	doi->doi_type = dn->dn_type;
2163219089Spjd	doi->doi_bonus_type = dn->dn_bonustype;
2164219089Spjd	doi->doi_bonus_size = dn->dn_bonuslen;
2165168404Spjd	doi->doi_indirection = dn->dn_nlevels;
2166168404Spjd	doi->doi_checksum = dn->dn_checksum;
2167168404Spjd	doi->doi_compress = dn->dn_compress;
2168272810Sdelphij	doi->doi_nblkptr = dn->dn_nblkptr;
2169219089Spjd	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
2170247852Smm	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
2171219089Spjd	doi->doi_fill_count = 0;
2172219089Spjd	for (int i = 0; i < dnp->dn_nblkptr; i++)
2173268075Sdelphij		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
2174168404Spjd
2175168404Spjd	mutex_exit(&dn->dn_mtx);
2176168404Spjd	rw_exit(&dn->dn_struct_rwlock);
2177168404Spjd}
2178168404Spjd
2179168404Spjd/*
2180168404Spjd * Get information on a DMU object.
2181168404Spjd * If doi is NULL, just indicates whether the object exists.
2182168404Spjd */
2183168404Spjdint
2184168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
2185168404Spjd{
2186168404Spjd	dnode_t *dn;
2187219089Spjd	int err = dnode_hold(os, object, FTAG, &dn);
2188168404Spjd
2189168404Spjd	if (err)
2190168404Spjd		return (err);
2191168404Spjd
2192168404Spjd	if (doi != NULL)
2193168404Spjd		dmu_object_info_from_dnode(dn, doi);
2194168404Spjd
2195168404Spjd	dnode_rele(dn, FTAG);
2196168404Spjd	return (0);
2197168404Spjd}
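
/*
 * Illustrative only: with a NULL doi this doubles as a cheap
 * existence test.
 *
 *	boolean_t exists = (dmu_object_info(os, object, NULL) == 0);
 */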
2198168404Spjd
2199168404Spjd/*
2200168404Spjd * As above, but faster; can be used when you have a held dbuf in hand.
2201168404Spjd */
2202168404Spjdvoid
2203219089Spjddmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
2204168404Spjd{
2205219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2206219089Spjd
2207219089Spjd	DB_DNODE_ENTER(db);
2208219089Spjd	dmu_object_info_from_dnode(DB_DNODE(db), doi);
2209219089Spjd	DB_DNODE_EXIT(db);
2210168404Spjd}
2211168404Spjd
2212168404Spjd/*
2213168404Spjd * Faster still when you only care about the size.
2214168404Spjd * This is specifically optimized for zfs_getattr().
2215168404Spjd */
2216168404Spjdvoid
2217219089Spjddmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
2218219089Spjd    u_longlong_t *nblk512)
2219168404Spjd{
2220219089Spjd	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2221219089Spjd	dnode_t *dn;
2222168404Spjd
2223219089Spjd	DB_DNODE_ENTER(db);
2224219089Spjd	dn = DB_DNODE(db);
2225219089Spjd
2226168404Spjd	*blksize = dn->dn_datablksz;
2227168404Spjd	/* add 1 for dnode space */
2228168404Spjd	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
2229168404Spjd	    SPA_MINBLOCKSHIFT) + 1;
2230219089Spjd	DB_DNODE_EXIT(db);
2231168404Spjd}
2232168404Spjd
2233168404Spjdvoid
2234168404Spjdbyteswap_uint64_array(void *vbuf, size_t size)
2235168404Spjd{
2236168404Spjd	uint64_t *buf = vbuf;
2237168404Spjd	size_t count = size >> 3;
2238168404Spjd	int i;
2239168404Spjd
2240168404Spjd	ASSERT((size & 7) == 0);
2241168404Spjd
2242168404Spjd	for (i = 0; i < count; i++)
2243168404Spjd		buf[i] = BSWAP_64(buf[i]);
2244168404Spjd}
2245168404Spjd
2246168404Spjdvoid
2247168404Spjdbyteswap_uint32_array(void *vbuf, size_t size)
2248168404Spjd{
2249168404Spjd	uint32_t *buf = vbuf;
2250168404Spjd	size_t count = size >> 2;
2251168404Spjd	int i;
2252168404Spjd
2253168404Spjd	ASSERT((size & 3) == 0);
2254168404Spjd
2255168404Spjd	for (i = 0; i < count; i++)
2256168404Spjd		buf[i] = BSWAP_32(buf[i]);
2257168404Spjd}
2258168404Spjd
2259168404Spjdvoid
2260168404Spjdbyteswap_uint16_array(void *vbuf, size_t size)
2261168404Spjd{
2262168404Spjd	uint16_t *buf = vbuf;
2263168404Spjd	size_t count = size >> 1;
2264168404Spjd	int i;
2265168404Spjd
2266168404Spjd	ASSERT((size & 1) == 0);
2267168404Spjd
2268168404Spjd	for (i = 0; i < count; i++)
2269168404Spjd		buf[i] = BSWAP_16(buf[i]);
2270168404Spjd}
2271168404Spjd
2272168404Spjd/* ARGSUSED */
2273168404Spjdvoid
2274168404Spjdbyteswap_uint8_array(void *vbuf, size_t size)
2275168404Spjd{
2276168404Spjd}
2277168404Spjd
2278168404Spjdvoid
2279168404Spjddmu_init(void)
2280168404Spjd{
2281219089Spjd	zfs_dbgmsg_init();
2282219089Spjd	sa_cache_init();
2283219089Spjd	xuio_stat_init();
2284219089Spjd	dmu_objset_init();
2285219089Spjd	dnode_init();
2286208130Smm	zfetch_init();
2287254608Sgibbs	zio_compress_init();
2288239620Smm	l2arc_init();
2289168404Spjd	arc_init();
2290307265Smav	dbuf_init();
2291168404Spjd}
2292168404Spjd
2293168404Spjdvoid
2294168404Spjddmu_fini(void)
2295168404Spjd{
2296251629Sdelphij	arc_fini(); /* arc depends on l2arc, so arc must go first */
2297219089Spjd	l2arc_fini();
2298208130Smm	zfetch_fini();
2299254608Sgibbs	zio_compress_fini();
2300219089Spjd	dbuf_fini();
2301168404Spjd	dnode_fini();
2302219089Spjd	dmu_objset_fini();
2303219089Spjd	xuio_stat_fini();
2304219089Spjd	sa_cache_fini();
2305219089Spjd	zfs_dbgmsg_fini();
2306168404Spjd}