/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 */
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/sa.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/racct.h>
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif

/*
 * Enable/disable nopwrite feature.
 */
int zfs_nopwrite_enabled = 1;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
    &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
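
/*
 * Example (illustrative, not from the original source): CTLFLAG_RDTUN
 * sysctls can only be set as boot-time tunables, e.g. in
 * /boot/loader.conf:
 *
 *	vfs.zfs.nopwrite_enabled="0"
 */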

/*
 * Tunable to control percentage of dirtied blocks from frees in one TXG.
 * After this threshold is crossed, additional dirty blocks from frees
 * wait until the next TXG.
 * A value of zero will disable this throttle.
 */
uint32_t zfs_per_txg_dirty_frees_percent = 30;
SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
    &zfs_per_txg_dirty_frees_percent, 0,
    "Percentage of dirtied blocks from frees in one txg");

/*
 * This can be used for testing, to ensure that certain actions happen
 * while in the middle of a remap (which might otherwise complete too
 * quickly).
 */
int zfs_object_remap_one_indirect_delay_ticks = 0;

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "unallocated"		},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "object directory"		},
	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "object array"		},
	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "packed nvlist"		},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "packed nvlist size"		},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj"			},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj header"		},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map header"	},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map"		},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "ZIL intent log"		},
	{ DMU_BSWAP_DNODE,  TRUE,  FALSE,  "DMU dnode"			},
	{ DMU_BSWAP_OBJSET, TRUE,  TRUE,   "DMU objset"			},
	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL directory"		},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL directory child map"	},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset snap map"	},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL props"			},
	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL dataset"		},
	{ DMU_BSWAP_ZNODE,  TRUE,  FALSE,  "ZFS znode"			},
	{ DMU_BSWAP_OLDACL, TRUE,  FALSE,  "ZFS V0 ACL"			},
	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "ZFS plain file"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS directory"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS master node"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS delete queue"		},
	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "zvol object"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "zvol prop"			},
	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "other uint8[]"		},
	{ DMU_BSWAP_UINT64, FALSE, FALSE,  "other uint64[]"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "other ZAP"			},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "persistent error log"	},
	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "SPA history"		},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA history offsets"	},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "Pool properties"		},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL permissions"		},
	{ DMU_BSWAP_ACL,    TRUE,  FALSE,  "ZFS ACL"			},
	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "ZFS SYSACL"			},
	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "FUID table"			},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "FUID table size"		},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset next clones"	},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan work queue"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group used"	},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group quota"	},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "snapshot refcount tags"	},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT ZAP algorithm"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT statistics"		},
	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "System attributes"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA master node"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr registration"	},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr layouts"		},
	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan translations"		},
	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "deduplicated block"		},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL deadlist map"		},
	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL deadlist map hdr"	},
	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dir clones"		},
	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj subobj"		}
};

const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
	{	byteswap_uint8_array,	"uint8"		},
	{	byteswap_uint16_array,	"uint16"	},
	{	byteswap_uint32_array,	"uint32"	},
	{	byteswap_uint64_array,	"uint64"	},
	{	zap_byteswap,		"zap"		},
	{	dnode_buf_byteswap,	"dnode"		},
	{	dmu_objset_byteswap,	"objset"	},
	{	zfs_znode_byteswap,	"znode"		},
	{	zfs_oldacl_byteswap,	"oldacl"	},
	{	zfs_acl_byteswap,	"acl"		}
};

int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	uint64_t blkid;
	dmu_buf_impl_t *db;

	blkid = dbuf_whichblock(dn, 0, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, tag);
	rw_exit(&dn->dn_struct_rwlock);

	if (db == NULL) {
		*dbp = NULL;
		return (SET_ERROR(EIO));
	}

	*dbp = &db->db;
	return (0);
}

int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	uint64_t blkid;
	dmu_buf_impl_t *db;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	blkid = dbuf_whichblock(dn, 0, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, tag);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);

	if (db == NULL) {
		*dbp = NULL;
		return (SET_ERROR(EIO));
	}

	*dbp = &db->db;
	return (err);
}

int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
	int err;
	int db_flags = DB_RF_CANFAIL;

	if (flags & DMU_READ_NO_PREFETCH)
		db_flags |= DB_RF_NOPREFETCH;

	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
	if (err == 0) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
		err = dbuf_read(db, NULL, db_flags);
		if (err != 0) {
			dbuf_rele(db, tag);
			*dbp = NULL;
		}
	}

	return (err);
}

int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
	int err;
	int db_flags = DB_RF_CANFAIL;

	if (flags & DMU_READ_NO_PREFETCH)
		db_flags |= DB_RF_NOPREFETCH;

	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
	if (err == 0) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
		err = dbuf_read(db, NULL, db_flags);
		if (err != 0) {
			dbuf_rele(db, tag);
			*dbp = NULL;
		}
	}

	return (err);
}
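
/*
 * Example (illustrative sketch; "os", "object" and "offset" are
 * placeholders): callers pair dmu_buf_hold() with dmu_buf_rele() and
 * only touch db_data while the hold is kept:
 *
 *	dmu_buf_t *db;
 *	int err = dmu_buf_hold(os, object, offset, FTAG, &db,
 *	    DMU_READ_PREFETCH);
 *	if (err == 0) {
 *		... read up to db->db_size bytes from db->db_data ...
 *		dmu_buf_rele(db, FTAG);
 *	}
 */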

int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}

int
dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (dn->dn_bonus != db) {
		error = SET_ERROR(EINVAL);
	} else if (newsize < 0 || newsize > db_fake->db_size) {
		error = SET_ERROR(EINVAL);
	} else {
		dnode_setbonuslen(dn, newsize, tx);
		error = 0;
	}

	DB_DNODE_EXIT(db);
	return (error);
}

int
dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (!DMU_OT_IS_VALID(type)) {
		error = SET_ERROR(EINVAL);
	} else if (dn->dn_bonus != db) {
		error = SET_ERROR(EINVAL);
	} else {
		dnode_setbonus_type(dn, type, tx);
		error = 0;
	}

	DB_DNODE_EXIT(db);
	return (error);
}

dmu_object_type_t
dmu_get_bonustype(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	type = dn->dn_bonustype;
	DB_DNODE_EXIT(db);

	return (type);
}

int
dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error)
		return (error);
	dbuf_rm_spill(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dnode_rm_spill(dn, tx);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
	return (error);
}

/*
 * returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	dmu_buf_impl_t *db;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error)
		return (error);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;

	/* as long as the bonus buf is held, the dnode will be held */
	if (refcount_add(&db->db_holds, tag) == 1) {
		VERIFY(dnode_add_ref(dn, db));
		atomic_inc_32(&dn->dn_dbufs_count);
	}

	/*
	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
	 * a dnode hold for every dbuf.
	 */
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));

	*dbp = &db->db;
	return (0);
}
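
/*
 * Example (illustrative sketch; "os" and "object" are placeholders):
 * the bonus buffer is held and released like any other dbuf:
 *
 *	dmu_buf_t *db;
 *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
 *		... inspect db->db_data (db->db_size bytes) ...
 *		dmu_buf_rele(db, FTAG);
 *	}
 */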

/*
 * returns ENOENT, EIO, or 0.
 *
 * This interface will allocate a blank spill dbuf when a spill blk
 * doesn't already exist on the dnode.
 *
 * if you only want to find an already existing spill db, then
 * dmu_spill_hold_existing() should be used.
 */
int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = NULL;
	int err;

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_exit(&dn->dn_struct_rwlock);

	ASSERT(db != NULL);
	err = dbuf_read(db, NULL, flags);
	if (err == 0)
		*dbp = &db->db;
	else
		dbuf_rele(db, tag);
	return (err);
}

int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
		err = SET_ERROR(EINVAL);
	} else {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

		if (!dn->dn_have_spill) {
			err = SET_ERROR(ENOENT);
		} else {
			err = dmu_spill_hold_by_dnode(dn,
			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
		}

		rw_exit(&dn->dn_struct_rwlock);
	}

	DB_DNODE_EXIT(db);
	return (err);
}

int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
	DB_DNODE_EXIT(db);

	return (err);
}

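/*
 * Example (illustrative sketch; "os" and "object" are placeholders):
 * looking up an existing spill block by way of the bonus buffer:
 *
 *	dmu_buf_t *bonus, *spill;
 *	if (dmu_bonus_hold(os, object, FTAG, &bonus) == 0) {
 *		if (dmu_spill_hold_existing(bonus, FTAG, &spill) == 0)
 *			dmu_buf_rele(spill, FTAG);
 *		dmu_buf_rele(bonus, FTAG);
 *	}
 */
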
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t dbuf_flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	/*
	 * Note: We directly notify the prefetch code of this read, so that
	 * we can tell it about the multi-block read.  dbuf_read() only knows
	 * about the one block it is accessing.
	 */
	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
	    DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
	} else {
		if (offset + length > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
			    (longlong_t)offset, (longlong_t)length);
			rw_exit(&dn->dn_struct_rwlock);
			return (SET_ERROR(EIO));
		}
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

#if defined(_KERNEL) && defined(RACCT)
	if (racct_enable && !read) {
		PROC_LOCK(curproc);
		racct_add_force(curproc, RACCT_WRITEBPS, length);
		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
		PROC_UNLOCK(curproc);
	}
#endif

	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	blkid = dbuf_whichblock(dn, 0, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
		if (db == NULL) {
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			zio_nowait(zio);
			return (SET_ERROR(EIO));
		}

		/* initiate async i/o */
		if (read)
			(void) dbuf_read(db, zio, dbuf_flags);
#ifdef _KERNEL
		else
			curthread->td_ru.ru_oublock++;
#endif
		dbp[i] = &db->db;
	}

	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
		    read && DNODE_IS_CACHEABLE(dn));
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}

static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp, DMU_READ_PREFETCH);

	dnode_rele(dn, FTAG);

	return (err);
}

int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
    uint64_t length, boolean_t read, void *tag, int *numbufsp,
    dmu_buf_t ***dbpp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp, DMU_READ_PREFETCH);
	DB_DNODE_EXIT(db);

	return (err);
}

void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
	int i;
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	for (i = 0; i < numbufs; i++) {
		if (dbp[i])
			dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

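/*
 * Example (illustrative sketch; "bonus_db", "offset" and "length" are
 * placeholders): a multi-block read holds the whole range at once and
 * releases it with dmu_buf_rele_array():
 *
 *	dmu_buf_t **dbp;
 *	int numbufs;
 *	int err = dmu_buf_hold_array_by_bonus(bonus_db, offset, length,
 *	    TRUE, FTAG, &numbufs, &dbp);
 *	if (err == 0) {
 *		... copy out of dbp[0 .. numbufs - 1]->db_data ...
 *		dmu_buf_rele_array(dbp, numbufs, FTAG);
 *	}
 */
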
/*
 * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
 * indirect blocks prefetched will be those that point to the blocks containing
 * the data starting at offset, and continuing to offset + len.
 *
 * Note that if the indirect blocks above the blocks being prefetched are not in
 * cache, they will be asynchronously read in.
 */
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
    uint64_t len, zio_priority_t pri)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, err;

	if (len == 0) {  /* they're interested in the bonus buffer */
		dn = DMU_META_DNODE(os);

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, level,
		    object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, level, blkid, pri, 0);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	/*
	 * offset + len - 1 is the last byte we want to prefetch for, and offset
	 * is the first.  Then dbuf_whichblock(dn, level, off + len - 1) is the
	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
	 * offset) is the first.  Then the number we need to prefetch is the
	 * last - first + 1.
	 */
	if (level > 0 || dn->dn_datablkshift != 0) {
		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
		    dbuf_whichblock(dn, level, offset) + 1;
	} else {
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, level, offset);
		for (int i = 0; i < nblks; i++)
			dbuf_prefetch(dn, level, blkid + i, pri, 0);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}

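/*
 * Example (illustrative; "os" and "object" are placeholders): prefetch
 * the first megabyte of an object's level-0 data blocks ahead of a
 * sequential read:
 *
 *	dmu_prefetch(os, object, 0, 0, 1 << 20, ZIO_PRIORITY_SYNC_READ);
 */
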
/*
 * Get the next "chunk" of file data to free.  We traverse the file from
 * the end so that the file gets shorter over time (if we crash in the
 * middle, this will leave us in a better state).  We find allocated file
 * data by simply searching the allocated level 1 indirects.
 *
 * On input, *start should be the first offset that does not need to be
 * freed (e.g. "offset + length").  On return, *start will be the first
 * offset that should be freed.
 */
static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{
	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
	/* bytes of data covered by a level-1 indirect block */
	uint64_t iblkrange =
	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);

	ASSERT3U(minimum, <=, *start);

	if (*start - minimum <= iblkrange * maxblks) {
		*start = minimum;
		return (0);
	}
	ASSERT(ISP2(iblkrange));

	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
		int err;

		/*
		 * dnode_next_offset(BACKWARDS) will find an allocated L1
		 * indirect block at or before the input offset.  We must
		 * decrement *start so that it is at the end of the region
		 * to search.
		 */
		(*start)--;
		err = dnode_next_offset(dn,
		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);

		/* if there are no indirect blocks before start, we are done */
		if (err == ESRCH) {
			*start = minimum;
			break;
		} else if (err != 0) {
			return (err);
		}

		/* set start to the beginning of this L1 indirect */
		*start = P2ALIGN(*start, iblkrange);
	}
	if (*start < minimum)
		*start = minimum;
	return (0);
}

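/*
 * Worked example for the iblkrange computation above (assuming the
 * common 128K data block and 128K indirect block sizes, i.e.
 * dn_indblkshift == 17): an L1 indirect holds
 * EPB(17, SPA_BLKPTRSHIFT) = 2^(17 - 7) = 1024 block pointers, so
 * iblkrange = 128K * 1024 = 128M, and each backwards step of the
 * search loop skips over up to 128MB of file data.
 */
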
/*
 * If this objset is of type DMU_OST_ZFS, return true if the vfs's unmounted
 * flag is set; otherwise return false.
 * Used below in dmu_free_long_range_impl() to enable abort when unmounting.
 */
/*ARGSUSED*/
static boolean_t
dmu_objset_zfs_unmounting(objset_t *os)
{
#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS)
		return (zfs_get_vfs_flag_unmounted(os));
#endif
	return (B_FALSE);
}

static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
    uint64_t length)
{
	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
	int err;
	uint64_t dirty_frees_threshold;
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (offset >= object_size)
		return (0);

	if (zfs_per_txg_dirty_frees_percent <= 100)
		dirty_frees_threshold =
		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
	else
		dirty_frees_threshold = zfs_dirty_data_max / 4;

	if (length == DMU_OBJECT_END || offset + length > object_size)
		length = object_size - offset;

	while (length != 0) {
		uint64_t chunk_end, chunk_begin, chunk_len;
		uint64_t long_free_dirty_all_txgs = 0;
		dmu_tx_t *tx;

		if (dmu_objset_zfs_unmounting(dn->dn_objset))
			return (SET_ERROR(EINTR));

		chunk_end = chunk_begin = offset + length;

		/* move chunk_begin backwards to the beginning of this chunk */
		err = get_next_chunk(dn, &chunk_begin, offset);
		if (err)
			return (err);
		ASSERT3U(chunk_begin, >=, offset);
		ASSERT3U(chunk_begin, <=, chunk_end);

		chunk_len = chunk_end - chunk_begin;

		mutex_enter(&dp->dp_lock);
		for (int t = 0; t < TXG_SIZE; t++) {
			long_free_dirty_all_txgs +=
			    dp->dp_long_free_dirty_pertxg[t];
		}
		mutex_exit(&dp->dp_lock);

		/*
		 * To avoid filling up a TXG with just frees, wait for
		 * the next TXG to open before freeing more chunks if
		 * we have reached the threshold of frees.
		 */
		if (dirty_frees_threshold != 0 &&
		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
			txg_wait_open(dp, 0);
			continue;
		}

		tx = dmu_tx_create(os);
		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);

		/*
		 * Mark this transaction as typically resulting in a net
		 * reduction in space used.
		 */
		dmu_tx_mark_netfree(tx);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}

		mutex_enter(&dp->dp_lock);
		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
		    chunk_len;
		mutex_exit(&dp->dp_lock);
		DTRACE_PROBE3(free__long__range,
		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
		    uint64_t, dmu_tx_get_txg(tx));
		dnode_free_range(dn, chunk_begin, chunk_len, tx);
		dmu_tx_commit(tx);

		length -= chunk_len;
	}
	return (0);
}

int
dmu_free_long_range(objset_t *os, uint64_t object,
    uint64_t offset, uint64_t length)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);
	err = dmu_free_long_range_impl(os, dn, offset, length);

	/*
	 * It is important to zero out the maxblkid when freeing the entire
	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
	 * will take the fast path, and (b) dnode_reallocate() can verify
	 * that the entire file has been freed.
	 */
	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
		dn->dn_maxblkid = 0;

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_free_long_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx;
	int err;

	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
	if (err != 0)
		return (err);

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, object);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err == 0) {
		err = dmu_object_free(os, object, tx);
		dmu_tx_commit(tx);
	} else {
		dmu_tx_abort(tx);
	}

	return (err);
}

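/*
 * Example (illustrative; "os" and "object" are placeholders):
 * dmu_free_long_object() frees the data, throttled across TXGs by
 * dmu_free_long_range(), and then destroys the object itself:
 *
 *	int err = dmu_free_long_object(os, object);
 */
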
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	ASSERT(offset < UINT64_MAX);
	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
	dnode_free_range(dn, offset, size, tx);
	dnode_rele(dn, FTAG);
	return (0);
}

static int
dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dmu_buf_t **dbp;
	int numbufs, err = 0;

	/*
	 * Deal with odd block sizes, where there can't be data past the first
	 * block.  If we ever do the tail block optimization, we will need to
	 * handle that here as well.
	 */
	if (dn->dn_maxblkid == 0) {
		int newsz = offset > dn->dn_datablksz ? 0 :
		    MIN(size, dn->dn_datablksz - offset);
		bzero((char *)buf + newsz, size - newsz);
		size = newsz;
	}

	while (size > 0) {
		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
		int i;

		/*
		 * NB: we could do this block-at-a-time, but it's nice
		 * to be reading in parallel.
		 */
		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
		    TRUE, FTAG, &numbufs, &dbp, flags);
		if (err)
			break;

		for (i = 0; i < numbufs; i++) {
			int tocpy;
			int bufoff;
			dmu_buf_t *db = dbp[i];

			ASSERT(size > 0);

			bufoff = offset - db->db_offset;
			tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);

			offset += tocpy;
			size -= tocpy;
			buf = (char *)buf + tocpy;
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);
	}
	return (err);
}

int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	err = dmu_read_impl(dn, offset, size, buf, flags);
	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
    uint32_t flags)
{
	return (dmu_read_impl(dn, offset, size, buf, flags));
}

static void
dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	int i;

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
}

void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
		return;

	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

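/*
 * Example (illustrative sketch; "os", "object" and "tx" are
 * placeholders, and the dmu_tx setup/commit around the write is
 * elided): a simple write/read round trip through the DMU:
 *
 *	char wbuf[512], rbuf[512];
 *	dmu_write(os, object, 0, sizeof (wbuf), wbuf, tx);
 *	...
 *	(void) dmu_read(os, object, 0, sizeof (rbuf), rbuf,
 *	    DMU_READ_PREFETCH);
 */
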
void
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
		return;

	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

static int
dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
    uint64_t last_removal_txg, uint64_t offset)
{
	uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
	int err = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
	ASSERT3P(dbuf, !=, NULL);

	/*
	 * If the block hasn't been written yet, this default will ensure
	 * we don't try to remap it.
	 */
	uint64_t birth = UINT64_MAX;
	ASSERT3U(last_removal_txg, !=, UINT64_MAX);
	if (dbuf->db_blkptr != NULL)
		birth = dbuf->db_blkptr->blk_birth;
	rw_exit(&dn->dn_struct_rwlock);

	/*
	 * If this L1 was already written after the last removal, then we've
	 * already tried to remap it.
	 */
	if (birth <= last_removal_txg &&
	    dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
	    dbuf_can_remap(dbuf)) {
		dmu_tx_t *tx = dmu_tx_create(os);
		dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err == 0) {
			(void) dbuf_dirty(dbuf, tx);
			dmu_tx_commit(tx);
		} else {
			dmu_tx_abort(tx);
		}
	}

	dbuf_rele(dbuf, FTAG);

	delay(zfs_object_remap_one_indirect_delay_ticks);

	return (err);
}

/*
 * Remap all blockpointers in the object, if possible, so that they reference
 * only concrete vdevs.
 *
 * To do this, iterate over the L0 blockpointers and remap any that reference
 * an indirect vdev. Note that we only examine L0 blockpointers; since we
 * cannot guarantee that we can remap all blockpointers anyway (due to split
 * blocks), we do not want to make the code unnecessarily complicated to
 * catch the unlikely case that there is an L1 block on an indirect vdev that
 * contains no indirect blockpointers.
 */
int
dmu_object_remap_indirects(objset_t *os, uint64_t object,
    uint64_t last_removal_txg)
{
	uint64_t offset, l1span;
	int err;
	dnode_t *dn;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0) {
		return (err);
	}

	if (dn->dn_nlevels <= 1) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
		}

		/*
		 * If the dnode has no indirect blocks, we cannot dirty them.
		 * We still want to remap the blkptr(s) in the dnode if
		 * appropriate, so mark it as dirty.
		 */
		if (err == 0 && dnode_needs_remap(dn)) {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_bonus(tx, dn->dn_object);
			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
				dnode_setdirty(dn, tx);
				dmu_tx_commit(tx);
			} else {
				dmu_tx_abort(tx);
			}
		}

		dnode_rele(dn, FTAG);
		return (err);
	}

	offset = 0;
	l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
	    dn->dn_datablkshift);
	/*
	 * Find the next L1 indirect that is not a hole.
	 */
	while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}
		if ((err = dmu_object_remap_one_indirect(os, dn,
		    last_removal_txg, offset)) != 0) {
			break;
		}
		offset += l1span;
	}

	dnode_rele(dn, FTAG);
	return (err);
}

void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	if (size == 0)
		return;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
		dmu_buf_t *db = dbp[i];

		dmu_buf_will_not_fill(db, tx);
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
    int compressed_size, int byteorder, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
	VERIFY0(dmu_buf_hold_noread(os, object, offset,
	    FTAG, &db));

	dmu_buf_write_embedded(db,
	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
	    uncompressed_size, compressed_size, byteorder, tx);

	dmu_buf_rele(db, FTAG);
}

/*
 * DMU support for xuio
 */
kstat_t *xuio_ksp = NULL;

int
dmu_xuio_init(xuio_t *xuio, int nblk)
{
	dmu_xuio_t *priv;
	uio_t *uio = &xuio->xu_uio;

	uio->uio_iovcnt = nblk;
	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);

	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
	priv->cnt = nblk;
	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
	priv->iovp = uio->uio_iov;
	XUIO_XUZC_PRIV(xuio) = priv;

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);

	return (0);
}

void
dmu_xuio_fini(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int nblk = priv->cnt;

	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
	kmem_free(priv, sizeof (dmu_xuio_t));

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
}

/*
 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
 * and increase priv->next by 1.
 */
int
dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
{
	struct iovec *iov;
	uio_t *uio = &xuio->xu_uio;
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int i = priv->next++;

	ASSERT(i < priv->cnt);
	ASSERT(off + n <= arc_buf_lsize(abuf));
	iov = uio->uio_iov + i;
	iov->iov_base = (char *)abuf->b_data + off;
	iov->iov_len = n;
	priv->bufs[i] = abuf;
	return (0);
}

int
dmu_xuio_cnt(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	return (priv->cnt);
}

arc_buf_t *
dmu_xuio_arcbuf(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	return (priv->bufs[i]);
}

void
dmu_xuio_clear(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	priv->bufs[i] = NULL;
}

static void
xuio_stat_init(void)
{
	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (xuio_ksp != NULL) {
		xuio_ksp->ks_data = &xuio_stats;
		kstat_install(xuio_ksp);
	}
}

static void
xuio_stat_fini(void)
{
	if (xuio_ksp != NULL) {
		kstat_delete(xuio_ksp);
		xuio_ksp = NULL;
	}
}

void
xuio_stat_wbuf_copied(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}

void
xuio_stat_wbuf_nocopy(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}

#ifdef _KERNEL
int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;
	xuio_t *xuio = NULL;

	/*
	 * NB: we could do this block-at-a-time, but it's nice
	 * to be reading in parallel.
	 */
	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    TRUE, FTAG, &numbufs, &dbp, 0);
	if (err)
		return (err);

#ifdef UIO_XUIO
	if (uio->uio_extflg == UIO_XUIO)
		xuio = (xuio_t *)uio;
#endif

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		if (xuio) {
			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
			arc_buf_t *dbuf_abuf = dbi->db_buf;
			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
			if (!err) {
				uio->uio_resid -= tocpy;
				uio->uio_loffset += tocpy;
			}

			if (abuf == dbuf_abuf)
				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
			else
				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
		} else {
#ifdef illumos
			err = uiomove((char *)db->db_data + bufoff, tocpy,
			    UIO_READ, uio);
#else
			err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
			    tocpy, uio);
#endif
		}
		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}

/*
 * Read 'size' bytes into the uio buffer.
 * From object zdb->db_object.
 * Starting at offset uio->uio_loffset.
 *
 * If the caller already has a dbuf in the target object
 * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
 * because we don't have to find the dnode_t for the object.
 */
int
dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_read_uio_dnode(dn, uio, size);
	DB_DNODE_EXIT(db);

	return (err);
}

/*
 * Read 'size' bytes into the uio buffer.
 * From the specified object.
 * Starting at offset uio->uio_loffset.
 */
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_read_uio_dnode(dn, uio, size);

	dnode_rele(dn, FTAG);

	return (err);
}

int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;
	int err = 0;
	int i;

	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

#ifdef illumos
		/*
		 * XXX uiomove could block forever (e.g. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
1477168404Spjd		err = uiomove((char *)db->db_data + bufoff, tocpy,
1478168404Spjd		    UIO_WRITE, uio);
1479298105Savg#else
1480298105Savg		err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
1481298105Savg		    uio);
1482298105Savg#endif
1483168404Spjd
1484168404Spjd		if (tocpy == db->db_size)
1485168404Spjd			dmu_buf_fill_done(db, tx);
1486168404Spjd
1487168404Spjd		if (err)
1488168404Spjd			break;
1489168404Spjd
1490168404Spjd		size -= tocpy;
1491168404Spjd	}
1492219089Spjd
1493168404Spjd	dmu_buf_rele_array(dbp, numbufs, FTAG);
1494168404Spjd	return (err);
1495168404Spjd}
1496168404Spjd
/*
 * Write 'size' bytes from the uio buffer to object zdb->db_object,
 * starting at offset uio->uio_loffset.
 *
 * If the caller already has a dbuf in the target object
 * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
 * because we don't have to find the dnode_t for the object.
 */
int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_write_uio_dnode(dn, uio, size, tx);
	DB_DNODE_EXIT(db);

	return (err);
}

/*
 * Write 'size' bytes from the uio buffer to the specified object,
 * starting at offset uio->uio_loffset.
 */
int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_write_uio_dnode(dn, uio, size, tx);

	dnode_rele(dn, FTAG);

	return (err);
}
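
/*
 * Illustrative sketch (not part of this file): the uio write path always
 * runs under an assigned transaction.  All names other than the DMU calls
 * are hypothetical.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *
 *	dmu_tx_hold_write(tx, object, uio->uio_loffset, nbytes);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	error = dmu_write_uio(os, object, uio, nbytes, tx);
 *	dmu_tx_commit(tx);
 */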

#ifdef illumos
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = zfs_map_page(pp, S_READ);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			zfs_unmap_page(pp, va);
			pp = pp->p_next;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

#else	/* !illumos */

int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    vm_page_t *ma, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	struct sf_buf *sf;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = zfs_map_page(*ma, &sf);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			zfs_unmap_page(sf);
			ma += 1;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

int
dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead, int last_size)
{
	struct sf_buf *sf;
	vm_object_t vmobj;
	vm_page_t m;
	dmu_buf_t **dbp;
	dmu_buf_t *db;
	caddr_t va;
	int numbufs, i;
	int bufoff, pgoff, tocpy;
	int mi, di;
	int err;

	ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
	ASSERT(last_size <= PAGE_SIZE);

	err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
	    IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
	if (err != 0)
		return (err);

#ifdef DEBUG
	IMPLY(last_size < PAGE_SIZE, *rahead == 0);
	if (dbp[0]->db_offset != 0 || numbufs > 1) {
		for (i = 0; i < numbufs; i++) {
			ASSERT(ISP2(dbp[i]->db_size));
			ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
			ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
		}
	}
#endif

	vmobj = ma[0]->object;
	zfs_vmobject_wlock(vmobj);

	db = dbp[0];
	for (i = 0; i < *rbehind; i++) {
		m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
		if (m == NULL)
			break;
		if (m->valid != 0) {
			ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
			break;
		}
		ASSERT(m->dirty == 0);
		ASSERT(!pmap_page_is_mapped(m));

		ASSERT(db->db_size > PAGE_SIZE);
		bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
		va = zfs_map_page(m, &sf);
		bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
		zfs_unmap_page(sf);
		m->valid = VM_PAGE_BITS_ALL;
		vm_page_lock(m);
		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
			vm_page_activate(m);
		else
			vm_page_deactivate(m);
		vm_page_unlock(m);
	}
	*rbehind = i;

	bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
	pgoff = 0;
	for (mi = 0, di = 0; mi < count && di < numbufs; ) {
		if (pgoff == 0) {
			m = ma[mi];
			vm_page_assert_xbusied(m);
			ASSERT(m->valid == 0);
			ASSERT(m->dirty == 0);
			ASSERT(!pmap_page_is_mapped(m));
			va = zfs_map_page(m, &sf);
		}
		if (bufoff == 0)
			db = dbp[di];

		ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
		    db->db_offset + bufoff);

		/*
		 * We do not need to clamp the copy size by the file
		 * size as the last block is zero-filled beyond the
		 * end of file anyway.
		 */
		tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
		bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);

		pgoff += tocpy;
		ASSERT(pgoff <= PAGESIZE);
		if (pgoff == PAGESIZE) {
			zfs_unmap_page(sf);
			m->valid = VM_PAGE_BITS_ALL;
			ASSERT(mi < count);
			mi++;
			pgoff = 0;
		}

		bufoff += tocpy;
		ASSERT(bufoff <= db->db_size);
		if (bufoff == db->db_size) {
			ASSERT(di < numbufs);
			di++;
			bufoff = 0;
		}
	}

#ifdef DEBUG
	/*
	 * Three possibilities:
	 * - last requested page ends at a buffer boundary and, thus,
	 *   all pages and buffers have been iterated;
	 * - all requested pages are filled, but the last buffer
	 *   has not been exhausted;
	 *   the read-ahead is possible only in this case;
	 * - all buffers have been read, but the last page has not been
	 *   fully filled;
	 *   this is only possible if the file has only a single buffer
	 *   with a size that is not a multiple of the page size.
	 */
	if (mi == count) {
		ASSERT(di >= numbufs - 1);
		IMPLY(*rahead != 0, di == numbufs - 1);
		IMPLY(*rahead != 0, bufoff != 0);
		ASSERT(pgoff == 0);
	}
	if (di == numbufs) {
		ASSERT(mi >= count - 1);
		ASSERT(*rahead == 0);
		IMPLY(pgoff == 0, mi == count);
		if (pgoff != 0) {
			ASSERT(mi == count - 1);
			ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
		}
	}
#endif
	if (pgoff != 0) {
		bzero(va + pgoff, PAGESIZE - pgoff);
		zfs_unmap_page(sf);
		m->valid = VM_PAGE_BITS_ALL;
	}

	for (i = 0; i < *rahead; i++) {
		m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
		if (m == NULL)
			break;
		if (m->valid != 0) {
			ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
			break;
		}
		ASSERT(m->dirty == 0);
		ASSERT(!pmap_page_is_mapped(m));

		ASSERT(db->db_size > PAGE_SIZE);
		bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
		tocpy = MIN(db->db_size - bufoff, PAGESIZE);
		va = zfs_map_page(m, &sf);
		bcopy((char *)db->db_data + bufoff, va, tocpy);
		if (tocpy < PAGESIZE) {
			ASSERT(i == *rahead - 1);
			ASSERT((db->db_size & PAGE_MASK) != 0);
			bzero(va + tocpy, PAGESIZE - tocpy);
		}
		zfs_unmap_page(sf);
		m->valid = VM_PAGE_BITS_ALL;
		vm_page_lock(m);
		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
			vm_page_activate(m);
		else
			vm_page_deactivate(m);
		vm_page_unlock(m);
	}
	*rahead = i;
	zfs_vmobject_wunlock(vmobj);

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (0);
}
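
/*
 * Illustrative sketch (not part of this file): dmu_read_pages() is shaped
 * for a VOP_GETPAGES()-style caller.  On entry, *rbehind and *rahead hold
 * the maximum numbers of optional read-behind/read-ahead pages; on return
 * they hold how many were actually populated.  Names other than the DMU
 * call are hypothetical.
 *
 *	int rbehind = a_rbehind != NULL ? *a_rbehind : 0;
 *	int rahead = a_rahead != NULL ? *a_rahead : 0;
 *	int last_size = valid_bytes_in_last_page;
 *
 *	error = dmu_read_pages(os, object, ma, count,
 *	    &rbehind, &rahead, last_size);
 *	// on success all 'count' pages are fully valid; rbehind/rahead
 *	// report the extra pages that were opportunistically filled
 */
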
#endif	/* illumos */
#endif	/* _KERNEL */

/*
 * Allocate a loaned anonymous arc buffer.
 */
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;

	return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
}

/*
 * Free a loaned arc buffer.
 */
void
dmu_return_arcbuf(arc_buf_t *buf)
{
	arc_return_buf(buf, FTAG);
	arc_buf_destroy(buf, FTAG);
}

/*
 * When possible, directly assign the passed loaned arc buffer to a dbuf.
 * If this is not possible, copy the contents of the passed arc buf via
 * dmu_write().
 */
void
dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
	uint64_t blkid;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	blkid = dbuf_whichblock(dn, 0, offset);
	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
	rw_exit(&dn->dn_struct_rwlock);

	/*
	 * We can only assign if the offset is aligned, the arc buf is the
	 * same size as the dbuf, and the dbuf is not metadata.
	 */
	if (offset == db->db.db_offset && blksz == db->db.db_size) {
#ifdef _KERNEL
		curthread->td_ru.ru_oublock++;
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(curproc);
			racct_add_force(curproc, RACCT_WRITEBPS, blksz);
			racct_add_force(curproc, RACCT_WRITEIOPS, 1);
			PROC_UNLOCK(curproc);
		}
#endif /* RACCT */
#endif /* _KERNEL */
		dbuf_assign_arcbuf(db, buf, tx);
		dbuf_rele(db, FTAG);
	} else {
		objset_t *os;
		uint64_t object;

		/* compressed bufs must always be assignable to their dbuf */
		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));

		os = dn->dn_objset;
		object = dn->dn_object;

		dbuf_rele(db, FTAG);
		dmu_write(os, object, offset, blksz, buf->b_data, tx);
		dmu_return_arcbuf(buf);
		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
	}
}

void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;

	DB_DNODE_ENTER(dbuf);
	dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx);
	DB_DNODE_EXIT(dbuf);
}
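
/*
 * Illustrative sketch (not part of this file): the loaned-buffer write
 * pattern these functions are designed for.  The caller fills the loaned
 * buffer and either assigns it under an assigned tx or returns it on
 * error; "fill_data()" is hypothetical.
 *
 *	arc_buf_t *abuf = dmu_request_arcbuf(db_handle, blksz);
 *
 *	if ((error = fill_data(abuf->b_data, blksz)) != 0) {
 *		dmu_return_arcbuf(abuf);	// undo the loan
 *		return (error);
 *	}
 *	dmu_assign_arcbuf(db_handle, offset, abuf, tx);
 *	// the buffer now belongs to the DMU; the caller must not
 *	// touch or free it
 */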

typedef struct {
	dbuf_dirty_record_t	*dsa_dr;
	dmu_sync_cb_t		*dsa_done;
	zgd_t			*dsa_zgd;
	dmu_tx_t		*dsa_tx;
} dmu_sync_arg_t;

/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *dsa = varg;
	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
	blkptr_t *bp = zio->io_bp;

	if (zio->io_error == 0) {
		if (BP_IS_HOLE(bp)) {
			/*
			 * A block of zeros may compress to a hole, but the
			 * block size still needs to be known for replay.
			 */
			BP_SET_LSIZE(bp, db->db_size);
		} else if (!BP_IS_EMBEDDED(bp)) {
			ASSERT(BP_GET_LEVEL(bp) == 0);
			bp->blk_fill = 1;
		}
	}
}

static void
dmu_sync_late_arrival_ready(zio_t *zio)
{
	dmu_sync_ready(zio, NULL, zio->io_private);
}

/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *dsa = varg;
	dbuf_dirty_record_t *dr = dsa->dsa_dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	mutex_enter(&db->db_mtx);
	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
	if (zio->io_error == 0) {
		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
		if (dr->dt.dl.dr_nopwrite) {
			blkptr_t *bp = zio->io_bp;
			blkptr_t *bp_orig = &zio->io_bp_orig;
			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);

			ASSERT(BP_EQUAL(bp, bp_orig));
			VERIFY(BP_EQUAL(bp, db->db_blkptr));
			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
			ASSERT(zio_checksum_table[chksum].ci_flags &
			    ZCHECKSUM_FLAG_NOPWRITE);
		}
		dr->dt.dl.dr_overridden_by = *zio->io_bp;
		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;

		/*
		 * Old style holes are filled with all zeros, whereas
		 * new-style holes maintain their lsize, type, level,
		 * and birth time (see zio_write_compress). While we
		 * need to reset the BP_SET_LSIZE() call that happened
		 * in dmu_sync_ready for old style holes, we do *not*
		 * want to wipe out the information contained in new
		 * style holes. Thus, only zero out the block pointer if
		 * it's an old style hole.
		 */
		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
			BP_ZERO(&dr->dt.dl.dr_overridden_by);
	} else {
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	kmem_free(dsa, sizeof (*dsa));
}

static void
dmu_sync_late_arrival_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dmu_sync_arg_t *dsa = zio->io_private;
	blkptr_t *bp_orig = &zio->io_bp_orig;

	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
	}

	dmu_tx_commit(dsa->dsa_tx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	abd_put(zio->io_abd);
	kmem_free(dsa, sizeof (*dsa));
}

static int
dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
    zio_prop_t *zp, zbookmark_phys_t *zb)
{
	dmu_sync_arg_t *dsa;
	dmu_tx_t *tx;

	tx = dmu_tx_create(os);
	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
		dmu_tx_abort(tx);
		/* Make zl_get_data do txg_wait_synced() */
		return (SET_ERROR(EIO));
	}

	/*
	 * In order to prevent the zgd's lwb from being free'd prior to
	 * dmu_sync_late_arrival_done() being called, we have to ensure
	 * the lwb's "max txg" takes this tx's txg into account.
	 */
	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));

	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = NULL;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = tx;

	/*
	 * Since we are currently syncing this txg, it's nontrivial to
	 * determine what BP to nopwrite against, so we disable nopwrite.
	 *
	 * When syncing, the db_blkptr is initially the BP of the previous
	 * txg.  We can not nopwrite against it because it will be changed
	 * (this is similar to the non-late-arrival case where the dbuf is
	 * dirty in a future txg).
	 *
	 * Then dbuf_write_ready() sets db_blkptr to the location we will write.
	 * We can not nopwrite against it because although the BP will not
	 * (typically) be changed, the data has not yet been persisted to this
	 * location.
	 *
	 * Finally, when dbuf_write_done() is called, it is theoretically
	 * possible to always nopwrite, because the data that was written in
	 * this txg is the same data that we are trying to write.  However we
	 * would need to check that this dbuf is not dirty in any future
	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
	 * don't nopwrite in this case.
	 */
	zp->zp_nopwrite = B_FALSE;

	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
	    dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));

	return (0);
}

/*
 * Intent log support: sync the block associated with db to disk.
 * N.B. and XXX: the caller is responsible for making sure that the
 * data isn't changing while dmu_sync() is writing it.
 *
 * Return values:
 *
 *	EEXIST: this txg has already been synced, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	EALREADY: this block is already in the process of being synced.
 *		The caller should track its progress (somehow).
 *
 *	EIO: could not do the I/O.
 *		The caller should do a txg_wait_synced().
 *
 *	0: the I/O has been initiated.
 *		The caller should log this blkptr in the done callback.
 *		It is possible that the I/O will fail, in which case
 *		the error will be reported to the done callback and
 *		propagated to pio from zio_done().
 */
int
dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
	objset_t *os = db->db_objset;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	dbuf_dirty_record_t *dr;
	dmu_sync_arg_t *dsa;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	dnode_t *dn;

	ASSERT(pio != NULL);
	ASSERT(txg != 0);

	SET_BOOKMARK(&zb, ds->ds_object,
	    db->db.db_object, db->db_level, db->db_blkid);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * If we're frozen (running ziltest), we always need to generate a bp.
	 */
	if (txg > spa_freeze_txg(os->os_spa))
		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));

	/*
	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
	 * and us.  If we determine that this txg is not yet syncing,
	 * but it begins to sync a moment later, that's OK because the
	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
	 */
	mutex_enter(&db->db_mtx);

	if (txg <= spa_last_synced_txg(os->os_spa)) {
		/*
		 * This txg has already synced.  There's nothing to do.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(EEXIST));
	}

	if (txg <= spa_syncing_txg(os->os_spa)) {
		/*
		 * This txg is currently syncing, so we can't mess with
		 * the dirty record anymore; just write a new log block.
		 */
		mutex_exit(&db->db_mtx);
		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
	}

	dr = db->db_last_dirty;
	while (dr && dr->dr_txg != txg)
		dr = dr->dr_next;

	if (dr == NULL) {
		/*
		 * There's no dr for this dbuf, so it must have been freed.
		 * There's no need to log writes to freed blocks, so we're done.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(ENOENT));
	}

	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);

	if (db->db_blkptr != NULL) {
		/*
		 * We need to fill in zgd_bp with the current blkptr so that
		 * the nopwrite code can check if we're writing the same
		 * data that's already on disk.  We can only nopwrite if we
		 * are sure that after making the copy, db_blkptr will not
		 * change until our i/o completes.  We ensure this by
		 * holding the db_mtx, and only allowing nopwrite if the
		 * block is not already dirty (see below).  This is verified
		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
		 * not changed.
		 */
		*zgd->zgd_bp = *db->db_blkptr;
	}

	/*
	 * Assume the on-disk data is X, the current syncing data (in
	 * txg - 1) is Y, and the current in-memory data is Z (currently
	 * in dmu_sync).
	 *
	 * We usually want to perform a nopwrite if X and Z are the
	 * same.  However, if Y is different (i.e. the BP is going to
	 * change before this write takes effect), then a nopwrite will
	 * be incorrect - we would override with X, which could have
	 * been freed when Y was written.
	 *
	 * (Note that this is not a concern when we are nop-writing from
	 * syncing context, because X and Y must be identical, because
	 * all previous txgs have been synced.)
	 *
	 * Therefore, we disable nopwrite if the current BP could change
	 * before this TXG.  There are two ways it could change: by
	 * being dirty (dr_next is non-NULL), or by being freed
	 * (dnode_block_freed()).  This behavior is verified by
	 * zio_done(), which VERIFYs that the override BP is identical
	 * to the on-disk BP.
	 */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
		zp.zp_nopwrite = B_FALSE;
	DB_DNODE_EXIT(db);

	ASSERT(dr->dr_txg == txg);
	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * We have already issued a sync write for this buffer,
		 * or this buffer has already been synced.  It could not
		 * have been dirtied since, or we would have cleared the state.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(EALREADY));
	}

	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
	mutex_exit(&db->db_mtx);

	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = dr;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = NULL;

	zio_nowait(arc_write(pio, os->os_spa, txg,
	    zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));

	return (0);
}
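
/*
 * Illustrative sketch (not part of this file): a dmu_sync() caller in the
 * intent-log write path, mapping the return values documented above to
 * actions.  The done callback matches the dmu_sync_cb_t signature used by
 * dsa_done above; everything else is hypothetical.
 *
 *	static void
 *	example_sync_done(zgd_t *zgd, int error)
 *	{
 *		// error == 0: zgd->zgd_bp was filled in and can be logged;
 *		// otherwise release the zgd buffers and let the caller
 *		// fall back to txg_wait_synced()
 *	}
 *
 *	error = dmu_sync(pio, txg, example_sync_done, zgd);
 *	if (error == EEXIST || error == ENOENT)
 *		error = 0;	// nothing to log
 */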

int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	err = dnode_set_blksz(dn, size, ibs, tx);
	dnode_rele(dn, FTAG);
	return (err);
}

void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/*
	 * Send streams include each object's checksum function.  This
	 * check ensures that the receiving system can understand the
	 * checksum function transmitted.
	 */
	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dn));
	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
	dn->dn_checksum = checksum;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/*
	 * Send streams include each object's compression function.  This
	 * check ensures that the receiving system can understand the
	 * compression function transmitted.
	 */
	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dn));
	dn->dn_compress = compress;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

int zfs_mdcomp_disable = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
    &zfs_mdcomp_disable, 0, "Disable metadata compression");

/*
 * When the "redundant_metadata" property is set to "most", only indirect
 * blocks of this level and higher will have an additional ditto block.
 */
int zfs_redundant_metadata_most_ditto_level = 2;

void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
	    (wp & WP_SPILL));
	enum zio_checksum checksum = os->os_checksum;
	enum zio_compress compress = os->os_compress;
	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
	boolean_t dedup = B_FALSE;
	boolean_t nopwrite = B_FALSE;
	boolean_t dedup_verify = os->os_dedup_verify;
	int copies = os->os_copies;

	/*
	 * We maintain different write policies for each of the following
	 * types of data:
	 *	 1. metadata
	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
	 *	 3. all other level 0 blocks
	 */
	if (ismd) {
		if (zfs_mdcomp_disable) {
			compress = ZIO_COMPRESS_EMPTY;
		} else {
			/*
			 * XXX -- we should design a compression algorithm
			 * that specializes in arrays of bps.
			 */
			compress = zio_compress_select(os->os_spa,
			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
		}

		/*
		 * Metadata always gets checksummed.  If the data
		 * checksum is multi-bit correctable, and it's not a
		 * ZBT-style checksum, then it's suitable for metadata
		 * as well.  Otherwise, the metadata checksum defaults
		 * to fletcher4.
		 */
		if (!(zio_checksum_table[checksum].ci_flags &
		    ZCHECKSUM_FLAG_METADATA) ||
		    (zio_checksum_table[checksum].ci_flags &
		    ZCHECKSUM_FLAG_EMBEDDED))
			checksum = ZIO_CHECKSUM_FLETCHER_4;

		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
		    (os->os_redundant_metadata ==
		    ZFS_REDUNDANT_METADATA_MOST &&
		    (level >= zfs_redundant_metadata_most_ditto_level ||
		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
			copies++;
	} else if (wp & WP_NOFILL) {
		ASSERT(level == 0);

		/*
		 * If we're writing preallocated blocks, we aren't actually
		 * writing them, so don't set any policy properties.  These
		 * blocks are currently only used by an external subsystem
		 * outside of zfs (i.e. dump) and not written by the zio
		 * pipeline.
		 */
		compress = ZIO_COMPRESS_OFF;
		checksum = ZIO_CHECKSUM_NOPARITY;
	} else {
		compress = zio_compress_select(os->os_spa, dn->dn_compress,
		    compress);

		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
		    zio_checksum_select(dn->dn_checksum, checksum) :
		    dedup_checksum;

		/*
		 * Determine dedup setting.  If we are in dmu_sync(),
		 * we won't actually dedup now because that's all
		 * done in syncing context; but we do want to use the
		 * dedup checksum.  If the checksum is not strong
		 * enough to ensure unique signatures, force
		 * dedup_verify.
		 */
		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
			if (!(zio_checksum_table[checksum].ci_flags &
			    ZCHECKSUM_FLAG_DEDUP))
				dedup_verify = B_TRUE;
		}

		/*
		 * Enable nopwrite if we have a secure enough checksum
		 * algorithm (see comment in zio_nop_write) and
		 * compression is enabled.  We don't enable nopwrite if
		 * dedup is enabled as the two features are mutually
		 * exclusive.
		 */
		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
		    ZCHECKSUM_FLAG_NOPWRITE) &&
		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
	}

	zp->zp_checksum = checksum;
	zp->zp_compress = compress;
	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);

	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
	zp->zp_level = level;
	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
	zp->zp_dedup = dedup;
	zp->zp_dedup_verify = dedup && dedup_verify;
	zp->zp_nopwrite = nopwrite;
}
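
/*
 * Illustrative sketch (not part of this file): how a write policy is
 * obtained for a block, mirroring the dmu_sync() path above; wp_flag
 * would be e.g. WP_DMU_SYNC, or 0 for an ordinary dbuf write.
 *
 *	zio_prop_t zp;
 *
 *	DB_DNODE_ENTER(db);
 *	dmu_write_policy(os, DB_DNODE(db), db->db_level, wp_flag, &zp);
 *	DB_DNODE_EXIT(db);
 *	// zp now carries the checksum, compression, copies, dedup, and
 *	// nopwrite settings to hand to zio_write() or arc_write()
 */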

int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int err;

	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	err = dmu_object_wait_synced(os, object);
	if (err) {
		return (err);
	}

	err = dnode_hold(os, object, FTAG, &dn);
	if (err) {
		return (err);
	}

	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
	dnode_rele(dn, FTAG);

	return (err);
}
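
/*
 * Illustrative sketch (not part of this file): dmu_offset_next() is the
 * engine behind SEEK_HOLE/SEEK_DATA-style lookups.  dnode_next_offset()
 * returns ESRCH when no further hole or data is found; file_size and
 * noff are hypothetical names.
 *
 *	uint64_t noff = *off;
 *
 *	error = dmu_offset_next(os, object, hole, &noff);
 *	if (error == ESRCH) {
 *		if (hole) {
 *			*off = file_size;	// implicit hole at EOF
 *			return (0);
 *		}
 *		return (SET_ERROR(ENXIO));	// no more data
 *	}
 *	if (error == 0)
 *		*off = noff;
 */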

/*
 * Given a ZFS object, if it contains any dirty dnodes,
 * this function flushes all dirty blocks to disk.  This
 * ensures the DMU object info is updated.  A more efficient
 * future version might just find the TXG with the maximum
 * ID and wait for that to be synced.
 */
int
dmu_object_wait_synced(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	int error, i;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error) {
		return (error);
	}

	for (i = 0; i < TXG_SIZE; i++) {
		if (list_link_active(&dn->dn_dirty_link[i])) {
			break;
		}
	}
	dnode_rele(dn, FTAG);
	if (i != TXG_SIZE) {
		txg_wait_synced(dmu_objset_pool(os), 0);
	}

	return (0);
}

void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
	dnode_phys_t *dnp;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	mutex_enter(&dn->dn_mtx);

	dnp = dn->dn_phys;

	doi->doi_data_block_size = dn->dn_datablksz;
	doi->doi_metadata_block_size = dn->dn_indblkshift ?
	    1ULL << dn->dn_indblkshift : 0;
	doi->doi_type = dn->dn_type;
	doi->doi_bonus_type = dn->dn_bonustype;
	doi->doi_bonus_size = dn->dn_bonuslen;
	doi->doi_indirection = dn->dn_nlevels;
	doi->doi_checksum = dn->dn_checksum;
	doi->doi_compress = dn->dn_compress;
	doi->doi_nblkptr = dn->dn_nblkptr;
	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
	doi->doi_fill_count = 0;
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);

	mutex_exit(&dn->dn_mtx);
	rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Get information on a DMU object.
 * If doi is NULL, this just indicates whether the object exists.
 */
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
	dnode_t *dn;
	int err = dnode_hold(os, object, FTAG, &dn);

	if (err)
		return (err);

	if (doi != NULL)
		dmu_object_info_from_dnode(dn, doi);

	dnode_rele(dn, FTAG);
	return (0);
}
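
/*
 * Illustrative sketch (not part of this file): using a NULL doi as a
 * cheap existence check, per the comment above.
 *
 *	error = dmu_object_info(os, object, NULL);
 *	if (error == ENOENT)
 *		...	// no such object allocated
 */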

/*
 * As above, but faster; can be used when you have a held dbuf in hand.
 */
void
dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	DB_DNODE_ENTER(db);
	dmu_object_info_from_dnode(DB_DNODE(db), doi);
	DB_DNODE_EXIT(db);
}

/*
 * Faster still when you only care about the size.
 * This is specifically optimized for zfs_getattr().
 */
void
dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
    u_longlong_t *nblk512)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	*blksize = dn->dn_datablksz;
	/* add 1 for dnode space */
	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
	    SPA_MINBLOCKSHIFT) + 1;
	DB_DNODE_EXIT(db);
}

void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	int i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}

void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	int i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}

/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

void
dmu_init(void)
{
	abd_init();
	zfs_dbgmsg_init();
	sa_cache_init();
	xuio_stat_init();
	dmu_objset_init();
	dnode_init();
	zfetch_init();
	zio_compress_init();
	l2arc_init();
	arc_init();
	dbuf_init();
}

void
dmu_fini(void)
{
	arc_fini(); /* arc depends on l2arc, so arc must go first */
	l2arc_fini();
	zfetch_fini();
	zio_compress_fini();
	dbuf_fini();
	dnode_fini();
	dmu_objset_fini();
	xuio_stat_fini();
	sa_cache_fini();
	zfs_dbgmsg_fini();
	abd_fini();
}