/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */
26168404Spjd
27168404Spjd#ifndef	_SYS_DNODE_H
28168404Spjd#define	_SYS_DNODE_H
29168404Spjd
30168404Spjd#include <sys/zfs_context.h>
31168404Spjd#include <sys/avl.h>
32168404Spjd#include <sys/spa.h>
33168404Spjd#include <sys/txg.h>
34168404Spjd#include <sys/zio.h>
35168404Spjd#include <sys/refcount.h>
36168404Spjd#include <sys/dmu_zfetch.h>
37219089Spjd#include <sys/zrlock.h>
38321553Smav#include <sys/multilist.h>
39168404Spjd
40168404Spjd#ifdef	__cplusplus
41168404Spjdextern "C" {
42168404Spjd#endif
43168404Spjd
/*
 * dnode_hold() flags.
 *
 * NOTE(review): presumably these are the values accepted by the `flag`
 * argument of dnode_hold_impl() -- confirm against the implementation.
 */
#define	DNODE_MUST_BE_ALLOCATED	1
#define	DNODE_MUST_BE_FREE	2
49168404Spjd
/*
 * dnode_next_offset() flags.
 *
 * Each flag occupies its own bit, so they can be OR-ed together into the
 * `flags` argument of dnode_next_offset().
 */
#define	DNODE_FIND_HOLE		1
#define	DNODE_FIND_BACKWARDS	2
#define	DNODE_FIND_HAVELOCK	4
56185029Spjd
/*
 * Fixed constants.
 */
#define	DNODE_SHIFT		9	/* 512 bytes */
#define	DN_MIN_INDBLKSHIFT	12	/* 4k */
/*
 * If we ever increase this value beyond 20, we need to revisit all logic that
 * does x << level * ebps to handle overflow.  With a 1M indirect block size,
 * 4 levels of indirect blocks would not be able to guarantee addressing an
 * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65.
 */
#define	DN_MAX_INDBLKSHIFT	17	/* 128k */
#define	DNODE_BLOCK_SHIFT	14	/* 16k */
#define	DNODE_CORE_SIZE		64	/* 64 bytes for dnode sans blkptrs */
#define	DN_MAX_OBJECT_SHIFT	48	/* 256 trillion (zfs_fid_t limit) */
#define	DN_MAX_OFFSET_SHIFT	64	/* 2^64 bytes in a dnode */

/*
 * dnode id flags
 *
 * Note: a file will never ever have its
 * ids moved from bonus->spill
 * and only in a crypto environment would it be on spill
 */
#define	DN_ID_CHKED_BONUS	0x1
#define	DN_ID_CHKED_SPILL	0x2
#define	DN_ID_OLD_EXIST		0x4
#define	DN_ID_NEW_EXIST		0x8

/*
 * Derived constants.
 *
 * DN_MAX_NBLKPTR and DN_MAX_BONUSLEN are both computed from the space left
 * in a DNODE_SIZE dnode after the DNODE_CORE_SIZE header; DN_MAX_BONUSLEN
 * additionally leaves room for one block pointer (1 << SPA_BLKPTRSHIFT).
 * DN_ZERO_BONUSLEN is one past the largest valid bonus length, so it cannot
 * collide with a real length.
 */
#define	DNODE_SIZE	(1 << DNODE_SHIFT)
#define	DN_MAX_NBLKPTR	((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
#define	DN_MAX_BONUSLEN	(DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define	DN_MAX_OBJECT	(1ULL << DN_MAX_OBJECT_SHIFT)
#define	DN_ZERO_BONUSLEN	(DN_MAX_BONUSLEN + 1)
#define	DN_KILL_SPILLBLK	(1)

#define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
98307112Smav
/*
 * Number of block pointers that fit in one maximum-sized indirect block.
 *
 * This is inaccurate if the indblkshift of the particular object is not the
 * max.  But it's only used by userland to calculate the zvol reservation.
 */
#define	DNODES_PER_LEVEL_SHIFT	(DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
#define	DNODES_PER_LEVEL	(1ULL << DNODES_PER_LEVEL_SHIFT)
105168404Spjd
/*
 * Upper bound on the number of block levels of a dnode: the offset bits
 * above the minimum block size, divided by the bits resolved per level with
 * the smallest indirect block.  The +2 here is a cheesy way to round up.
 */
#define	DN_MAX_LEVELS	(2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
	(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
109168404Spjd
/*
 * Address of the bonus area of the dnode_phys_t pointed to by dnp.
 *
 * dnode_phys_t declares a single dn_blkptr entry followed by dn_bonus, so
 * every block pointer beyond the first moves the start of the bonus data
 * forward by sizeof (blkptr_t).  Note that dnp is evaluated twice.
 */
#define	DN_BONUS(dnp)	((void*)((dnp)->dn_bonus + \
	(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
112168404Spjd
/*
 * Space used by the dnode_phys_t pointed to by dnp, in bytes.
 *
 * dn_used is already in bytes when DNODE_FLAG_USED_BYTES is set in dn_flags;
 * otherwise it counts SPA_MINBLOCKSIZE units and is scaled up here.
 * Note that dnp is evaluated more than once.
 */
#define	DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
	(dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
115168404Spjd
/*
 * Entries per block: how many items of size (1 << typeshift) fit in a block
 * of size (1 << blkshift).  Both arguments are parenthesized so that
 * expression arguments such as EPB(shift, base + 1) expand correctly.
 */
#define	EPB(blkshift, typeshift)	(1 << ((blkshift) - (typeshift)))
117168404Spjd
/*
 * Forward declarations: this header only uses pointers to these types, so
 * their full definitions are not required here.
 */
struct dmu_buf_impl;
struct objset;
struct zio;
121168404Spjd
/*
 * Value type of dnode's dn_dirtyctx field.  DN_UNDIRTIED is the zero value.
 */
enum dnode_dirtycontext {
	DN_UNDIRTIED,
	DN_DIRTY_OPEN,
	DN_DIRTY_SYNC
};
127168404Spjd
/*
 * Bits of dnode_phys_t's dn_flags field.
 */

/* Is dn_used in bytes?  If not, it's in multiples of SPA_MINBLOCKSIZE */
#define	DNODE_FLAG_USED_BYTES		(1<<0)
#define	DNODE_FLAG_USERUSED_ACCOUNTED	(1<<1)

/* Does dnode have a SA spill blkptr in bonus? */
#define	DNODE_FLAG_SPILL_BLKPTR	(1<<2)
134219089Spjd
135168404Spjdtypedef struct dnode_phys {
136168404Spjd	uint8_t dn_type;		/* dmu_object_type_t */
137168404Spjd	uint8_t dn_indblkshift;		/* ln2(indirect block size) */
138168404Spjd	uint8_t dn_nlevels;		/* 1=dn_blkptr->data blocks */
139168404Spjd	uint8_t dn_nblkptr;		/* length of dn_blkptr */
140168404Spjd	uint8_t dn_bonustype;		/* type of data in bonus buffer */
141168404Spjd	uint8_t	dn_checksum;		/* ZIO_CHECKSUM type */
142168404Spjd	uint8_t	dn_compress;		/* ZIO_COMPRESS type */
143168404Spjd	uint8_t dn_flags;		/* DNODE_FLAG_* */
144168404Spjd	uint16_t dn_datablkszsec;	/* data block size in 512b sectors */
145168404Spjd	uint16_t dn_bonuslen;		/* length of dn_bonus */
146168404Spjd	uint8_t dn_pad2[4];
147168404Spjd
148168404Spjd	/* accounting is protected by dn_dirty_mtx */
149168404Spjd	uint64_t dn_maxblkid;		/* largest allocated block ID */
150168404Spjd	uint64_t dn_used;		/* bytes (or sectors) of disk space */
151168404Spjd
152168404Spjd	uint64_t dn_pad3[4];
153168404Spjd
154168404Spjd	blkptr_t dn_blkptr[1];
155219089Spjd	uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
156219089Spjd	blkptr_t dn_spill;
157168404Spjd} dnode_phys_t;
158168404Spjd
159307290Smavstruct dnode {
160168404Spjd	/*
161251631Sdelphij	 * Protects the structure of the dnode, including the number of levels
162251631Sdelphij	 * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
163168404Spjd	 */
164168404Spjd	krwlock_t dn_struct_rwlock;
165168404Spjd
166209962Smm	/* Our link on dn_objset->os_dnodes list; protected by os_lock.  */
167168404Spjd	list_node_t dn_link;
168168404Spjd
169168404Spjd	/* immutable: */
170219089Spjd	struct objset *dn_objset;
171168404Spjd	uint64_t dn_object;
172168404Spjd	struct dmu_buf_impl *dn_dbuf;
173219089Spjd	struct dnode_handle *dn_handle;
174168404Spjd	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
175168404Spjd
176168404Spjd	/*
177168404Spjd	 * Copies of stuff in dn_phys.  They're valid in the open
178168404Spjd	 * context (eg. even before the dnode is first synced).
179168404Spjd	 * Where necessary, these are protected by dn_struct_rwlock.
180168404Spjd	 */
181168404Spjd	dmu_object_type_t dn_type;	/* object type */
182168404Spjd	uint16_t dn_bonuslen;		/* bonus length */
183168404Spjd	uint8_t dn_bonustype;		/* bonus type */
184168404Spjd	uint8_t dn_nblkptr;		/* number of blkptrs (immutable) */
185168404Spjd	uint8_t dn_checksum;		/* ZIO_CHECKSUM type */
186168404Spjd	uint8_t dn_compress;		/* ZIO_COMPRESS type */
187168404Spjd	uint8_t dn_nlevels;
188168404Spjd	uint8_t dn_indblkshift;
189168404Spjd	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
190219089Spjd	uint8_t dn_moved;		/* Has this dnode been moved? */
191168404Spjd	uint16_t dn_datablkszsec;	/* in 512b sectors */
192168404Spjd	uint32_t dn_datablksz;		/* in bytes */
193168404Spjd	uint64_t dn_maxblkid;
194259813Sdelphij	uint8_t dn_next_type[TXG_SIZE];
195196703Spjd	uint8_t dn_next_nblkptr[TXG_SIZE];
196168404Spjd	uint8_t dn_next_nlevels[TXG_SIZE];
197168404Spjd	uint8_t dn_next_indblkshift[TXG_SIZE];
198219089Spjd	uint8_t dn_next_bonustype[TXG_SIZE];
199219089Spjd	uint8_t dn_rm_spillblk[TXG_SIZE];	/* for removing spill blk */
200185029Spjd	uint16_t dn_next_bonuslen[TXG_SIZE];
201168404Spjd	uint32_t dn_next_blksz[TXG_SIZE];	/* next block size in bytes */
202168404Spjd
203219089Spjd	/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
204219089Spjd	uint32_t dn_dbufs_count;	/* count of dn_dbufs */
205219089Spjd
206168404Spjd	/* protected by os_lock: */
207321553Smav	multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
208168404Spjd
209168404Spjd	/* protected by dn_mtx: */
210168404Spjd	kmutex_t dn_mtx;
211168404Spjd	list_t dn_dirty_records[TXG_SIZE];
212264669Sdelphij	struct range_tree *dn_free_ranges[TXG_SIZE];
213168404Spjd	uint64_t dn_allocated_txg;
214168404Spjd	uint64_t dn_free_txg;
215168404Spjd	uint64_t dn_assigned_txg;
216168404Spjd	kcondvar_t dn_notxholds;
217168404Spjd	enum dnode_dirtycontext dn_dirtyctx;
218168404Spjd	uint8_t *dn_dirtyctx_firstset;		/* dbg: contents meaningless */
219168404Spjd
220168404Spjd	/* protected by own devices */
221168404Spjd	refcount_t dn_tx_holds;
222168404Spjd	refcount_t dn_holds;
223168404Spjd
224168404Spjd	kmutex_t dn_dbufs_mtx;
225270383Sdelphij	/*
226270383Sdelphij	 * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
227270383Sdelphij	 * can contain multiple dbufs of the same (level, blkid) when a
228270383Sdelphij	 * dbuf is marked DB_EVICTING without being removed from
229270383Sdelphij	 * dn_dbufs. To maintain the avl invariant that there cannot be
230270383Sdelphij	 * duplicate entries, we order the dbufs by an arbitrary value -
231270383Sdelphij	 * their address in memory. This means that dn_dbufs cannot be used to
232270383Sdelphij	 * directly look up a dbuf. Instead, callers must use avl_walk, have
233270383Sdelphij	 * a reference to the dbuf, or look up a non-existant node with
234270383Sdelphij	 * db_state = DB_SEARCH (see dbuf_free_range for an example).
235270383Sdelphij	 */
236270383Sdelphij	avl_tree_t dn_dbufs;
237219089Spjd
238219089Spjd	/* protected by dn_struct_rwlock */
239168404Spjd	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */
240168404Spjd
241219089Spjd	boolean_t dn_have_spill;	/* have spill or are spilling */
242219089Spjd
243168404Spjd	/* parent IO for current sync write */
244168404Spjd	zio_t *dn_zio;
245168404Spjd
246209962Smm	/* used in syncing context */
247219089Spjd	uint64_t dn_oldused;	/* old phys used bytes */
248219089Spjd	uint64_t dn_oldflags;	/* old phys dn_flags */
249219089Spjd	uint64_t dn_olduid, dn_oldgid;
250219089Spjd	uint64_t dn_newuid, dn_newgid;
251219089Spjd	int dn_id_flags;
252209962Smm
253168404Spjd	/* holds prefetch structure */
254168404Spjd	struct zfetch	dn_zfetch;
255307290Smav};
256168404Spjd
257219089Spjd/*
258219089Spjd * Adds a level of indirection between the dbuf and the dnode to avoid
259219089Spjd * iterating descendent dbufs in dnode_move(). Handles are not allocated
260219089Spjd * individually, but as an array of child dnodes in dnode_hold_impl().
261219089Spjd */
262219089Spjdtypedef struct dnode_handle {
263219089Spjd	/* Protects dnh_dnode from modification by dnode_move(). */
264219089Spjd	zrlock_t dnh_zrlock;
265219089Spjd	dnode_t *dnh_dnode;
266219089Spjd} dnode_handle_t;
267219089Spjd
268219089Spjdtypedef struct dnode_children {
269286575Smav	dmu_buf_user_t dnc_dbu;		/* User evict data */
270219089Spjd	size_t dnc_count;		/* number of children */
271269431Sdelphij	dnode_handle_t dnc_children[];	/* sized dynamically */
272219089Spjd} dnode_children_t;
273219089Spjd
274168404Spjdtypedef struct free_range {
275168404Spjd	avl_node_t fr_node;
276168404Spjd	uint64_t fr_blkid;
277168404Spjd	uint64_t fr_nblks;
278168404Spjd} free_range_t;
279168404Spjd
280286575Smavvoid dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
281219089Spjd    uint64_t object, dnode_handle_t *dnh);
282219089Spjdvoid dnode_special_close(dnode_handle_t *dnh);
283168404Spjd
284185029Spjdvoid dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
285219089Spjdvoid dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
286219089Spjdvoid dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
287219089Spjd
288219089Spjdint dnode_hold(struct objset *dd, uint64_t object,
289168404Spjd    void *ref, dnode_t **dnp);
290219089Spjdint dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
291168404Spjd    void *ref, dnode_t **dnp);
292185029Spjdboolean_t dnode_add_ref(dnode_t *dn, void *ref);
293168404Spjdvoid dnode_rele(dnode_t *dn, void *ref);
294339140Smavvoid dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
295168404Spjdvoid dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
296168404Spjdvoid dnode_sync(dnode_t *dn, dmu_tx_t *tx);
297168404Spjdvoid dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
298168404Spjd    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
299168404Spjdvoid dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
300168404Spjd    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
301168404Spjdvoid dnode_free(dnode_t *dn, dmu_tx_t *tx);
302168404Spjdvoid dnode_byteswap(dnode_phys_t *dnp);
303168404Spjdvoid dnode_buf_byteswap(void *buf, size_t size);
304168404Spjdvoid dnode_verify(dnode_t *dn);
305168404Spjdint dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
306168404Spjdvoid dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
307168404Spjdvoid dnode_diduse_space(dnode_t *dn, int64_t space);
308185029Spjdvoid dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
309168404Spjduint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
310168404Spjdvoid dnode_init(void);
311168404Spjdvoid dnode_fini(void);
312185029Spjdint dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
313185029Spjd    int minlvl, uint64_t blkfill, uint64_t txg);
314185029Spjdvoid dnode_evict_dbufs(dnode_t *dn);
315286545Smavvoid dnode_evict_bonus(dnode_t *dn);
316332525Smavboolean_t dnode_needs_remap(const dnode_t *dn);
317168404Spjd
/*
 * True when the objset's os_primary_cache setting is ZFS_CACHE_ALL, or when
 * it is ZFS_CACHE_METADATA and the dnode's type satisfies
 * DMU_OT_IS_METADATA().  _dn is evaluated more than once.
 */
#define	DNODE_IS_CACHEABLE(_dn)						\
	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
	(DMU_OT_IS_METADATA((_dn)->dn_type) &&				\
	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))

/*
 * Same test without the type check: true when os_primary_cache is either
 * ZFS_CACHE_ALL or ZFS_CACHE_METADATA.  _dn is evaluated more than once.
 */
#define	DNODE_META_IS_CACHEABLE(_dn)					\
	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
326297832Smav
#ifdef ZFS_DEBUG

/*
 * Debug printf for a dnode.  It prints only when ZFS_DEBUG_DPRINTF is set in
 * zfs_flags, and prefixes the message with "obj=<id> ", where <id> is "mdn"
 * for DMU_META_DNODE_OBJECT and the decimal object number otherwise.
 *
 * fmt must be a string literal, because it is joined to the "obj=%s "
 * prefix by adjacent string-literal concatenation.  At least one argument
 * must follow fmt, because __VA_ARGS__ is preceded by a comma.
 * dn is evaluated more than once.
 */
#define	dprintf_dnode(dn, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char __db_buf[32]; \
	uint64_t __db_obj = (dn)->dn_object; \
	if (__db_obj == DMU_META_DNODE_OBJECT) \
		(void) strcpy(__db_buf, "mdn"); \
	else \
		(void) snprintf(__db_buf, sizeof (__db_buf), "%llu", \
		    (u_longlong_t)__db_obj);\
	dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
	    __db_buf, __VA_ARGS__); \
	} \
_NOTE(CONSTCOND) } while (0)

#define	DNODE_VERIFY(dn)		dnode_verify(dn)
#define	FREE_VERIFY(db, start, end, tx)	free_verify(db, start, end, tx)

#else

/*
 * Non-debug builds: all three macros expand to nothing, so their arguments
 * are never evaluated.
 */
#define	dprintf_dnode(dn, fmt, ...)
#define	DNODE_VERIFY(dn)
#define	FREE_VERIFY(db, start, end, tx)

#endif
358168404Spjd
359168404Spjd#ifdef	__cplusplus
360168404Spjd}
361168404Spjd#endif
362168404Spjd
363168404Spjd#endif	/* _SYS_DNODE_H */
364