1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23307292Smav * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
24288549Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25168404Spjd */
26168404Spjd
27168404Spjd#ifndef	_SYS_DNODE_H
28168404Spjd#define	_SYS_DNODE_H
29168404Spjd
30168404Spjd#include <sys/zfs_context.h>
31168404Spjd#include <sys/avl.h>
32168404Spjd#include <sys/spa.h>
33168404Spjd#include <sys/txg.h>
34168404Spjd#include <sys/zio.h>
35168404Spjd#include <sys/refcount.h>
36168404Spjd#include <sys/dmu_zfetch.h>
37219089Spjd#include <sys/zrlock.h>
38168404Spjd
39168404Spjd#ifdef	__cplusplus
40168404Spjdextern "C" {
41168404Spjd#endif
42168404Spjd
43168404Spjd/*
44185029Spjd * dnode_hold() flags.
 *
 * NOTE(review): of the hold functions declared in this header only
 * dnode_hold_impl() takes a flag argument; presumably these are the values
 * it accepts -- confirm against the implementation.
45168404Spjd */
46168404Spjd#define	DNODE_MUST_BE_ALLOCATED	1
47168404Spjd#define	DNODE_MUST_BE_FREE	2
48168404Spjd
49168404Spjd/*
50185029Spjd * dnode_next_offset() flags.
 *
 * Each flag is a distinct bit.  They are meant for the `flags` argument of
 * dnode_next_offset() (presumably OR'ed together -- confirm against callers).
51185029Spjd */
52185029Spjd#define	DNODE_FIND_HOLE		1
53185029Spjd#define	DNODE_FIND_BACKWARDS	2
54185029Spjd#define	DNODE_FIND_HAVELOCK	4
55185029Spjd
56185029Spjd/*
57168404Spjd * Fixed constants.
 *
 * The *_SHIFT values are base-2 logarithms of the quantity they describe;
 * for example DNODE_SIZE is defined further down as (1 << DNODE_SHIFT).
58168404Spjd */
59168404Spjd#define	DNODE_SHIFT		9	/* 512 bytes */
60272332Sdelphij#define	DN_MIN_INDBLKSHIFT	12	/* 4k */
61307126Smav#define	DN_MAX_INDBLKSHIFT	17	/* 128k */
62168404Spjd#define	DNODE_BLOCK_SHIFT	14	/* 16k */
63168404Spjd#define	DNODE_CORE_SIZE		64	/* 64 bytes for dnode sans blkptrs */
64168404Spjd#define	DN_MAX_OBJECT_SHIFT	48	/* 256 trillion (zfs_fid_t limit) */
65168404Spjd#define	DN_MAX_OFFSET_SHIFT	64	/* 2^64 bytes in a dnode */
66168404Spjd
67168404Spjd/*
68219089Spjd * dnode id flags
69219089Spjd *
70219089Spjd * Note: a file will never ever have its
71219089Spjd * ids moved from bonus->spill
72219089Spjd * and only in a crypto environment would it be on spill
 *
 * Each flag is a distinct bit.
 * NOTE(review): presumably these are stored in dn_id_flags of struct dnode
 * -- confirm against the users of that field.
73219089Spjd */
74219089Spjd#define	DN_ID_CHKED_BONUS	0x1
75219089Spjd#define	DN_ID_CHKED_SPILL	0x2
76219089Spjd#define	DN_ID_OLD_EXIST		0x4
77219089Spjd#define	DN_ID_NEW_EXIST		0x8
78219089Spjd
79219089Spjd/*
80168404Spjd * Derived constants.
 *
 * With DNODE_SHIFT = 9 and DNODE_BLOCK_SHIFT = 14, DNODE_SIZE is 512 and
 * DNODES_PER_BLOCK is 32.  DN_MAX_NBLKPTR and DN_MAX_BONUSLEN additionally
 * depend on SPA_BLKPTRSHIFT, which is not defined in this header.
 * DN_MAX_BONUSLEN is the dnode size less the core and one block pointer;
 * DN_ZERO_BONUSLEN is one more than DN_MAX_BONUSLEN, i.e. a value that is
 * never a valid bonus length.
81168404Spjd */
82168404Spjd#define	DNODE_SIZE	(1 << DNODE_SHIFT)
83168404Spjd#define	DN_MAX_NBLKPTR	((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
84168404Spjd#define	DN_MAX_BONUSLEN	(DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
85168404Spjd#define	DN_MAX_OBJECT	(1ULL << DN_MAX_OBJECT_SHIFT)
86185029Spjd#define	DN_ZERO_BONUSLEN	(DN_MAX_BONUSLEN + 1)
87219089Spjd#define	DN_KILL_SPILLBLK (1)
88168404Spjd
89168404Spjd#define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
90168404Spjd#define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
91307126Smav
92307126Smav/*
93307126Smav * This is inaccurate if the indblkshift of the particular object is not the
94307126Smav * max.  But it's only used by userland to calculate the zvol reservation.
 *
 * The shift is DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT, so the value is the
 * number of 2^SPA_BLKPTRSHIFT-byte entries in a maximum-size indirect block.
95307126Smav */
96168404Spjd#define	DNODES_PER_LEVEL_SHIFT	(DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
97219089Spjd#define	DNODES_PER_LEVEL	(1ULL << DNODES_PER_LEVEL_SHIFT)
98168404Spjd
99168404Spjd/* The +2 here is a cheesy way to round up */
/*
 * The quotient divides the offset bits above the minimum block size
 * (DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) by
 * (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT), presumably the number of address
 * bits one indirect level resolves at the smallest indirect block size.
 */
100168404Spjd#define	DN_MAX_LEVELS	(2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
101168404Spjd	(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
102168404Spjd
/*
 * DN_BONUS(dnp) yields a void * into dnp->dn_bonus, advanced by the size of
 * (dn_nblkptr - 1) block pointers.  dnp is evaluated more than once.
 * NOTE(review): presumably block pointers beyond dn_blkptr[0] occupy the
 * start of dn_bonus, which is why they are skipped -- confirm.
 */
103168404Spjd#define	DN_BONUS(dnp)	((void*)((dnp)->dn_bonus + \
104168404Spjd	(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
105168404Spjd
/*
 * DN_USED_BYTES(dnp) yields dn_used unchanged when DNODE_FLAG_USED_BYTES is
 * set in dn_flags, and dn_used << SPA_MINBLOCKSHIFT otherwise.
 * dnp is evaluated more than once.
 */
106168404Spjd#define	DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
107168404Spjd	(dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
108168404Spjd
/*
 * EPB(blkshift, typeshift) evaluates to 1 << (blkshift - typeshift), i.e. the
 * number of 2^typeshift-sized items in a 2^blkshift-sized block (the name
 * presumably stands for "entries per block").
 *
 * Both arguments are parenthesized so that compound expressions such as
 * EPB(shift, a + b) expand as intended.  The result has type int, so
 * blkshift - typeshift must be non-negative and smaller than the number of
 * value bits in an int.
 */
#define	EPB(blkshift, typeshift)	(1 << ((blkshift) - (typeshift)))
110168404Spjd
/*
 * Forward declarations.  This header only uses pointers to dmu_buf_impl and
 * objset, so their full definitions are not needed here.
 */
111168404Spjdstruct dmu_buf_impl;
112219089Spjdstruct objset;
113168404Spjdstruct zio;
114168404Spjd
/*
 * Type of the dn_dirtyctx member of struct dnode.
 * NOTE(review): presumably records whether the dnode was first dirtied in
 * open or in syncing context -- confirm against the code that sets it.
 */
115168404Spjdenum dnode_dirtycontext {
116168404Spjd	DN_UNDIRTIED,
117168404Spjd	DN_DIRTY_OPEN,
118168404Spjd	DN_DIRTY_SYNC
119168404Spjd};
120168404Spjd
/* Bits for dnode_phys_t.dn_flags. */
121168404Spjd/* Is dn_used in bytes?  if not, it's in multiples of SPA_MINBLOCKSIZE */
122209962Smm#define	DNODE_FLAG_USED_BYTES		(1<<0)
123209962Smm#define	DNODE_FLAG_USERUSED_ACCOUNTED	(1<<1)
124168404Spjd
125219089Spjd/* Does dnode have a SA spill blkptr in bonus? */
126219089Spjd#define	DNODE_FLAG_SPILL_BLKPTR	(1<<2)
127219089Spjd
/*
 * Physical dnode.  struct dnode's dn_phys member points at one of these
 * inside a dbuf's data, and dnode_byteswap() operates on it.
 *
 * The members ahead of dn_blkptr add up to 64 bytes, matching
 * DNODE_CORE_SIZE.  dn_blkptr, dn_bonus and dn_spill together take
 * DN_MAX_BONUSLEN + sizeof (blkptr_t) bytes; assuming
 * sizeof (blkptr_t) == (1 << SPA_BLKPTRSHIFT) (not shown here), the whole
 * structure is DNODE_SIZE bytes.
 *
 * Do not reorder or resize members: the layout is fixed.
 */
128168404Spjdtypedef struct dnode_phys {
129168404Spjd	uint8_t dn_type;		/* dmu_object_type_t */
130168404Spjd	uint8_t dn_indblkshift;		/* ln2(indirect block size) */
131168404Spjd	uint8_t dn_nlevels;		/* 1=dn_blkptr->data blocks */
132168404Spjd	uint8_t dn_nblkptr;		/* length of dn_blkptr */
133168404Spjd	uint8_t dn_bonustype;		/* type of data in bonus buffer */
134168404Spjd	uint8_t	dn_checksum;		/* ZIO_CHECKSUM type */
135168404Spjd	uint8_t	dn_compress;		/* ZIO_COMPRESS type */
136168404Spjd	uint8_t dn_flags;		/* DNODE_FLAG_* */
137168404Spjd	uint16_t dn_datablkszsec;	/* data block size in 512b sectors */
138168404Spjd	uint16_t dn_bonuslen;		/* length of dn_bonus */
139168404Spjd	uint8_t dn_pad2[4];
140168404Spjd
	/*
	 * NOTE(review): struct dnode in this header declares dn_mtx and
	 * dn_dbufs_mtx but no dn_dirty_mtx; the lock name below may be
	 * stale -- confirm.
	 */
141168404Spjd	/* accounting is protected by dn_dirty_mtx */
142168404Spjd	uint64_t dn_maxblkid;		/* largest allocated block ID */
143168404Spjd	uint64_t dn_used;		/* bytes (or sectors) of disk space */
144168404Spjd
145168404Spjd	uint64_t dn_pad3[4];
146168404Spjd
	/* see DN_BONUS() for how dn_nblkptr affects where bonus data starts */
147168404Spjd	blkptr_t dn_blkptr[1];
148219089Spjd	uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
149219089Spjd	blkptr_t dn_spill;
150168404Spjd} dnode_phys_t;
151168404Spjd
/*
 * In-memory dnode.  It points at the physical dnode (dn_phys), keeps copies
 * of several of its fields, and carries the locks, per-txg pending values,
 * dirty-state and dbuf bookkeeping documented on the members below.
 *
 * The dnode_t typedef used elsewhere in this header is not defined here.
 */
152307292Smavstruct dnode {
153168404Spjd	/*
154251631Sdelphij	 * Protects the structure of the dnode, including the number of levels
155251631Sdelphij	 * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
156168404Spjd	 */
157168404Spjd	krwlock_t dn_struct_rwlock;
158168404Spjd
159209962Smm	/* Our link on dn_objset->os_dnodes list; protected by os_lock.  */
160168404Spjd	list_node_t dn_link;
161168404Spjd
162168404Spjd	/* immutable: */
163219089Spjd	struct objset *dn_objset;
164168404Spjd	uint64_t dn_object;
165168404Spjd	struct dmu_buf_impl *dn_dbuf;
166219089Spjd	struct dnode_handle *dn_handle;
167168404Spjd	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
168168404Spjd
169168404Spjd	/*
170168404Spjd	 * Copies of stuff in dn_phys.  They're valid in the open
171168404Spjd	 * context (eg. even before the dnode is first synced).
172168404Spjd	 * Where necessary, these are protected by dn_struct_rwlock.
173168404Spjd	 */
174168404Spjd	dmu_object_type_t dn_type;	/* object type */
175168404Spjd	uint16_t dn_bonuslen;		/* bonus length */
176168404Spjd	uint8_t dn_bonustype;		/* bonus type */
177168404Spjd	uint8_t dn_nblkptr;		/* number of blkptrs (immutable) */
178168404Spjd	uint8_t dn_checksum;		/* ZIO_CHECKSUM type */
179168404Spjd	uint8_t dn_compress;		/* ZIO_COMPRESS type */
180168404Spjd	uint8_t dn_nlevels;
181168404Spjd	uint8_t dn_indblkshift;
182168404Spjd	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
183219089Spjd	uint8_t dn_moved;		/* Has this dnode been moved? */
184168404Spjd	uint16_t dn_datablkszsec;	/* in 512b sectors */
185168404Spjd	uint32_t dn_datablksz;		/* in bytes */
186168404Spjd	uint64_t dn_maxblkid;
	/* per-txg arrays, TXG_SIZE entries each */
187263390Sdelphij	uint8_t dn_next_type[TXG_SIZE];
188196703Spjd	uint8_t dn_next_nblkptr[TXG_SIZE];
189168404Spjd	uint8_t dn_next_nlevels[TXG_SIZE];
190168404Spjd	uint8_t dn_next_indblkshift[TXG_SIZE];
191219089Spjd	uint8_t dn_next_bonustype[TXG_SIZE];
192219089Spjd	uint8_t dn_rm_spillblk[TXG_SIZE];	/* for removing spill blk */
193185029Spjd	uint16_t dn_next_bonuslen[TXG_SIZE];
194168404Spjd	uint32_t dn_next_blksz[TXG_SIZE];	/* next block size in bytes */
195168404Spjd
196219089Spjd	/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
197219089Spjd	uint32_t dn_dbufs_count;	/* count of dn_dbufs */
198254753Sdelphij	/* There are no level-0 blocks of this blkid or higher in dn_dbufs */
199254753Sdelphij	uint64_t dn_unlisted_l0_blkid;
200219089Spjd
201168404Spjd	/* protected by os_lock: */
202168404Spjd	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */
203168404Spjd
204168404Spjd	/* protected by dn_mtx: */
205168404Spjd	kmutex_t dn_mtx;
206168404Spjd	list_t dn_dirty_records[TXG_SIZE];
207265740Sdelphij	struct range_tree *dn_free_ranges[TXG_SIZE];
208168404Spjd	uint64_t dn_allocated_txg;
209168404Spjd	uint64_t dn_free_txg;
210168404Spjd	uint64_t dn_assigned_txg;
211168404Spjd	kcondvar_t dn_notxholds;
212168404Spjd	enum dnode_dirtycontext dn_dirtyctx;
213168404Spjd	uint8_t *dn_dirtyctx_firstset;		/* dbg: contents meaningless */
214168404Spjd
215168404Spjd	/* protected by own devices */
216168404Spjd	refcount_t dn_tx_holds;
217168404Spjd	refcount_t dn_holds;
218168404Spjd
219168404Spjd	kmutex_t dn_dbufs_mtx;
220270809Sdelphij	/*
221270809Sdelphij	 * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
222270809Sdelphij	 * can contain multiple dbufs of the same (level, blkid) when a
223270809Sdelphij	 * dbuf is marked DB_EVICTING without being removed from
224270809Sdelphij	 * dn_dbufs. To maintain the avl invariant that there cannot be
225270809Sdelphij	 * duplicate entries, we order the dbufs by an arbitrary value -
226270809Sdelphij	 * their address in memory. This means that dn_dbufs cannot be used to
227270809Sdelphij	 * directly look up a dbuf. Instead, callers must use avl_walk, have
228270809Sdelphij	 * a reference to the dbuf, or look up a non-existent node with
229270809Sdelphij	 * db_state = DB_SEARCH (see dbuf_free_range for an example).
230270809Sdelphij	 */
231270809Sdelphij	avl_tree_t dn_dbufs;
232219089Spjd
233219089Spjd	/* protected by dn_struct_rwlock */
234168404Spjd	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */
235168404Spjd
236219089Spjd	boolean_t dn_have_spill;	/* have spill or are spilling */
237219089Spjd
238168404Spjd	/* parent IO for current sync write */
239168404Spjd	zio_t *dn_zio;
240168404Spjd
241209962Smm	/* used in syncing context */
242219089Spjd	uint64_t dn_oldused;	/* old phys used bytes */
243219089Spjd	uint64_t dn_oldflags;	/* old phys dn_flags */
244219089Spjd	uint64_t dn_olduid, dn_oldgid;
245219089Spjd	uint64_t dn_newuid, dn_newgid;
	/* NOTE(review): presumably a mask of the DN_ID_* flags -- confirm */
246219089Spjd	int dn_id_flags;
247209962Smm
248168404Spjd	/* holds prefetch structure */
249168404Spjd	struct zfetch	dn_zfetch;
250307292Smav};
251168404Spjd
252219089Spjd/*
253219089Spjd * Adds a level of indirection between the dbuf and the dnode to avoid
254219089Spjd * iterating descendent dbufs in dnode_move(). Handles are not allocated
255219089Spjd * individually, but as an array of child dnodes in dnode_hold_impl().
 *
 * struct dnode refers back to its handle through dn_handle.
256219089Spjd */
257219089Spjdtypedef struct dnode_handle {
258219089Spjd	/* Protects dnh_dnode from modification by dnode_move(). */
259219089Spjd	zrlock_t dnh_zrlock;
260219089Spjd	dnode_t *dnh_dnode;
261219089Spjd} dnode_handle_t;
262219089Spjd
/*
 * A counted array of dnode handles.  dnc_children is a flexible array
 * member, so the structure must be allocated with room for dnc_count
 * trailing dnode_handle_t entries.
 */
263219089Spjdtypedef struct dnode_children {
264288549Smav	dmu_buf_user_t dnc_dbu;		/* User evict data */
265219089Spjd	size_t dnc_count;		/* number of children */
266270127Sdelphij	dnode_handle_t dnc_children[];	/* sized dynamically */
267219089Spjd} dnode_children_t;
268219089Spjd
/*
 * A run of fr_nblks block IDs starting at fr_blkid, linkable into an AVL
 * tree through fr_node.
 * NOTE(review): nothing else in this header refers to free_range_t (struct
 * dnode's dn_free_ranges holds struct range_tree pointers) -- check whether
 * this type is still used.
 */
269168404Spjdtypedef struct free_range {
270168404Spjd	avl_node_t fr_node;
271168404Spjd	uint64_t fr_blkid;
272168404Spjd	uint64_t fr_nblks;
273168404Spjd} free_range_t;
274168404Spjd
/*
 * Function prototypes; the definitions are not part of this header.
 *
 * NOTE(review): the struct objset * parameter is named `dd` throughout, the
 * hold tag is called `ref` everywhere except dnode_rele_and_unlock() (`tag`),
 * and dnode_setbonus_type() / dnode_new_blkid() each leave one parameter
 * unnamed.
 */
275288549Smavvoid dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
276219089Spjd    uint64_t object, dnode_handle_t *dnh);
277219089Spjdvoid dnode_special_close(dnode_handle_t *dnh);
278168404Spjd
279185029Spjdvoid dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
280219089Spjdvoid dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
281219089Spjdvoid dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
282219089Spjd
/* dnode_hold_impl() is dnode_hold() plus a `flag` argument. */
283219089Spjdint dnode_hold(struct objset *dd, uint64_t object,
284168404Spjd    void *ref, dnode_t **dnp);
285219089Spjdint dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
286168404Spjd    void *ref, dnode_t **dnp);
287185029Spjdboolean_t dnode_add_ref(dnode_t *dn, void *ref);
288168404Spjdvoid dnode_rele(dnode_t *dn, void *ref);
289288541Smavvoid dnode_rele_and_unlock(dnode_t *dn, void *tag);
290168404Spjdvoid dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
291168404Spjdvoid dnode_sync(dnode_t *dn, dmu_tx_t *tx);
292168404Spjdvoid dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
293168404Spjd    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
294168404Spjdvoid dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
295168404Spjd    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
296168404Spjdvoid dnode_free(dnode_t *dn, dmu_tx_t *tx);
297168404Spjdvoid dnode_byteswap(dnode_phys_t *dnp);
298168404Spjdvoid dnode_buf_byteswap(void *buf, size_t size);
/* Also reachable through the DNODE_VERIFY() macro when ZFS_DEBUG is defined. */
299168404Spjdvoid dnode_verify(dnode_t *dn);
300168404Spjdint dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
301168404Spjdvoid dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
302168404Spjdvoid dnode_diduse_space(dnode_t *dn, int64_t space);
303168404Spjdvoid dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
304185029Spjdvoid dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
305168404Spjduint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
306168404Spjdvoid dnode_init(void);
307168404Spjdvoid dnode_fini(void);
/* `flags` takes the DNODE_FIND_* values defined near the top of this header. */
308185029Spjdint dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
309185029Spjd    int minlvl, uint64_t blkfill, uint64_t txg);
310185029Spjdvoid dnode_evict_dbufs(dnode_t *dn);
311288541Smavvoid dnode_evict_bonus(dnode_t *dn);
312168404Spjd
/*
 * DNODE_IS_CACHEABLE(_dn): true when the objset's os_primary_cache is
 * ZFS_CACHE_ALL, or when it is ZFS_CACHE_METADATA and the dnode's dn_type
 * satisfies DMU_OT_IS_METADATA().
 *
 * DNODE_META_IS_CACHEABLE(_dn): true when os_primary_cache is either
 * ZFS_CACHE_ALL or ZFS_CACHE_METADATA, regardless of dn_type.
 *
 * Both macros evaluate _dn more than once.
 */
313299433Smav#define	DNODE_IS_CACHEABLE(_dn)						\
314299433Smav	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
315299433Smav	(DMU_OT_IS_METADATA((_dn)->dn_type) &&				\
316299433Smav	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
317299433Smav
318299433Smav#define	DNODE_META_IS_CACHEABLE(_dn)					\
319299433Smav	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
320299433Smav	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
321299433Smav
322168404Spjd#ifdef ZFS_DEBUG
323168404Spjd
324168404Spjd/*
325168404Spjd * There should be a ## between the string literal and fmt, to make it
326168404Spjd * clear that we're joining two strings together, but gcc doesn't
327168404Spjd * support that preprocessor token there.
 *
 * dprintf_dnode(dn, fmt, ...): when ZFS_DEBUG_DPRINTF is set in zfs_flags,
 * renders dn->dn_object into a 32-byte local buffer ("mdn" when it equals
 * DMU_META_DNODE_OBJECT, otherwise its decimal value) and hands it to
 * dprintf_ds() together with the caller's format, which is prefixed with
 * "obj=%s ".  fmt must be a string literal, since it is joined to the
 * prefix by literal concatenation.  dn is evaluated twice.  Because
 * __VA_ARGS__ follows a comma, at least one argument must be supplied
 * after fmt.
 *
 * NOTE(review): the value is cast to u_longlong_t but printed with "%lld";
 * "%llu" would match the cast -- confirm before changing.
 *
 * DNODE_VERIFY() and FREE_VERIFY() forward to dnode_verify() and
 * free_verify() respectively.  Without ZFS_DEBUG all three macros expand
 * to nothing.
328168404Spjd */
329168404Spjd#define	dprintf_dnode(dn, fmt, ...) do { \
330168404Spjd	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
331168404Spjd	char __db_buf[32]; \
332168404Spjd	uint64_t __db_obj = (dn)->dn_object; \
333168404Spjd	if (__db_obj == DMU_META_DNODE_OBJECT) \
334168404Spjd		(void) strcpy(__db_buf, "mdn"); \
335168404Spjd	else \
336168404Spjd		(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
337168404Spjd		    (u_longlong_t)__db_obj);\
338168404Spjd	dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
339168404Spjd	    __db_buf, __VA_ARGS__); \
340168404Spjd	} \
341168404Spjd_NOTE(CONSTCOND) } while (0)
342168404Spjd
343168404Spjd#define	DNODE_VERIFY(dn)		dnode_verify(dn)
344168404Spjd#define	FREE_VERIFY(db, start, end, tx)	free_verify(db, start, end, tx)
345168404Spjd
346168404Spjd#else
347168404Spjd
348168404Spjd#define	dprintf_dnode(db, fmt, ...)
349168404Spjd#define	DNODE_VERIFY(dn)
350168404Spjd#define	FREE_VERIFY(db, start, end, tx)
351168404Spjd
352168404Spjd#endif
353168404Spjd
354168404Spjd#ifdef	__cplusplus
355168404Spjd}
356168404Spjd#endif
357168404Spjd
358168404Spjd#endif	/* _SYS_DNODE_H */
359