1/*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source.  A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15/*
16 * Copyright (c) 2019 by Delphix. All rights reserved.
17 */
18
19#ifndef	_BTREE_H
20#define	_BTREE_H
21
22#ifdef	__cplusplus
23extern "C" {
24#endif
25
26#include	<sys/zfs_context.h>
27
28/*
29 * This file defines the interface for a B-Tree implementation for ZFS. The
30 * tree can be used to store arbitrary sortable data types with low overhead
31 * and good operation performance. In addition the tree intelligently
32 * optimizes bulk in-order insertions to improve memory use and performance.
33 *
34 * Note that for all B-Tree functions, the values returned are pointers to the
35 * internal copies of the data in the tree. The internal data can only be
36 * safely mutated if the changes cannot change the ordering of the element
37 * with respect to any other elements in the tree.
38 *
39 * The major drawback of the B-Tree is that any returned elements or indexes
40 * are only valid until a side-effectful operation occurs, since these can
 * result in reallocation or relocation of data. Side-effectful operations are
42 * defined as insertion, removal, and zfs_btree_destroy_nodes.
43 *
44 * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core
45 * nodes have an array of children pointing to other nodes, and an array of
46 * elements that act as separators between the elements of the subtrees rooted
47 * at its children. Leaf nodes only contain data elements, and form the bottom
48 * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the
49 * elements in the core nodes are not copies of or references to leaf node
50 * elements.  Each element occurs only once in the tree, no matter what kind
51 * of node it is in.
52 *
53 * The tree's height is the same throughout, unlike many other forms of search
54 * tree. Each node (except for the root) must be between half minus one and
55 * completely full of elements (and children) at all times. Any operation that
56 * would put the node outside of that range results in a rebalancing operation
57 * (taking, merging, or splitting).
58 *
59 * This tree was implemented using descriptions from Wikipedia's articles on
60 * B-Trees and B+ Trees.
61 */
62
63/*
64 * Decreasing these values results in smaller memmove operations, but more of
65 * them, and increased memory overhead. Increasing these values results in
66 * higher variance in operation time, and reduces memory overhead.
67 */
68#define	BTREE_CORE_ELEMS	126
69#define	BTREE_LEAF_SIZE		4096
70
71extern kmem_cache_t *zfs_btree_leaf_cache;
72
/*
 * Header placed at the start of every node; both zfs_btree_core_t and
 * zfs_btree_leaf_t embed it as their first member.
 */
typedef struct zfs_btree_hdr {
	struct zfs_btree_core	*bth_parent;
	/*
	 * For leaf nodes, the offset of the first valid element. The value
	 * -1 is reserved to mark the node as a core node.
	 */
	uint32_t		bth_first;
	/*
	 * Number of elements currently in the node, for leaf and core nodes
	 * alike. A core node has bth_count + 1 children.
	 */
	uint32_t		bth_count;
} zfs_btree_hdr_t;
86
87typedef struct zfs_btree_core {
88	zfs_btree_hdr_t	btc_hdr;
89	zfs_btree_hdr_t	*btc_children[BTREE_CORE_ELEMS + 1];
90	uint8_t		btc_elems[];
91} zfs_btree_core_t;
92
93typedef struct zfs_btree_leaf {
94	zfs_btree_hdr_t	btl_hdr;
95	uint8_t		btl_elems[];
96} zfs_btree_leaf_t;
97
98typedef struct zfs_btree_index {
99	zfs_btree_hdr_t	*bti_node;
100	uint32_t	bti_offset;
101	/*
102	 * True if the location is before the list offset, false if it's at
103	 * the listed offset.
104	 */
105	boolean_t	bti_before;
106} zfs_btree_index_t;
107
108typedef struct btree zfs_btree_t;
109typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
110    const void *, zfs_btree_index_t *);
111
112struct btree {
113	int (*bt_compar) (const void *, const void *);
114	bt_find_in_buf_f	bt_find_in_buf;
115	size_t			bt_elem_size;
116	size_t			bt_leaf_size;
117	uint32_t		bt_leaf_cap;
118	int32_t			bt_height;
119	uint64_t		bt_num_elems;
120	uint64_t		bt_num_nodes;
121	zfs_btree_hdr_t		*bt_root;
122	zfs_btree_leaf_t	*bt_bulk; // non-null if bulk loading
123};
124
125/*
126 * Implementation of Shar's algorithm designed to accelerate binary search by
127 * eliminating impossible to predict branches.
128 *
 * For optimality, this should be used to generate the search function in the
 * same file as the comparator, and the comparator should be marked
 * `__attribute__((always_inline)) inline` so that the compiler will inline it.
132 *
133 * Arguments are:
134 *
135 * NAME   - The function name for this instance of the search function. Use it
136 *          in a subsequent call to zfs_btree_create().
137 * T      - The element type stored inside the B-Tree.
 * COMP   - A comparator to compare two nodes; it must return exactly -1, 0,
 *          or +1: -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
 *          TREE_CMP() from avl.h can be used in a boilerplate function.
141 */
142/* BEGIN CSTYLED */
/*
 * Expands to a static function NAME with the bt_find_in_buf_f signature.
 * The function searches the nelems elements of type T stored in buf for
 * value, using COMP. It returns a pointer to the matching element, or NULL
 * if there is none. In both cases *where receives the offset, and
 * bti_before is set when no exact match exists at that offset. An empty
 * buffer (nelems == 0) is reported as "before offset 0" without reading buf.
 */
#define	ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP)			\
_Pragma("GCC diagnostic push")						\
_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")			\
static void *								\
NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,			\
    const void *value, zfs_btree_index_t *where)			\
{									\
	T *i = (T *)buf;						\
	(void) tree;							\
	_Pragma("GCC unroll 9")						\
	while (nelems > 1) {						\
		uint32_t half = nelems / 2;				\
		nelems -= half;						\
		i += (COMP(&i[half - 1], value) < 0) * half;		\
	}								\
									\
	/*								\
	 * The loop never reduces nelems below 1, so nelems is 0 here	\
	 * only when the buffer was empty on entry. There is then no	\
	 * element to compare against; behave as if value sorts before	\
	 * slot 0 rather than reading past the end of the buffer.	\
	 */								\
	int comp = (nelems == 0) ? 1 : COMP(i, value);			\
	where->bti_offset = (uint32_t)(i - (T *)buf) + (comp < 0);	\
	where->bti_before = (comp != 0);				\
									\
	if (comp == 0) {						\
		return (i);						\
	}								\
									\
	return (NULL);							\
}									\
_Pragma("GCC diagnostic pop")
170/* END CSTYLED */
171
172/*
173 * Allocate and deallocate caches for btree nodes.
174 */
175void zfs_btree_init(void);
176void zfs_btree_fini(void);
177
178/*
179 * Initialize an B-Tree. Arguments are:
180 *
181 * tree   - the tree to be initialized
182 * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
183 *          -1 for <, 0 for ==, and +1 for >
184 * find   - optional function to accelerate searches inside B-Tree nodes
185 *          through Shar's algorithm and comparator inlining. Setting this to
186 *          NULL will use a generic function. The function should be created
187 *          using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
188 *          compar should be marked `__attribute__((always_inline)) inline` or
189 *          performance is unlikely to improve very much.
190 * size   - the value of sizeof(struct my_type)
191 * lsize  - custom leaf size
192 */
193void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
194    bt_find_in_buf_f, size_t);
195void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
196    bt_find_in_buf_f, size_t, size_t);
197
198/*
199 * Find a node with a matching value in the tree. Returns the matching node
200 * found. If not found, it returns NULL and then if "where" is not NULL it sets
201 * "where" for use with zfs_btree_add_idx() or zfs_btree_nearest().
202 *
203 * node   - node that has the value being looked for
204 * where  - position for use with zfs_btree_nearest() or zfs_btree_add_idx(),
205 *          may be NULL
206 */
207void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *);
208
209/*
210 * Insert a node into the tree.
211 *
212 * node   - the node to insert
213 * where  - position as returned from zfs_btree_find()
214 */
215void zfs_btree_add_idx(zfs_btree_t *, const void *, const zfs_btree_index_t *);
216
217/*
218 * Return the first or last valued node in the tree. Will return NULL if the
219 * tree is empty. The index can be NULL if the location of the first or last
220 * element isn't required.
221 */
222void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *);
223void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *);
224
225/*
226 * Return the next or previous valued node in the tree. The second index can
227 * safely be NULL, if the location of the next or previous value isn't
228 * required.
229 */
230void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *,
231    zfs_btree_index_t *);
232void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *,
233    zfs_btree_index_t *);
234
235/*
236 * Get a value from a tree and an index.
237 */
238void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *);
239
240/*
241 * Add a single value to the tree. The value must not compare equal to any
242 * other node already in the tree. Note that the value will be copied out, not
243 * inserted directly. It is safe to free or destroy the value once this
244 * function returns.
245 */
246void zfs_btree_add(zfs_btree_t *, const void *);
247
248/*
249 * Remove a single value from the tree.  The value must be in the tree. The
250 * pointer passed in may be a pointer into a tree-controlled buffer, but it
251 * need not be.
252 */
253void zfs_btree_remove(zfs_btree_t *, const void *);
254
255/*
256 * Remove the value at the given location from the tree.
257 */
258void zfs_btree_remove_idx(zfs_btree_t *, zfs_btree_index_t *);
259
260/*
261 * Return the number of nodes in the tree
262 */
263ulong_t zfs_btree_numnodes(zfs_btree_t *);
264
265/*
266 * Used to destroy any remaining nodes in a tree. The cookie argument should
267 * be initialized to NULL before the first call. Returns a node that has been
268 * removed from the tree and may be free()'d. Returns NULL when the tree is
269 * empty.
270 *
271 * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it
272 * and finally zfs_btree_destroy(). No other B-Tree routines will be valid.
273 *
274 * cookie - an index used to save state between calls to
275 * zfs_btree_destroy_nodes()
276 *
277 * EXAMPLE:
278 *	zfs_btree_t *tree;
279 *	struct my_data *node;
280 *	zfs_btree_index_t *cookie;
281 *
282 *	cookie = NULL;
283 *	while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
284 *		data_destroy(node);
285 *	zfs_btree_destroy(tree);
286 */
287void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **);
288
289/*
290 * Destroys all nodes in the tree quickly. This doesn't give the caller an
291 * opportunity to iterate over each node and do its own cleanup; for that, use
292 * zfs_btree_destroy_nodes().
293 */
294void zfs_btree_clear(zfs_btree_t *);
295
296/*
 * Final destroy of a B-Tree. Arguments are:
298 *
299 * tree   - the empty tree to destroy
300 */
301void zfs_btree_destroy(zfs_btree_t *tree);
302
303/* Runs a variety of self-checks on the btree to verify integrity. */
304void zfs_btree_verify(zfs_btree_t *tree);
305
306#ifdef	__cplusplus
307}
308#endif
309
310#endif	/* _BTREE_H */
311