radixtree.c revision 1.33
1/*	$NetBSD: radixtree.c,v 1.33 2023/09/23 19:17:38 ad Exp $	*/
2
3/*-
4 * Copyright (c)2011,2012,2013 YAMAMOTO Takashi,
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29/*
30 * radixtree.c
31 *
32 * Overview:
33 *
34 * This is an implementation of radix tree, whose keys are uint64_t and leafs
35 * are user provided pointers.
36 *
37 * Leaf nodes are just void * and this implementation doesn't care about
38 * what they actually point to.  However, this implementation has an assumption
39 * about their alignment.  Specifically, this implementation assumes that their
40 * 2 LSBs are always zero and uses them for internal accounting.
41 *
42 * Intermediate nodes and memory allocation:
43 *
44 * Intermediate nodes are automatically allocated and freed internally and
45 * basically users don't need to care about them.  The allocation is done via
46 * kmem_zalloc(9) for _KERNEL, malloc(3) for userland, and alloc() for
47 * _STANDALONE environment.  Only radix_tree_insert_node function can allocate
48 * memory for intermediate nodes and thus can fail for ENOMEM.
49 *
50 * Memory Efficiency:
51 *
52 * It's designed to work efficiently with dense index distribution.
53 * The memory consumption (number of necessary intermediate nodes) heavily
54 * depends on the index distribution.  Basically, more dense index distribution
55 * consumes less nodes per item.  Approximately,
56 *
57 *  - the best case: about RADIX_TREE_PTR_PER_NODE items per intermediate node.
58 *    it would look like the following.
59 *
60 *     root (t_height=1)
61 *      |
62 *      v
63 *      [ | | | ]   (intermediate node.  RADIX_TREE_PTR_PER_NODE=4 in this fig)
64 *       | | | |
65 *       v v v v
66 *       p p p p    (items)
67 *
68 *  - the worst case: RADIX_TREE_MAX_HEIGHT intermediate nodes per item.
69 *    it would look like the following if RADIX_TREE_MAX_HEIGHT=3.
70 *
71 *     root (t_height=3)
72 *      |
73 *      v
74 *      [ | | | ]
75 *           |
76 *           v
77 *           [ | | | ]
78 *                |
79 *                v
80 *                [ | | | ]
81 *                   |
82 *                   v
83 *                   p
84 *
85 * The height of tree (t_height) is dynamic.  It's smaller if only small
86 * index values are used.  As an extreme case, if only index 0 is used,
87 * the corresponding value is directly stored in the root of the tree
88 * (struct radix_tree) without allocating any intermediate nodes.  In that
89 * case, t_height=0.
90 *
91 * Gang lookup:
92 *
93 * This implementation provides a way to scan many nodes quickly via
94 * radix_tree_gang_lookup_node function and its varients.
95 *
96 * Tags:
97 *
98 * This implementation provides tagging functionality, which allows quick
99 * scanning of a subset of leaf nodes.  Leaf nodes are untagged when inserted
100 * into the tree and can be tagged by radix_tree_set_tag function.
101 * radix_tree_gang_lookup_tagged_node function and its variants returns only
102 * leaf nodes with the given tag.  To reduce amount of nodes to visit for
103 * these functions, this implementation keeps tagging information in internal
104 * intermediate nodes and quickly skips uninterested parts of a tree.
105 *
106 * A tree has RADIX_TREE_TAG_ID_MAX independent tag spaces, each of which are
107 * identified by a zero-origin numbers, tagid.  For the current implementation,
108 * RADIX_TREE_TAG_ID_MAX is 2.  A set of tags is described as a bitmask tagmask,
109 * which is a bitwise OR of (1 << tagid).
110 */
111
112#include <sys/cdefs.h>
113
114#if defined(_KERNEL) || defined(_STANDALONE)
115__KERNEL_RCSID(0, "$NetBSD: radixtree.c,v 1.33 2023/09/23 19:17:38 ad Exp $");
116#include <sys/param.h>
117#include <sys/errno.h>
118#include <sys/kmem.h>
119#include <sys/radixtree.h>
120#include <lib/libkern/libkern.h>
121#if defined(_STANDALONE)
122#include <lib/libsa/stand.h>
123#endif /* defined(_STANDALONE) */
124#else /* defined(_KERNEL) || defined(_STANDALONE) */
125__RCSID("$NetBSD: radixtree.c,v 1.33 2023/09/23 19:17:38 ad Exp $");
126#include <assert.h>
127#include <errno.h>
128#include <stdbool.h>
129#include <stdlib.h>
130#include <string.h>
131#if 1
132#define KASSERT assert
133#else
134#define KASSERT(a)	/* nothing */
135#endif
136#endif /* defined(_KERNEL) || defined(_STANDALONE) */
137
138#include <sys/radixtree.h>
139
140#define	RADIX_TREE_BITS_PER_HEIGHT	4	/* XXX tune */
141#define	RADIX_TREE_PTR_PER_NODE		(1 << RADIX_TREE_BITS_PER_HEIGHT)
142#define	RADIX_TREE_MAX_HEIGHT		(64 / RADIX_TREE_BITS_PER_HEIGHT)
143#define	RADIX_TREE_INVALID_HEIGHT	(RADIX_TREE_MAX_HEIGHT + 1)
144__CTASSERT((64 % RADIX_TREE_BITS_PER_HEIGHT) == 0);
145
146__CTASSERT(((1 << RADIX_TREE_TAG_ID_MAX) & (sizeof(int) - 1)) == 0);
147#define	RADIX_TREE_TAG_MASK	((1 << RADIX_TREE_TAG_ID_MAX) - 1)
148
149static inline void *
150entry_ptr(void *p)
151{
152
153	return (void *)((uintptr_t)p & ~RADIX_TREE_TAG_MASK);
154}
155
156static inline unsigned int
157entry_tagmask(void *p)
158{
159
160	return (uintptr_t)p & RADIX_TREE_TAG_MASK;
161}
162
163static inline void *
164entry_compose(void *p, unsigned int tagmask)
165{
166
167	return (void *)((uintptr_t)p | tagmask);
168}
169
170static inline bool
171entry_match_p(void *p, unsigned int tagmask)
172{
173
174	KASSERT(entry_ptr(p) != NULL || entry_tagmask(p) == 0);
175	if (p == NULL) {
176		return false;
177	}
178	if (tagmask == 0) {
179		return true;
180	}
181	return (entry_tagmask(p) & tagmask) != 0;
182}
183
184/*
185 * radix_tree_node: an intermediate node
186 *
187 * we don't care the type of leaf nodes.  they are just void *.
188 *
189 * we used to maintain a count of non-NULL nodes in this structure, but it
190 * prevented it from being aligned to a cache line boundary; the performance
191 * benefit from being cache friendly is greater than the benefit of having
192 * a dedicated count value, especially in multi-processor situations where
193 * we need to avoid intra-pool-page false sharing.
194 */
195
196struct radix_tree_node {
197	void *n_ptrs[RADIX_TREE_PTR_PER_NODE];
198};
199
200/*
201 * p_refs[0].pptr == &t->t_root
202 *	:
203 * p_refs[n].pptr == &(*p_refs[n-1])->n_ptrs[x]
204 *	:
205 *	:
206 * p_refs[t->t_height].pptr == &leaf_pointer
207 */
208
209struct radix_tree_path {
210	struct radix_tree_node_ref {
211		void **pptr;
212	} p_refs[RADIX_TREE_MAX_HEIGHT + 1]; /* +1 for the root ptr */
213	/*
214	 * p_lastidx is either the index of the last valid element of p_refs[]
215	 * or RADIX_TREE_INVALID_HEIGHT.
216	 * RADIX_TREE_INVALID_HEIGHT means that radix_tree_lookup_ptr found
217	 * that the height of the tree is not enough to cover the given index.
218	 */
219	unsigned int p_lastidx;
220};
221
222static inline void **
223path_pptr(const struct radix_tree *t, const struct radix_tree_path *p,
224    unsigned int height)
225{
226
227	KASSERT(height <= t->t_height);
228	return p->p_refs[height].pptr;
229}
230
231static inline struct radix_tree_node *
232path_node(const struct radix_tree * t, const struct radix_tree_path *p,
233    unsigned int height)
234{
235
236	KASSERT(height <= t->t_height);
237	return entry_ptr(*path_pptr(t, p, height));
238}
239
240/*
241 * radix_tree_init_tree:
242 *
243 * Initialize a tree.
244 */
245
246void
247radix_tree_init_tree(struct radix_tree *t)
248{
249
250	t->t_height = 0;
251	t->t_root = NULL;
252}
253
254/*
255 * radix_tree_fini_tree:
256 *
257 * Finish using a tree.
258 */
259
260void
261radix_tree_fini_tree(struct radix_tree *t)
262{
263
264	KASSERT(t->t_root == NULL);
265	KASSERT(t->t_height == 0);
266}
267
268/*
269 * radix_tree_empty_tree_p:
270 *
271 * Return if the tree is empty.
272 */
273
274bool
275radix_tree_empty_tree_p(struct radix_tree *t)
276{
277
278	return t->t_root == NULL;
279}
280
281/*
282 * radix_tree_empty_tree_p:
283 *
284 * Return true if the tree has any nodes with the given tag.  Otherwise
285 * return false.
286 *
287 * It's illegal to call this function with tagmask 0.
288 */
289
290bool
291radix_tree_empty_tagged_tree_p(struct radix_tree *t, unsigned int tagmask)
292{
293
294	KASSERT(tagmask != 0);
295	return (entry_tagmask(t->t_root) & tagmask) == 0;
296}
297
298static void
299radix_tree_node_init(struct radix_tree_node *n)
300{
301
302	memset(n, 0, sizeof(*n));
303}
304
305#if defined(_KERNEL)
306/*
307 * radix_tree_init:
308 *
309 * initialize the subsystem.
310 */
311
312void
313radix_tree_init(void)
314{
315
316	/* nothing right now */
317}
318
319/*
320 * radix_tree_await_memory:
321 *
322 * after an insert has failed with ENOMEM, wait for memory to become
323 * available, so the caller can retry.  this needs to ensure that the
324 * maximum possible required number of nodes is available.
325 */
326
327void
328radix_tree_await_memory(void)
329{
330	struct radix_tree_node *nodes[RADIX_TREE_MAX_HEIGHT];
331	int i;
332
333	for (i = 0; i < __arraycount(nodes); i++) {
334		nodes[i] = kmem_intr_alloc(sizeof(struct radix_tree_node),
335		    KM_SLEEP);
336	}
337	while (--i >= 0) {
338		kmem_intr_free(nodes[i], sizeof(struct radix_tree_node));
339	}
340}
341
342#endif /* defined(_KERNEL) */
343
344/*
345 * radix_tree_sum_node:
346 *
347 * return the logical sum of all entries in the given node.  used to quickly
348 * check for tag masks or empty nodes.
349 */
350
351static uintptr_t
352radix_tree_sum_node(const struct radix_tree_node *n)
353{
354#if RADIX_TREE_PTR_PER_NODE > 16
355	unsigned int i;
356	uintptr_t sum;
357
358	for (i = 0, sum = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
359		sum |= (uintptr_t)n->n_ptrs[i];
360	}
361	return sum;
362#else /* RADIX_TREE_PTR_PER_NODE > 16 */
363	uintptr_t sum;
364
365	/*
366	 * Unrolling the above is much better than a tight loop with two
367	 * test+branch pairs.  On x86 with gcc 5.5.0 this compiles into 19
368	 * deterministic instructions including the "return" and prologue &
369	 * epilogue.
370	 */
371	sum = (uintptr_t)n->n_ptrs[0];
372	sum |= (uintptr_t)n->n_ptrs[1];
373	sum |= (uintptr_t)n->n_ptrs[2];
374	sum |= (uintptr_t)n->n_ptrs[3];
375#if RADIX_TREE_PTR_PER_NODE > 4
376	sum |= (uintptr_t)n->n_ptrs[4];
377	sum |= (uintptr_t)n->n_ptrs[5];
378	sum |= (uintptr_t)n->n_ptrs[6];
379	sum |= (uintptr_t)n->n_ptrs[7];
380#endif
381#if RADIX_TREE_PTR_PER_NODE > 8
382	sum |= (uintptr_t)n->n_ptrs[8];
383	sum |= (uintptr_t)n->n_ptrs[9];
384	sum |= (uintptr_t)n->n_ptrs[10];
385	sum |= (uintptr_t)n->n_ptrs[11];
386	sum |= (uintptr_t)n->n_ptrs[12];
387	sum |= (uintptr_t)n->n_ptrs[13];
388	sum |= (uintptr_t)n->n_ptrs[14];
389	sum |= (uintptr_t)n->n_ptrs[15];
390#endif
391	return sum;
392#endif /* RADIX_TREE_PTR_PER_NODE > 16 */
393}
394
395static int __unused
396radix_tree_node_count_ptrs(const struct radix_tree_node *n)
397{
398	unsigned int i, c;
399
400	for (i = c = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
401		c += (n->n_ptrs[i] != NULL);
402	}
403	return c;
404}
405
406static struct radix_tree_node *
407radix_tree_alloc_node(void)
408{
409	struct radix_tree_node *n;
410
411#if defined(_KERNEL)
412	/*
413	 * note that kmem_alloc can block.
414	 */
415	n = kmem_intr_alloc(sizeof(struct radix_tree_node), KM_SLEEP);
416#elif defined(_STANDALONE)
417	n = alloc(sizeof(*n));
418#else /* defined(_STANDALONE) */
419	n = malloc(sizeof(*n));
420#endif /* defined(_STANDALONE) */
421	if (n != NULL) {
422		radix_tree_node_init(n);
423	}
424	KASSERT(n == NULL || radix_tree_sum_node(n) == 0);
425	return n;
426}
427
428static void
429radix_tree_free_node(struct radix_tree_node *n)
430{
431
432	KASSERT(radix_tree_sum_node(n) == 0);
433#if defined(_KERNEL)
434	kmem_intr_free(n, sizeof(struct radix_tree_node));
435#elif defined(_STANDALONE)
436	dealloc(n, sizeof(*n));
437#else
438	free(n);
439#endif
440}
441
442/*
443 * radix_tree_grow:
444 *
445 * increase the height of the tree.
446 */
447
448static __noinline int
449radix_tree_grow(struct radix_tree *t, unsigned int newheight)
450{
451	const unsigned int tagmask = entry_tagmask(t->t_root);
452	struct radix_tree_node *newnodes[RADIX_TREE_MAX_HEIGHT];
453	void *root;
454	int h;
455
456	KASSERT(newheight <= RADIX_TREE_MAX_HEIGHT);
457	if ((root = t->t_root) == NULL) {
458		t->t_height = newheight;
459		return 0;
460	}
461	for (h = t->t_height; h < newheight; h++) {
462		newnodes[h] = radix_tree_alloc_node();
463		if (__predict_false(newnodes[h] == NULL)) {
464			while (--h >= (int)t->t_height) {
465				newnodes[h]->n_ptrs[0] = NULL;
466				radix_tree_free_node(newnodes[h]);
467			}
468			return ENOMEM;
469		}
470		newnodes[h]->n_ptrs[0] = root;
471		root = entry_compose(newnodes[h], tagmask);
472	}
473	t->t_root = root;
474	t->t_height = h;
475	return 0;
476}
477
478/*
479 * radix_tree_lookup_ptr:
480 *
481 * an internal helper function used for various exported functions.
482 *
483 * return the pointer to store the node for the given index.
484 *
485 * if alloc is true, try to allocate the storage.  (note for _KERNEL:
486 * in that case, this function can block.)  if the allocation failed or
487 * alloc is false, return NULL.
488 *
489 * if path is not NULL, fill it for the caller's investigation.
490 *
491 * if tagmask is not zero, search only for nodes with the tag set.
492 * note that, however, this function doesn't check the tagmask for the leaf
493 * pointer.  it's a caller's responsibility to investigate the value which
494 * is pointed by the returned pointer if necessary.
495 *
496 * while this function is a bit large, as it's called with some constant
497 * arguments, inlining might have benefits.  anyway, a compiler will decide.
498 */
499
500static inline void **
501radix_tree_lookup_ptr(struct radix_tree *t, uint64_t idx,
502    struct radix_tree_path *path, bool alloc, const unsigned int tagmask)
503{
504	struct radix_tree_node *n;
505	int hshift = RADIX_TREE_BITS_PER_HEIGHT * t->t_height;
506	int shift;
507	void **vpp;
508	const uint64_t mask = (UINT64_C(1) << RADIX_TREE_BITS_PER_HEIGHT) - 1;
509	struct radix_tree_node_ref *refs = NULL;
510
511	/*
512	 * check unsupported combinations
513	 */
514	KASSERT(tagmask == 0 || !alloc);
515	KASSERT(path == NULL || !alloc);
516	vpp = &t->t_root;
517	if (path != NULL) {
518		refs = path->p_refs;
519		refs->pptr = vpp;
520	}
521	n = NULL;
522	for (shift = 64 - RADIX_TREE_BITS_PER_HEIGHT; shift >= 0;) {
523		struct radix_tree_node *c;
524		void *entry;
525		const uint64_t i = (idx >> shift) & mask;
526
527		if (shift >= hshift) {
528			unsigned int newheight;
529
530			KASSERT(vpp == &t->t_root);
531			if (i == 0) {
532				shift -= RADIX_TREE_BITS_PER_HEIGHT;
533				continue;
534			}
535			if (!alloc) {
536				if (path != NULL) {
537					KASSERT((refs - path->p_refs) == 0);
538					path->p_lastidx =
539					    RADIX_TREE_INVALID_HEIGHT;
540				}
541				return NULL;
542			}
543			newheight = shift / RADIX_TREE_BITS_PER_HEIGHT + 1;
544			if (radix_tree_grow(t, newheight)) {
545				return NULL;
546			}
547			hshift = RADIX_TREE_BITS_PER_HEIGHT * t->t_height;
548		}
549		entry = *vpp;
550		c = entry_ptr(entry);
551		if (c == NULL ||
552		    (tagmask != 0 &&
553		    (entry_tagmask(entry) & tagmask) == 0)) {
554			if (!alloc) {
555				if (path != NULL) {
556					path->p_lastidx = refs - path->p_refs;
557				}
558				return NULL;
559			}
560			c = radix_tree_alloc_node();
561			if (c == NULL) {
562				return NULL;
563			}
564			*vpp = c;
565		}
566		n = c;
567		vpp = &n->n_ptrs[i];
568		if (path != NULL) {
569			refs++;
570			refs->pptr = vpp;
571		}
572		shift -= RADIX_TREE_BITS_PER_HEIGHT;
573	}
574	if (alloc) {
575		KASSERT(*vpp == NULL);
576	}
577	if (path != NULL) {
578		path->p_lastidx = refs - path->p_refs;
579	}
580	return vpp;
581}
582
583/*
584 * radix_tree_undo_insert_node:
585 *
586 * Undo the effects of a failed insert.  The conditions that led to the
587 * insert may change and it may not be retried.  If the insert is not
588 * retried, there will be no corresponding radix_tree_remove_node() for
589 * this index in the future.  Therefore any adjustments made to the tree
590 * before memory was exhausted must be reverted.
591 */
592
593static __noinline void
594radix_tree_undo_insert_node(struct radix_tree *t, uint64_t idx)
595{
596	struct radix_tree_path path;
597	int i;
598
599	(void)radix_tree_lookup_ptr(t, idx, &path, false, 0);
600	if (path.p_lastidx == RADIX_TREE_INVALID_HEIGHT) {
601		/*
602		 * no nodes were inserted.
603		 */
604		return;
605	}
606	for (i = path.p_lastidx - 1; i >= 0; i--) {
607		struct radix_tree_node ** const pptr =
608		    (struct radix_tree_node **)path_pptr(t, &path, i);
609		struct radix_tree_node *n;
610
611		KASSERT(pptr != NULL);
612		n = entry_ptr(*pptr);
613		KASSERT(n != NULL);
614		if (radix_tree_sum_node(n) != 0) {
615			break;
616		}
617		radix_tree_free_node(n);
618		*pptr = NULL;
619	}
620	/*
621	 * fix up height
622	 */
623	if (i < 0) {
624		KASSERT(t->t_root == NULL);
625		t->t_height = 0;
626	}
627}
628
629/*
630 * radix_tree_insert_node:
631 *
632 * Insert the node at the given index.
633 *
634 * It's illegal to insert NULL.  It's illegal to insert a non-aligned pointer.
635 *
636 * This function returns ENOMEM if necessary memory allocation failed.
637 * Otherwise, this function returns 0.
638 *
639 * Note that inserting a node can involves memory allocation for intermediate
640 * nodes.  If _KERNEL, it's done with no-sleep IPL_NONE memory allocation.
641 *
642 * For the newly inserted node, all tags are cleared.
643 */
644
645int
646radix_tree_insert_node(struct radix_tree *t, uint64_t idx, void *p)
647{
648	void **vpp;
649
650	KASSERT(p != NULL);
651	KASSERT(entry_tagmask(entry_compose(p, 0)) == 0);
652	vpp = radix_tree_lookup_ptr(t, idx, NULL, true, 0);
653	if (__predict_false(vpp == NULL)) {
654		radix_tree_undo_insert_node(t, idx);
655		return ENOMEM;
656	}
657	KASSERT(*vpp == NULL);
658	*vpp = p;
659	return 0;
660}
661
662/*
663 * radix_tree_replace_node:
664 *
665 * Replace a node at the given index with the given node and return the
666 * replaced one.
667 *
668 * It's illegal to try to replace a node which has not been inserted.
669 *
670 * This function keeps tags intact.
671 */
672
673void *
674radix_tree_replace_node(struct radix_tree *t, uint64_t idx, void *p)
675{
676	void **vpp;
677	void *oldp;
678
679	KASSERT(p != NULL);
680	KASSERT(entry_tagmask(entry_compose(p, 0)) == 0);
681	vpp = radix_tree_lookup_ptr(t, idx, NULL, false, 0);
682	KASSERT(vpp != NULL);
683	oldp = *vpp;
684	KASSERT(oldp != NULL);
685	*vpp = entry_compose(p, entry_tagmask(*vpp));
686	return entry_ptr(oldp);
687}
688
689/*
690 * radix_tree_remove_node:
691 *
692 * Remove the node at the given index.
693 *
694 * It's illegal to try to remove a node which has not been inserted.
695 */
696
697void *
698radix_tree_remove_node(struct radix_tree *t, uint64_t idx)
699{
700	struct radix_tree_path path;
701	void **vpp;
702	void *oldp;
703	int i;
704
705	vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0);
706	KASSERT(vpp != NULL);
707	oldp = *vpp;
708	KASSERT(oldp != NULL);
709	KASSERT(path.p_lastidx == t->t_height);
710	KASSERT(vpp == path_pptr(t, &path, path.p_lastidx));
711	*vpp = NULL;
712	for (i = t->t_height - 1; i >= 0; i--) {
713		void *entry;
714		struct radix_tree_node ** const pptr =
715		    (struct radix_tree_node **)path_pptr(t, &path, i);
716		struct radix_tree_node *n;
717
718		KASSERT(pptr != NULL);
719		entry = *pptr;
720		n = entry_ptr(entry);
721		KASSERT(n != NULL);
722		if (radix_tree_sum_node(n) != 0) {
723			break;
724		}
725		radix_tree_free_node(n);
726		*pptr = NULL;
727	}
728	/*
729	 * fix up height
730	 */
731	if (i < 0) {
732		KASSERT(t->t_root == NULL);
733		t->t_height = 0;
734	}
735	/*
736	 * update tags
737	 */
738	for (; i >= 0; i--) {
739		void *entry;
740		struct radix_tree_node ** const pptr =
741		    (struct radix_tree_node **)path_pptr(t, &path, i);
742		struct radix_tree_node *n;
743		unsigned int newmask;
744
745		KASSERT(pptr != NULL);
746		entry = *pptr;
747		n = entry_ptr(entry);
748		KASSERT(n != NULL);
749		KASSERT(radix_tree_sum_node(n) != 0);
750		newmask = radix_tree_sum_node(n) & RADIX_TREE_TAG_MASK;
751		if (newmask == entry_tagmask(entry)) {
752			break;
753		}
754		*pptr = entry_compose(n, newmask);
755	}
756	/*
757	 * XXX is it worth to try to reduce height?
758	 * if we do that, make radix_tree_grow rollback its change as well.
759	 */
760	return entry_ptr(oldp);
761}
762
763/*
764 * radix_tree_lookup_node:
765 *
766 * Returns the node at the given index.
767 * Returns NULL if nothing is found at the given index.
768 */
769
770void *
771radix_tree_lookup_node(struct radix_tree *t, uint64_t idx)
772{
773	void **vpp;
774
775	vpp = radix_tree_lookup_ptr(t, idx, NULL, false, 0);
776	if (vpp == NULL) {
777		return NULL;
778	}
779	return entry_ptr(*vpp);
780}
781
782static inline void
783gang_lookup_init(struct radix_tree *t, uint64_t idx,
784    struct radix_tree_path *path, const unsigned int tagmask)
785{
786	void **vpp __unused;
787
788	vpp = radix_tree_lookup_ptr(t, idx, path, false, tagmask);
789	KASSERT(vpp == NULL ||
790	    vpp == path_pptr(t, path, path->p_lastidx));
791	KASSERT(&t->t_root == path_pptr(t, path, 0));
792	KASSERT(path->p_lastidx == RADIX_TREE_INVALID_HEIGHT ||
793	   path->p_lastidx == t->t_height ||
794	   !entry_match_p(*path_pptr(t, path, path->p_lastidx), tagmask));
795}
796
797/*
798 * gang_lookup_scan:
799 *
800 * a helper routine for radix_tree_gang_lookup_node and its variants.
801 */
802
803static inline unsigned int
804__attribute__((__always_inline__))
805gang_lookup_scan(struct radix_tree *t, struct radix_tree_path *path,
806    void **results, const unsigned int maxresults, const unsigned int tagmask,
807    const bool reverse, const bool dense)
808{
809
810	/*
811	 * we keep the path updated only for lastidx-1.
812	 * vpp is what path_pptr(t, path, lastidx) would be.
813	 */
814	void **vpp;
815	unsigned int nfound;
816	unsigned int lastidx;
817	/*
818	 * set up scan direction dependant constants so that we can iterate
819	 * n_ptrs as the following.
820	 *
821	 *	for (i = first; i != guard; i += step)
822	 *		visit n->n_ptrs[i];
823	 */
824	const int step = reverse ? -1 : 1;
825	const unsigned int first = reverse ? RADIX_TREE_PTR_PER_NODE - 1 : 0;
826	const unsigned int last = reverse ? 0 : RADIX_TREE_PTR_PER_NODE - 1;
827	const unsigned int guard = last + step;
828
829	KASSERT(maxresults > 0);
830	KASSERT(&t->t_root == path_pptr(t, path, 0));
831	lastidx = path->p_lastidx;
832	KASSERT(lastidx == RADIX_TREE_INVALID_HEIGHT ||
833	   lastidx == t->t_height ||
834	   !entry_match_p(*path_pptr(t, path, lastidx), tagmask));
835	nfound = 0;
836	if (lastidx == RADIX_TREE_INVALID_HEIGHT) {
837		/*
838		 * requested idx is beyond the right-most node.
839		 */
840		if (reverse && !dense) {
841			lastidx = 0;
842			vpp = path_pptr(t, path, lastidx);
843			goto descend;
844		}
845		return 0;
846	}
847	vpp = path_pptr(t, path, lastidx);
848	while (/*CONSTCOND*/true) {
849		struct radix_tree_node *n;
850		unsigned int i;
851
852		if (entry_match_p(*vpp, tagmask)) {
853			KASSERT(lastidx == t->t_height);
854			/*
855			 * record the matching non-NULL leaf.
856			 */
857			results[nfound] = entry_ptr(*vpp);
858			nfound++;
859			if (nfound == maxresults) {
860				return nfound;
861			}
862		} else if (dense) {
863			return nfound;
864		}
865scan_siblings:
866		/*
867		 * try to find the next matching non-NULL sibling.
868		 */
869		if (lastidx == 0) {
870			/*
871			 * the root has no siblings.
872			 * we've done.
873			 */
874			KASSERT(vpp == &t->t_root);
875			break;
876		}
877		n = path_node(t, path, lastidx - 1);
878		for (i = vpp - n->n_ptrs + step; i != guard; i += step) {
879			KASSERT(i < RADIX_TREE_PTR_PER_NODE);
880			if (entry_match_p(n->n_ptrs[i], tagmask)) {
881				vpp = &n->n_ptrs[i];
882				break;
883			} else if (dense) {
884				return nfound;
885			}
886		}
887		if (i == guard) {
888			/*
889			 * not found.  go to parent.
890			 */
891			lastidx--;
892			vpp = path_pptr(t, path, lastidx);
893			goto scan_siblings;
894		}
895descend:
896		/*
897		 * following the left-most (or right-most in the case of
898		 * reverse scan) child node, descend until reaching the leaf or
899		 * a non-matching entry.
900		 */
901		while (entry_match_p(*vpp, tagmask) && lastidx < t->t_height) {
902			/*
903			 * save vpp in the path so that we can come back to this
904			 * node after finishing visiting children.
905			 */
906			path->p_refs[lastidx].pptr = vpp;
907			n = entry_ptr(*vpp);
908			vpp = &n->n_ptrs[first];
909			lastidx++;
910		}
911	}
912	return nfound;
913}
914
915/*
916 * radix_tree_gang_lookup_node:
917 *
918 * Scan the tree starting from the given index in the ascending order and
919 * return found nodes.
920 *
921 * results should be an array large enough to hold maxresults pointers.
922 * This function returns the number of nodes found, up to maxresults.
923 * Returning less than maxresults means there are no more nodes in the tree.
924 *
925 * If dense == true, this function stops scanning when it founds a hole of
926 * indexes.  I.e. an index for which radix_tree_lookup_node would returns NULL.
927 * If dense == false, this function skips holes and continue scanning until
928 * maxresults nodes are found or it reaches the limit of the index range.
929 *
930 * The result of this function is semantically equivalent to what could be
931 * obtained by repeated calls of radix_tree_lookup_node with increasing index.
932 * but this function is expected to be computationally cheaper when looking up
933 * multiple nodes at once.  Especially, it's expected to be much cheaper when
934 * node indexes are distributed sparsely.
935 *
936 * Note that this function doesn't return index values of found nodes.
937 * Thus, in the case of dense == false, if index values are important for
938 * a caller, it's the caller's responsibility to check them, typically
939 * by examining the returned nodes using some caller-specific knowledge
940 * about them.
941 * In the case of dense == true, a node returned via results[N] is always for
942 * the index (idx + N).
943 */
944
945unsigned int
946radix_tree_gang_lookup_node(struct radix_tree *t, uint64_t idx,
947    void **results, unsigned int maxresults, bool dense)
948{
949	struct radix_tree_path path;
950
951	gang_lookup_init(t, idx, &path, 0);
952	return gang_lookup_scan(t, &path, results, maxresults, 0, false, dense);
953}
954
955/*
956 * radix_tree_gang_lookup_node_reverse:
957 *
958 * Same as radix_tree_gang_lookup_node except that this one scans the
959 * tree in the reverse order.  I.e. descending index values.
960 */
961
962unsigned int
963radix_tree_gang_lookup_node_reverse(struct radix_tree *t, uint64_t idx,
964    void **results, unsigned int maxresults, bool dense)
965{
966	struct radix_tree_path path;
967
968	gang_lookup_init(t, idx, &path, 0);
969	return gang_lookup_scan(t, &path, results, maxresults, 0, true, dense);
970}
971
972/*
973 * radix_tree_gang_lookup_tagged_node:
974 *
975 * Same as radix_tree_gang_lookup_node except that this one only returns
976 * nodes tagged with tagid.
977 *
978 * It's illegal to call this function with tagmask 0.
979 */
980
981unsigned int
982radix_tree_gang_lookup_tagged_node(struct radix_tree *t, uint64_t idx,
983    void **results, unsigned int maxresults, bool dense, unsigned int tagmask)
984{
985	struct radix_tree_path path;
986
987	KASSERT(tagmask != 0);
988	gang_lookup_init(t, idx, &path, tagmask);
989	return gang_lookup_scan(t, &path, results, maxresults, tagmask, false,
990	    dense);
991}
992
993/*
994 * radix_tree_gang_lookup_tagged_node_reverse:
995 *
996 * Same as radix_tree_gang_lookup_tagged_node except that this one scans the
997 * tree in the reverse order.  I.e. descending index values.
998 */
999
1000unsigned int
1001radix_tree_gang_lookup_tagged_node_reverse(struct radix_tree *t, uint64_t idx,
1002    void **results, unsigned int maxresults, bool dense, unsigned int tagmask)
1003{
1004	struct radix_tree_path path;
1005
1006	KASSERT(tagmask != 0);
1007	gang_lookup_init(t, idx, &path, tagmask);
1008	return gang_lookup_scan(t, &path, results, maxresults, tagmask, true,
1009	    dense);
1010}
1011
1012/*
1013 * radix_tree_get_tag:
1014 *
1015 * Return the tagmask for the node at the given index.
1016 *
1017 * It's illegal to call this function for a node which has not been inserted.
1018 */
1019
1020unsigned int
1021radix_tree_get_tag(struct radix_tree *t, uint64_t idx, unsigned int tagmask)
1022{
1023	/*
1024	 * the following two implementations should behave same.
1025	 * the former one was chosen because it seems faster.
1026	 */
1027#if 1
1028	void **vpp;
1029
1030	vpp = radix_tree_lookup_ptr(t, idx, NULL, false, tagmask);
1031	if (vpp == NULL) {
1032		return false;
1033	}
1034	KASSERT(*vpp != NULL);
1035	return (entry_tagmask(*vpp) & tagmask);
1036#else
1037	void **vpp;
1038
1039	vpp = radix_tree_lookup_ptr(t, idx, NULL, false, 0);
1040	KASSERT(vpp != NULL);
1041	return (entry_tagmask(*vpp) & tagmask);
1042#endif
1043}
1044
1045/*
1046 * radix_tree_set_tag:
1047 *
1048 * Set the tag for the node at the given index.
1049 *
1050 * It's illegal to call this function for a node which has not been inserted.
1051 * It's illegal to call this function with tagmask 0.
1052 */
1053
1054void
1055radix_tree_set_tag(struct radix_tree *t, uint64_t idx, unsigned int tagmask)
1056{
1057	struct radix_tree_path path;
1058	void **vpp __unused;
1059	int i;
1060
1061	KASSERT(tagmask != 0);
1062	vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0);
1063	KASSERT(vpp != NULL);
1064	KASSERT(*vpp != NULL);
1065	KASSERT(path.p_lastidx == t->t_height);
1066	KASSERT(vpp == path_pptr(t, &path, path.p_lastidx));
1067	for (i = t->t_height; i >= 0; i--) {
1068		void ** const pptr = (void **)path_pptr(t, &path, i);
1069		void *entry;
1070
1071		KASSERT(pptr != NULL);
1072		entry = *pptr;
1073		if ((entry_tagmask(entry) & tagmask) != 0) {
1074			break;
1075		}
1076		*pptr = (void *)((uintptr_t)entry | tagmask);
1077	}
1078}
1079
1080/*
1081 * radix_tree_clear_tag:
1082 *
1083 * Clear the tag for the node at the given index.
1084 *
1085 * It's illegal to call this function for a node which has not been inserted.
1086 * It's illegal to call this function with tagmask 0.
1087 */
1088
1089void
1090radix_tree_clear_tag(struct radix_tree *t, uint64_t idx, unsigned int tagmask)
1091{
1092	struct radix_tree_path path;
1093	void **vpp;
1094	int i;
1095
1096	KASSERT(tagmask != 0);
1097	vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0);
1098	KASSERT(vpp != NULL);
1099	KASSERT(*vpp != NULL);
1100	KASSERT(path.p_lastidx == t->t_height);
1101	KASSERT(vpp == path_pptr(t, &path, path.p_lastidx));
1102	/*
1103	 * if already cleared, nothing to do
1104	 */
1105	if ((entry_tagmask(*vpp) & tagmask) == 0) {
1106		return;
1107	}
1108	/*
1109	 * clear the tag only if no children have the tag.
1110	 */
1111	for (i = t->t_height; i >= 0; i--) {
1112		void ** const pptr = (void **)path_pptr(t, &path, i);
1113		void *entry;
1114
1115		KASSERT(pptr != NULL);
1116		entry = *pptr;
1117		KASSERT((entry_tagmask(entry) & tagmask) != 0);
1118		*pptr = entry_compose(entry_ptr(entry),
1119		    entry_tagmask(entry) & ~tagmask);
1120		/*
1121		 * check if we should proceed to process the next level.
1122		 */
1123		if (0 < i) {
1124			struct radix_tree_node *n = path_node(t, &path, i - 1);
1125
1126			if ((radix_tree_sum_node(n) & tagmask) != 0) {
1127				break;
1128			}
1129		}
1130	}
1131}
1132
1133#if defined(UNITTEST)
1134
1135#include <inttypes.h>
1136#include <stdio.h>
1137
1138static void
1139radix_tree_dump_node(const struct radix_tree *t, void *vp,
1140    uint64_t offset, unsigned int height)
1141{
1142	struct radix_tree_node *n;
1143	unsigned int i;
1144
1145	for (i = 0; i < t->t_height - height; i++) {
1146		printf(" ");
1147	}
1148	if (entry_tagmask(vp) == 0) {
1149		printf("[%" PRIu64 "] %p", offset, entry_ptr(vp));
1150	} else {
1151		printf("[%" PRIu64 "] %p (tagmask=0x%x)", offset, entry_ptr(vp),
1152		    entry_tagmask(vp));
1153	}
1154	if (height == 0) {
1155		printf(" (leaf)\n");
1156		return;
1157	}
1158	n = entry_ptr(vp);
1159	assert((radix_tree_sum_node(n) & RADIX_TREE_TAG_MASK) ==
1160	    entry_tagmask(vp));
1161	printf(" (%u children)\n", radix_tree_node_count_ptrs(n));
1162	for (i = 0; i < __arraycount(n->n_ptrs); i++) {
1163		void *c;
1164
1165		c = n->n_ptrs[i];
1166		if (c == NULL) {
1167			continue;
1168		}
1169		radix_tree_dump_node(t, c,
1170		    offset + i * (UINT64_C(1) <<
1171		    (RADIX_TREE_BITS_PER_HEIGHT * (height - 1))), height - 1);
1172	}
1173}
1174
1175void radix_tree_dump(const struct radix_tree *);
1176
1177void
1178radix_tree_dump(const struct radix_tree *t)
1179{
1180
1181	printf("tree %p height=%u\n", t, t->t_height);
1182	radix_tree_dump_node(t, t->t_root, 0, t->t_height);
1183}
1184
1185static void
1186test1(void)
1187{
1188	struct radix_tree s;
1189	struct radix_tree *t = &s;
1190	void *results[3];
1191
1192	radix_tree_init_tree(t);
1193	radix_tree_dump(t);
1194	assert(radix_tree_lookup_node(t, 0) == NULL);
1195	assert(radix_tree_lookup_node(t, 1000) == NULL);
1196	assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 0);
1197	assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 0);
1198	assert(radix_tree_gang_lookup_node(t, 1000, results, 3, false) == 0);
1199	assert(radix_tree_gang_lookup_node(t, 1000, results, 3, true) == 0);
1200	assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, false) ==
1201	    0);
1202	assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, true) ==
1203	    0);
1204	assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, false)
1205	    == 0);
1206	assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, true)
1207	    == 0);
1208	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, false, 1)
1209	    == 0);
1210	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, true, 1)
1211	    == 0);
1212	assert(radix_tree_gang_lookup_tagged_node(t, 1000, results, 3, false, 1)
1213	    == 0);
1214	assert(radix_tree_gang_lookup_tagged_node(t, 1000, results, 3, true, 1)
1215	    == 0);
1216	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3,
1217	    false, 1) == 0);
1218	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3,
1219	    true, 1) == 0);
1220	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 1000, results, 3,
1221	    false, 1) == 0);
1222	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 1000, results, 3,
1223	    true, 1) == 0);
1224	assert(radix_tree_empty_tree_p(t));
1225	assert(radix_tree_empty_tagged_tree_p(t, 1));
1226	assert(radix_tree_empty_tagged_tree_p(t, 2));
1227	assert(radix_tree_insert_node(t, 0, (void *)0xdeadbea0) == 0);
1228	assert(!radix_tree_empty_tree_p(t));
1229	assert(radix_tree_empty_tagged_tree_p(t, 1));
1230	assert(radix_tree_empty_tagged_tree_p(t, 2));
1231	assert(radix_tree_lookup_node(t, 0) == (void *)0xdeadbea0);
1232	assert(radix_tree_lookup_node(t, 1000) == NULL);
1233	memset(results, 0, sizeof(results));
1234	assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 1);
1235	assert(results[0] == (void *)0xdeadbea0);
1236	memset(results, 0, sizeof(results));
1237	assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 1);
1238	assert(results[0] == (void *)0xdeadbea0);
1239	assert(radix_tree_gang_lookup_node(t, 1000, results, 3, false) == 0);
1240	assert(radix_tree_gang_lookup_node(t, 1000, results, 3, true) == 0);
1241	memset(results, 0, sizeof(results));
1242	assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, false) ==
1243	    1);
1244	assert(results[0] == (void *)0xdeadbea0);
1245	memset(results, 0, sizeof(results));
1246	assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, true) ==
1247	    1);
1248	assert(results[0] == (void *)0xdeadbea0);
1249	memset(results, 0, sizeof(results));
1250	assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, false)
1251	    == 1);
1252	assert(results[0] == (void *)0xdeadbea0);
1253	assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, true)
1254	    == 0);
1255	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, false, 1)
1256	    == 0);
1257	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, true, 1)
1258	    == 0);
1259	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3,
1260	    false, 1) == 0);
1261	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3,
1262	    true, 1) == 0);
1263	assert(radix_tree_insert_node(t, 1000, (void *)0xdeadbea0) == 0);
1264	assert(radix_tree_remove_node(t, 0) == (void *)0xdeadbea0);
1265	assert(!radix_tree_empty_tree_p(t));
1266	radix_tree_dump(t);
1267	assert(radix_tree_lookup_node(t, 0) == NULL);
1268	assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0);
1269	memset(results, 0, sizeof(results));
1270	assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 1);
1271	assert(results[0] == (void *)0xdeadbea0);
1272	assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 0);
1273	memset(results, 0, sizeof(results));
1274	assert(radix_tree_gang_lookup_node(t, 1000, results, 3, false) == 1);
1275	assert(results[0] == (void *)0xdeadbea0);
1276	memset(results, 0, sizeof(results));
1277	assert(radix_tree_gang_lookup_node(t, 1000, results, 3, true) == 1);
1278	assert(results[0] == (void *)0xdeadbea0);
1279	assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, false)
1280	    == 0);
1281	assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, true)
1282	    == 0);
1283	memset(results, 0, sizeof(results));
1284	assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, false)
1285	    == 1);
1286	memset(results, 0, sizeof(results));
1287	assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, true)
1288	    == 1);
1289	assert(results[0] == (void *)0xdeadbea0);
1290	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, false, 1)
1291	    == 0);
1292	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, true, 1)
1293	    == 0);
1294	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3,
1295	    false, 1) == 0);
1296	assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3,
1297	    true, 1) == 0);
1298	assert(!radix_tree_get_tag(t, 1000, 1));
1299	assert(!radix_tree_get_tag(t, 1000, 2));
1300	assert(radix_tree_get_tag(t, 1000, 2 | 1) == 0);
1301	assert(radix_tree_empty_tagged_tree_p(t, 1));
1302	assert(radix_tree_empty_tagged_tree_p(t, 2));
1303	radix_tree_set_tag(t, 1000, 2);
1304	assert(!radix_tree_get_tag(t, 1000, 1));
1305	assert(radix_tree_get_tag(t, 1000, 2));
1306	assert(radix_tree_get_tag(t, 1000, 2 | 1) == 2);
1307	assert(radix_tree_empty_tagged_tree_p(t, 1));
1308	assert(!radix_tree_empty_tagged_tree_p(t, 2));
1309	radix_tree_dump(t);
1310	assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0);
1311	assert(radix_tree_insert_node(t, 0, (void *)0xbea0) == 0);
1312	radix_tree_dump(t);
1313	assert(radix_tree_lookup_node(t, 0) == (void *)0xbea0);
1314	assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0);
1315	assert(radix_tree_insert_node(t, UINT64_C(10000000000), (void *)0xdea0)
1316	    == 0);
1317	radix_tree_dump(t);
1318	assert(radix_tree_lookup_node(t, 0) == (void *)0xbea0);
1319	assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0);
1320	assert(radix_tree_lookup_node(t, UINT64_C(10000000000)) ==
1321	    (void *)0xdea0);
1322	radix_tree_dump(t);
1323	assert(!radix_tree_get_tag(t, 0, 2));
1324	assert(radix_tree_get_tag(t, 1000, 2));
1325	assert(!radix_tree_get_tag(t, UINT64_C(10000000000), 1));
1326	radix_tree_set_tag(t, 0, 2);
1327	radix_tree_set_tag(t, UINT64_C(10000000000), 2);
1328	radix_tree_dump(t);
1329	assert(radix_tree_get_tag(t, 0, 2));
1330	assert(radix_tree_get_tag(t, 1000, 2));
1331	assert(radix_tree_get_tag(t, UINT64_C(10000000000), 2));
1332	radix_tree_clear_tag(t, 0, 2);
1333	radix_tree_clear_tag(t, UINT64_C(10000000000), 2);
1334	radix_tree_dump(t);
1335	assert(!radix_tree_get_tag(t, 0, 2));
1336	assert(radix_tree_get_tag(t, 1000, 2));
1337	assert(!radix_tree_get_tag(t, UINT64_C(10000000000), 2));
1338	radix_tree_dump(t);
1339	assert(radix_tree_replace_node(t, 1000, (void *)0x12345678) ==
1340	    (void *)0xdeadbea0);
1341	assert(!radix_tree_get_tag(t, 1000, 1));
1342	assert(radix_tree_get_tag(t, 1000, 2));
1343	assert(radix_tree_get_tag(t, 1000, 2 | 1) == 2);
1344	memset(results, 0, sizeof(results));
1345	assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 3);
1346	assert(results[0] == (void *)0xbea0);
1347	assert(results[1] == (void *)0x12345678);
1348	assert(results[2] == (void *)0xdea0);
1349	memset(results, 0, sizeof(results));
1350	assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 1);
1351	assert(results[0] == (void *)0xbea0);
1352	memset(results, 0, sizeof(results));
1353	assert(radix_tree_gang_lookup_node(t, 1, results, 3, false) == 2);
1354	assert(results[0] == (void *)0x12345678);
1355	assert(results[1] == (void *)0xdea0);
1356	assert(radix_tree_gang_lookup_node(t, 1, results, 3, true) == 0);
1357	memset(results, 0, sizeof(results));
1358	assert(radix_tree_gang_lookup_node(t, 1001, results, 3, false) == 1);
1359	assert(results[0] == (void *)0xdea0);
1360	assert(radix_tree_gang_lookup_node(t, 1001, results, 3, true) == 0);
1361	assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000001), results, 3,
1362	    false) == 0);
1363	assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000001), results, 3,
1364	    true) == 0);
1365	assert(radix_tree_gang_lookup_node(t, UINT64_C(1000000000000), results,
1366	    3, false) == 0);
1367	assert(radix_tree_gang_lookup_node(t, UINT64_C(1000000000000), results,
1368	    3, true) == 0);
1369	memset(results, 0, sizeof(results));
1370	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 100, false, 2)
1371	    == 1);
1372	assert(results[0] == (void *)0x12345678);
1373	assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 100, true, 2)
1374	    == 0);
1375	assert(entry_tagmask(t->t_root) != 0);
1376	assert(radix_tree_remove_node(t, 1000) == (void *)0x12345678);
1377	assert(entry_tagmask(t->t_root) == 0);
1378	radix_tree_dump(t);
1379	assert(radix_tree_insert_node(t, UINT64_C(10000000001), (void *)0xfff0)
1380	    == 0);
1381	memset(results, 0, sizeof(results));
1382	assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000000), results, 3,
1383	    false) == 2);
1384	assert(results[0] == (void *)0xdea0);
1385	assert(results[1] == (void *)0xfff0);
1386	memset(results, 0, sizeof(results));
1387	assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000000), results, 3,
1388	    true) == 2);
1389	assert(results[0] == (void *)0xdea0);
1390	assert(results[1] == (void *)0xfff0);
1391	memset(results, 0, sizeof(results));
1392	assert(radix_tree_gang_lookup_node_reverse(t, UINT64_C(10000000001),
1393	    results, 3, false) == 3);
1394	assert(results[0] == (void *)0xfff0);
1395	assert(results[1] == (void *)0xdea0);
1396	assert(results[2] == (void *)0xbea0);
1397	memset(results, 0, sizeof(results));
1398	assert(radix_tree_gang_lookup_node_reverse(t, UINT64_C(10000000001),
1399	    results, 3, true) == 2);
1400	assert(results[0] == (void *)0xfff0);
1401	assert(results[1] == (void *)0xdea0);
1402	assert(radix_tree_remove_node(t, UINT64_C(10000000000)) ==
1403	    (void *)0xdea0);
1404	assert(radix_tree_remove_node(t, UINT64_C(10000000001)) ==
1405	    (void *)0xfff0);
1406	radix_tree_dump(t);
1407	assert(radix_tree_remove_node(t, 0) == (void *)0xbea0);
1408	radix_tree_dump(t);
1409	radix_tree_fini_tree(t);
1410}
1411
1412#include <sys/time.h>
1413
1414struct testnode {
1415	uint64_t idx;
1416	bool tagged[RADIX_TREE_TAG_ID_MAX];
1417};
1418
1419static void
1420printops(const char *title, const char *name, int tag, unsigned int n,
1421    const struct timeval *stv, const struct timeval *etv)
1422{
1423	uint64_t s = stv->tv_sec * 1000000 + stv->tv_usec;
1424	uint64_t e = etv->tv_sec * 1000000 + etv->tv_usec;
1425
1426	printf("RESULT %s %s %d %lf op/s\n", title, name, tag,
1427	    (double)n / (e - s) * 1000000);
1428}
1429
1430#define	TEST2_GANG_LOOKUP_NODES	16
1431
1432static bool
1433test2_should_tag(unsigned int i, unsigned int tagid)
1434{
1435
1436	if (tagid == 0) {
1437		return (i % 4) == 0;	/* 25% */
1438	} else {
1439		return (i % 7) == 0;	/* 14% */
1440	}
1441	return 1;
1442}
1443
1444static void
1445check_tag_count(const unsigned int *ntagged, unsigned int tagmask,
1446    unsigned int count)
1447{
1448	unsigned int tag;
1449
1450	for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) {
1451		if ((tagmask & (1 << tag)) == 0) {
1452			continue;
1453		}
1454		if (((tagmask - 1) & tagmask) == 0) {
1455			assert(count == ntagged[tag]);
1456		} else {
1457			assert(count >= ntagged[tag]);
1458		}
1459	}
1460}
1461
1462static void
1463test2(const char *title, bool dense)
1464{
1465	struct radix_tree s;
1466	struct radix_tree *t = &s;
1467	struct testnode *n;
1468	unsigned int i;
1469	unsigned int nnodes = 100000;
1470	unsigned int removed;
1471	unsigned int tag;
1472	unsigned int tagmask;
1473	unsigned int ntagged[RADIX_TREE_TAG_ID_MAX];
1474	struct testnode *nodes;
1475	struct timeval stv;
1476	struct timeval etv;
1477
1478	nodes = malloc(nnodes * sizeof(*nodes));
1479	for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) {
1480		ntagged[tag] = 0;
1481	}
1482	radix_tree_init_tree(t);
1483	for (i = 0; i < nnodes; i++) {
1484		n = &nodes[i];
1485		n->idx = random();
1486		if (sizeof(long) == 4) {
1487			n->idx <<= 32;
1488			n->idx |= (uint32_t)random();
1489		}
1490		if (dense) {
1491			n->idx %= nnodes * 2;
1492		}
1493		while (radix_tree_lookup_node(t, n->idx) != NULL) {
1494			n->idx++;
1495		}
1496		radix_tree_insert_node(t, n->idx, n);
1497		for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) {
1498			tagmask = 1 << tag;
1499
1500			n->tagged[tag] = test2_should_tag(i, tag);
1501			if (n->tagged[tag]) {
1502				radix_tree_set_tag(t, n->idx, tagmask);
1503				ntagged[tag]++;
1504			}
1505			assert((n->tagged[tag] ? tagmask : 0) ==
1506			    radix_tree_get_tag(t, n->idx, tagmask));
1507		}
1508	}
1509
1510	gettimeofday(&stv, NULL);
1511	for (i = 0; i < nnodes; i++) {
1512		n = &nodes[i];
1513		assert(radix_tree_lookup_node(t, n->idx) == n);
1514	}
1515	gettimeofday(&etv, NULL);
1516	printops(title, "lookup", 0, nnodes, &stv, &etv);
1517
1518	for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) {
1519		unsigned int count = 0;
1520
1521		gettimeofday(&stv, NULL);
1522		for (i = 0; i < nnodes; i++) {
1523			unsigned int tagged;
1524
1525			n = &nodes[i];
1526			tagged = radix_tree_get_tag(t, n->idx, tagmask);
1527			assert((tagged & ~tagmask) == 0);
1528			for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) {
1529				assert((tagmask & (1 << tag)) == 0 ||
1530				    n->tagged[tag] == !!(tagged & (1 << tag)));
1531			}
1532			if (tagged) {
1533				count++;
1534			}
1535		}
1536		gettimeofday(&etv, NULL);
1537		check_tag_count(ntagged, tagmask, count);
1538		printops(title, "get_tag", tagmask, nnodes, &stv, &etv);
1539	}
1540
1541	gettimeofday(&stv, NULL);
1542	for (i = 0; i < nnodes; i++) {
1543		n = &nodes[i];
1544		radix_tree_remove_node(t, n->idx);
1545	}
1546	gettimeofday(&etv, NULL);
1547	printops(title, "remove", 0, nnodes, &stv, &etv);
1548
1549	gettimeofday(&stv, NULL);
1550	for (i = 0; i < nnodes; i++) {
1551		n = &nodes[i];
1552		radix_tree_insert_node(t, n->idx, n);
1553	}
1554	gettimeofday(&etv, NULL);
1555	printops(title, "insert", 0, nnodes, &stv, &etv);
1556
1557	for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) {
1558		tagmask = 1 << tag;
1559
1560		ntagged[tag] = 0;
1561		gettimeofday(&stv, NULL);
1562		for (i = 0; i < nnodes; i++) {
1563			n = &nodes[i];
1564			if (n->tagged[tag]) {
1565				radix_tree_set_tag(t, n->idx, tagmask);
1566				ntagged[tag]++;
1567			}
1568		}
1569		gettimeofday(&etv, NULL);
1570		printops(title, "set_tag", tag, ntagged[tag], &stv, &etv);
1571	}
1572
1573	gettimeofday(&stv, NULL);
1574	{
1575		struct testnode *results[TEST2_GANG_LOOKUP_NODES];
1576		uint64_t nextidx;
1577		unsigned int nfound;
1578		unsigned int total;
1579
1580		nextidx = 0;
1581		total = 0;
1582		while ((nfound = radix_tree_gang_lookup_node(t, nextidx,
1583		    (void *)results, __arraycount(results), false)) > 0) {
1584			nextidx = results[nfound - 1]->idx + 1;
1585			total += nfound;
1586			if (nextidx == 0) {
1587				break;
1588			}
1589		}
1590		assert(total == nnodes);
1591	}
1592	gettimeofday(&etv, NULL);
1593	printops(title, "ganglookup", 0, nnodes, &stv, &etv);
1594
1595	gettimeofday(&stv, NULL);
1596	{
1597		struct testnode *results[TEST2_GANG_LOOKUP_NODES];
1598		uint64_t nextidx;
1599		unsigned int nfound;
1600		unsigned int total;
1601
1602		nextidx = UINT64_MAX;
1603		total = 0;
1604		while ((nfound = radix_tree_gang_lookup_node_reverse(t, nextidx,
1605		    (void *)results, __arraycount(results), false)) > 0) {
1606			nextidx = results[nfound - 1]->idx - 1;
1607			total += nfound;
1608			if (nextidx == UINT64_MAX) {
1609				break;
1610			}
1611		}
1612		assert(total == nnodes);
1613	}
1614	gettimeofday(&etv, NULL);
1615	printops(title, "ganglookup_reverse", 0, nnodes, &stv, &etv);
1616
1617	for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) {
1618		unsigned int total = 0;
1619
1620		gettimeofday(&stv, NULL);
1621		{
1622			struct testnode *results[TEST2_GANG_LOOKUP_NODES];
1623			uint64_t nextidx;
1624			unsigned int nfound;
1625
1626			nextidx = 0;
1627			while ((nfound = radix_tree_gang_lookup_tagged_node(t,
1628			    nextidx, (void *)results, __arraycount(results),
1629			    false, tagmask)) > 0) {
1630				nextidx = results[nfound - 1]->idx + 1;
1631				total += nfound;
1632			}
1633		}
1634		gettimeofday(&etv, NULL);
1635		check_tag_count(ntagged, tagmask, total);
1636		assert(tagmask != 0 || total == 0);
1637		printops(title, "ganglookup_tag", tagmask, total, &stv, &etv);
1638	}
1639
1640	for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) {
1641		unsigned int total = 0;
1642
1643		gettimeofday(&stv, NULL);
1644		{
1645			struct testnode *results[TEST2_GANG_LOOKUP_NODES];
1646			uint64_t nextidx;
1647			unsigned int nfound;
1648
1649			nextidx = UINT64_MAX;
1650			while ((nfound =
1651			    radix_tree_gang_lookup_tagged_node_reverse(t,
1652			    nextidx, (void *)results, __arraycount(results),
1653			    false, tagmask)) > 0) {
1654				nextidx = results[nfound - 1]->idx - 1;
1655				total += nfound;
1656				if (nextidx == UINT64_MAX) {
1657					break;
1658				}
1659			}
1660		}
1661		gettimeofday(&etv, NULL);
1662		check_tag_count(ntagged, tagmask, total);
1663		assert(tagmask != 0 || total == 0);
1664		printops(title, "ganglookup_tag_reverse", tagmask, total,
1665		    &stv, &etv);
1666	}
1667
1668	removed = 0;
1669	for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) {
1670		unsigned int total;
1671
1672		total = 0;
1673		tagmask = 1 << tag;
1674		gettimeofday(&stv, NULL);
1675		{
1676			struct testnode *results[TEST2_GANG_LOOKUP_NODES];
1677			uint64_t nextidx;
1678			unsigned int nfound;
1679
1680			nextidx = 0;
1681			while ((nfound = radix_tree_gang_lookup_tagged_node(t,
1682			    nextidx, (void *)results, __arraycount(results),
1683			    false, tagmask)) > 0) {
1684				for (i = 0; i < nfound; i++) {
1685					radix_tree_remove_node(t,
1686					    results[i]->idx);
1687				}
1688				nextidx = results[nfound - 1]->idx + 1;
1689				total += nfound;
1690				if (nextidx == 0) {
1691					break;
1692				}
1693			}
1694		}
1695		gettimeofday(&etv, NULL);
1696		if (tag == 0) {
1697			check_tag_count(ntagged, tagmask, total);
1698		} else {
1699			assert(total <= ntagged[tag]);
1700		}
1701		printops(title, "ganglookup_tag+remove", tagmask, total, &stv,
1702		    &etv);
1703		removed += total;
1704	}
1705
1706	gettimeofday(&stv, NULL);
1707	{
1708		struct testnode *results[TEST2_GANG_LOOKUP_NODES];
1709		uint64_t nextidx;
1710		unsigned int nfound;
1711		unsigned int total;
1712
1713		nextidx = 0;
1714		total = 0;
1715		while ((nfound = radix_tree_gang_lookup_node(t, nextidx,
1716		    (void *)results, __arraycount(results), false)) > 0) {
1717			for (i = 0; i < nfound; i++) {
1718				assert(results[i] == radix_tree_remove_node(t,
1719				    results[i]->idx));
1720			}
1721			nextidx = results[nfound - 1]->idx + 1;
1722			total += nfound;
1723			if (nextidx == 0) {
1724				break;
1725			}
1726		}
1727		assert(total == nnodes - removed);
1728	}
1729	gettimeofday(&etv, NULL);
1730	printops(title, "ganglookup+remove", 0, nnodes - removed, &stv, &etv);
1731
1732	assert(radix_tree_empty_tree_p(t));
1733	for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) {
1734		assert(radix_tree_empty_tagged_tree_p(t, tagmask));
1735	}
1736	radix_tree_fini_tree(t);
1737	free(nodes);
1738}
1739
1740int
1741main(int argc, char *argv[])
1742{
1743
1744	test1();
1745	test2("dense", true);
1746	test2("sparse", false);
1747	return 0;
1748}
1749
1750#endif /* defined(UNITTEST) */
1751