1/*
2 * Copyright (c) 2013 EMC Corp.
3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 */
29
30/*
31 * Path-compressed radix trie implementation.
32 *
33 * The implementation takes into account the following rationale:
34 * - Size of the nodes should be as small as possible but still big enough
35 *   to avoid a large maximum depth for the trie.  This is a balance
36 *   between the necessity to not wire too much physical memory for the nodes
37 *   and the necessity to avoid too much cache pollution during the trie
38 *   operations.
39 * - There is not a huge bias toward the number of lookup operations over
40 *   the number of insert and remove operations.  This basically implies
41 *   that optimizations supposedly helping one operation but hurting the
42 *   other might be carefully evaluated.
43 * - On average not many nodes are expected to be fully populated, hence
44 *   level compression may just complicate things.
45 */
46
47#include <sys/cdefs.h>
48__FBSDID("$FreeBSD$");
49
50#include "opt_ddb.h"
51
52#include <sys/param.h>
53#include <sys/systm.h>
54#include <sys/kernel.h>
55#include <sys/pctrie.h>
56
57#ifdef DDB
58#include <ddb/ddb.h>
59#endif
60
61/*
62 * These widths should allow the pointers to a node's children to fit within
63 * a single cache line.  The extra levels from a narrow width should not be
64 * a problem thanks to path compression.
65 */
66#ifdef __LP64__
67#define	PCTRIE_WIDTH	4
68#else
69#define	PCTRIE_WIDTH	3
70#endif
71
72#define	PCTRIE_COUNT	(1 << PCTRIE_WIDTH)
73#define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
74#define	PCTRIE_LIMIT	(howmany((sizeof(uint64_t) * NBBY), PCTRIE_WIDTH) - 1)
75
76/* Flag bits stored in node pointers. */
77#define	PCTRIE_ISLEAF	0x1
78#define	PCTRIE_FLAGS	0x1
79#define	PCTRIE_PAD	PCTRIE_FLAGS
80
81/* Returns one unit associated with specified level. */
82#define	PCTRIE_UNITLEVEL(lev)						\
83	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
84
85struct pctrie_node {
86	uint64_t	 pn_owner;			/* Owner of record. */
87	uint16_t	 pn_count;			/* Valid children. */
88	uint16_t	 pn_clev;			/* Current level. */
89	void		*pn_child[PCTRIE_COUNT];	/* Child nodes. */
90};
91
92/*
93 * Allocate a node.  Pre-allocation should ensure that the request
94 * will always be satisfied.
95 */
96static __inline struct pctrie_node *
97pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
98    uint16_t count, uint16_t clevel)
99{
100	struct pctrie_node *node;
101
102	node = allocfn(ptree);
103	if (node == NULL)
104		return (NULL);
105	node->pn_owner = owner;
106	node->pn_count = count;
107	node->pn_clev = clevel;
108
109	return (node);
110}
111
112/*
113 * Free radix node.
114 */
115static __inline void
116pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
117    pctrie_free_t freefn)
118{
119#ifdef INVARIANTS
120	int slot;
121
122	KASSERT(node->pn_count == 0,
123	    ("pctrie_node_put: node %p has %d children", node,
124	    node->pn_count));
125	for (slot = 0; slot < PCTRIE_COUNT; slot++)
126		KASSERT(node->pn_child[slot] == NULL,
127		    ("pctrie_node_put: node %p has a child", node));
128#endif
129	freefn(ptree, node);
130}
131
132/*
133 * Return the position in the array for a given level.
134 */
135static __inline int
136pctrie_slot(uint64_t index, uint16_t level)
137{
138
139	return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
140}
141
142/* Trims the key after the specified level. */
143static __inline uint64_t
144pctrie_trimkey(uint64_t index, uint16_t level)
145{
146	uint64_t ret;
147
148	ret = index;
149	if (level > 0) {
150		ret >>= level * PCTRIE_WIDTH;
151		ret <<= level * PCTRIE_WIDTH;
152	}
153	return (ret);
154}
155
156/*
157 * Get the root node for a tree.
158 */
159static __inline struct pctrie_node *
160pctrie_getroot(struct pctrie *ptree)
161{
162
163	return ((struct pctrie_node *)ptree->pt_root);
164}
165
166/*
167 * Set the root node for a tree.
168 */
169static __inline void
170pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
171{
172
173	ptree->pt_root = (uintptr_t)node;
174}
175
176/*
177 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
178 */
179static __inline boolean_t
180pctrie_isleaf(struct pctrie_node *node)
181{
182
183	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
184}
185
186/*
187 * Returns the associated val extracted from node.
188 */
189static __inline uint64_t *
190pctrie_toval(struct pctrie_node *node)
191{
192
193	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
194}
195
196/*
197 * Adds the val as a child of the provided node.
198 */
199static __inline void
200pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
201    uint64_t *val)
202{
203	int slot;
204
205	slot = pctrie_slot(index, clev);
206	node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
207}
208
209/*
210 * Returns the slot where two keys differ.
211 * It cannot accept 2 equal keys.
212 */
213static __inline uint16_t
214pctrie_keydiff(uint64_t index1, uint64_t index2)
215{
216	uint16_t clev;
217
218	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
219	    __func__, (uintmax_t)index1));
220
221	index1 ^= index2;
222	for (clev = PCTRIE_LIMIT;; clev--)
223		if (pctrie_slot(index1, clev) != 0)
224			return (clev);
225}
226
227/*
228 * Returns TRUE if it can be determined that key does not belong to the
229 * specified node.  Otherwise, returns FALSE.
230 */
231static __inline boolean_t
232pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
233{
234
235	if (node->pn_clev < PCTRIE_LIMIT) {
236		idx = pctrie_trimkey(idx, node->pn_clev + 1);
237		return (idx != node->pn_owner);
238	}
239	return (FALSE);
240}
241
242/*
243 * Internal helper for pctrie_reclaim_allnodes().
244 * This function is recursive.
245 */
246static void
247pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
248    pctrie_free_t freefn)
249{
250	int slot;
251
252	KASSERT(node->pn_count <= PCTRIE_COUNT,
253	    ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
254	for (slot = 0; node->pn_count != 0; slot++) {
255		if (node->pn_child[slot] == NULL)
256			continue;
257		if (!pctrie_isleaf(node->pn_child[slot]))
258			pctrie_reclaim_allnodes_int(ptree,
259			    node->pn_child[slot], freefn);
260		node->pn_child[slot] = NULL;
261		node->pn_count--;
262	}
263	pctrie_node_put(ptree, node, freefn);
264}
265
266/*
267 * pctrie node zone initializer.
268 */
269int
270pctrie_zone_init(void *mem, int size __unused, int flags __unused)
271{
272	struct pctrie_node *node;
273
274	node = mem;
275	memset(node->pn_child, 0, sizeof(node->pn_child));
276	return (0);
277}
278
279size_t
280pctrie_node_size(void)
281{
282
283	return (sizeof(struct pctrie_node));
284}
285
286/*
287 * Inserts the key-value pair into the trie.
288 * Panics if the key already exists.
289 */
290int
291pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
292{
293	uint64_t index, newind;
294	void **parentp;
295	struct pctrie_node *node, *tmp;
296	uint64_t *m;
297	int slot;
298	uint16_t clev;
299
300	index = *val;
301
302	/*
303	 * The owner of record for root is not really important because it
304	 * will never be used.
305	 */
306	node = pctrie_getroot(ptree);
307	if (node == NULL) {
308		ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
309		return (0);
310	}
311	parentp = (void **)&ptree->pt_root;
312	for (;;) {
313		if (pctrie_isleaf(node)) {
314			m = pctrie_toval(node);
315			if (*m == index)
316				panic("%s: key %jx is already present",
317				    __func__, (uintmax_t)index);
318			clev = pctrie_keydiff(*m, index);
319			tmp = pctrie_node_get(ptree, allocfn,
320			    pctrie_trimkey(index, clev + 1), 2, clev);
321			if (tmp == NULL)
322				return (ENOMEM);
323			*parentp = tmp;
324			pctrie_addval(tmp, index, clev, val);
325			pctrie_addval(tmp, *m, clev, m);
326			return (0);
327		} else if (pctrie_keybarr(node, index))
328			break;
329		slot = pctrie_slot(index, node->pn_clev);
330		if (node->pn_child[slot] == NULL) {
331			node->pn_count++;
332			pctrie_addval(node, index, node->pn_clev, val);
333			return (0);
334		}
335		parentp = &node->pn_child[slot];
336		node = node->pn_child[slot];
337	}
338
339	/*
340	 * A new node is needed because the right insertion level is reached.
341	 * Setup the new intermediate node and add the 2 children: the
342	 * new object and the older edge.
343	 */
344	newind = node->pn_owner;
345	clev = pctrie_keydiff(newind, index);
346	tmp = pctrie_node_get(ptree, allocfn,
347	    pctrie_trimkey(index, clev + 1), 2, clev);
348	if (tmp == NULL)
349		return (ENOMEM);
350	*parentp = tmp;
351	pctrie_addval(tmp, index, clev, val);
352	slot = pctrie_slot(newind, clev);
353	tmp->pn_child[slot] = node;
354
355	return (0);
356}
357
358/*
359 * Returns the value stored at the index.  If the index is not present,
360 * NULL is returned.
361 */
362uint64_t *
363pctrie_lookup(struct pctrie *ptree, uint64_t index)
364{
365	struct pctrie_node *node;
366	uint64_t *m;
367	int slot;
368
369	node = pctrie_getroot(ptree);
370	while (node != NULL) {
371		if (pctrie_isleaf(node)) {
372			m = pctrie_toval(node);
373			if (*m == index)
374				return (m);
375			else
376				break;
377		} else if (pctrie_keybarr(node, index))
378			break;
379		slot = pctrie_slot(index, node->pn_clev);
380		node = node->pn_child[slot];
381	}
382	return (NULL);
383}
384
385/*
386 * Look up the nearest entry at a position bigger than or equal to index.
387 */
388uint64_t *
389pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
390{
391	struct pctrie_node *stack[PCTRIE_LIMIT];
392	uint64_t inc;
393	uint64_t *m;
394	struct pctrie_node *child, *node;
395#ifdef INVARIANTS
396	int loops = 0;
397#endif
398	int slot, tos;
399
400	node = pctrie_getroot(ptree);
401	if (node == NULL)
402		return (NULL);
403	else if (pctrie_isleaf(node)) {
404		m = pctrie_toval(node);
405		if (*m >= index)
406			return (m);
407		else
408			return (NULL);
409	}
410	tos = 0;
411	for (;;) {
412		/*
413		 * If the keys differ before the current bisection node,
414		 * then the search key might rollback to the earliest
415		 * available bisection node or to the smallest key
416		 * in the current node (if the owner is bigger than the
417		 * search key).
418		 */
419		if (pctrie_keybarr(node, index)) {
420			if (index > node->pn_owner) {
421ascend:
422				KASSERT(++loops < 1000,
423				    ("pctrie_lookup_ge: too many loops"));
424
425				/*
426				 * Pop nodes from the stack until either the
427				 * stack is empty or a node that could have a
428				 * matching descendant is found.
429				 */
430				do {
431					if (tos == 0)
432						return (NULL);
433					node = stack[--tos];
434				} while (pctrie_slot(index,
435				    node->pn_clev) == (PCTRIE_COUNT - 1));
436
437				/*
438				 * The following computation cannot overflow
439				 * because index's slot at the current level
440				 * is less than PCTRIE_COUNT - 1.
441				 */
442				index = pctrie_trimkey(index,
443				    node->pn_clev);
444				index += PCTRIE_UNITLEVEL(node->pn_clev);
445			} else
446				index = node->pn_owner;
447			KASSERT(!pctrie_keybarr(node, index),
448			    ("pctrie_lookup_ge: keybarr failed"));
449		}
450		slot = pctrie_slot(index, node->pn_clev);
451		child = node->pn_child[slot];
452		if (pctrie_isleaf(child)) {
453			m = pctrie_toval(child);
454			if (*m >= index)
455				return (m);
456		} else if (child != NULL)
457			goto descend;
458
459		/*
460		 * Look for an available edge or val within the current
461		 * bisection node.
462		 */
463                if (slot < (PCTRIE_COUNT - 1)) {
464			inc = PCTRIE_UNITLEVEL(node->pn_clev);
465			index = pctrie_trimkey(index, node->pn_clev);
466			do {
467				index += inc;
468				slot++;
469				child = node->pn_child[slot];
470				if (pctrie_isleaf(child)) {
471					m = pctrie_toval(child);
472					if (*m >= index)
473						return (m);
474				} else if (child != NULL)
475					goto descend;
476			} while (slot < (PCTRIE_COUNT - 1));
477		}
478		KASSERT(child == NULL || pctrie_isleaf(child),
479		    ("pctrie_lookup_ge: child is radix node"));
480
481		/*
482		 * If a value or edge bigger than the search slot is not found
483		 * in the current node, ascend to the next higher-level node.
484		 */
485		goto ascend;
486descend:
487		KASSERT(node->pn_clev > 0,
488		    ("pctrie_lookup_ge: pushing leaf's parent"));
489		KASSERT(tos < PCTRIE_LIMIT,
490		    ("pctrie_lookup_ge: stack overflow"));
491		stack[tos++] = node;
492		node = child;
493	}
494}
495
496/*
497 * Look up the nearest entry at a position less than or equal to index.
498 */
499uint64_t *
500pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
501{
502	struct pctrie_node *stack[PCTRIE_LIMIT];
503	uint64_t inc;
504	uint64_t *m;
505	struct pctrie_node *child, *node;
506#ifdef INVARIANTS
507	int loops = 0;
508#endif
509	int slot, tos;
510
511	node = pctrie_getroot(ptree);
512	if (node == NULL)
513		return (NULL);
514	else if (pctrie_isleaf(node)) {
515		m = pctrie_toval(node);
516		if (*m <= index)
517			return (m);
518		else
519			return (NULL);
520	}
521	tos = 0;
522	for (;;) {
523		/*
524		 * If the keys differ before the current bisection node,
525		 * then the search key might rollback to the earliest
526		 * available bisection node or to the largest key
527		 * in the current node (if the owner is smaller than the
528		 * search key).
529		 */
530		if (pctrie_keybarr(node, index)) {
531			if (index > node->pn_owner) {
532				index = node->pn_owner + PCTRIE_COUNT *
533				    PCTRIE_UNITLEVEL(node->pn_clev);
534			} else {
535ascend:
536				KASSERT(++loops < 1000,
537				    ("pctrie_lookup_le: too many loops"));
538
539				/*
540				 * Pop nodes from the stack until either the
541				 * stack is empty or a node that could have a
542				 * matching descendant is found.
543				 */
544				do {
545					if (tos == 0)
546						return (NULL);
547					node = stack[--tos];
548				} while (pctrie_slot(index,
549				    node->pn_clev) == 0);
550
551				/*
552				 * The following computation cannot overflow
553				 * because index's slot at the current level
554				 * is greater than 0.
555				 */
556				index = pctrie_trimkey(index,
557				    node->pn_clev);
558			}
559			index--;
560			KASSERT(!pctrie_keybarr(node, index),
561			    ("pctrie_lookup_le: keybarr failed"));
562		}
563		slot = pctrie_slot(index, node->pn_clev);
564		child = node->pn_child[slot];
565		if (pctrie_isleaf(child)) {
566			m = pctrie_toval(child);
567			if (*m <= index)
568				return (m);
569		} else if (child != NULL)
570			goto descend;
571
572		/*
573		 * Look for an available edge or value within the current
574		 * bisection node.
575		 */
576		if (slot > 0) {
577			inc = PCTRIE_UNITLEVEL(node->pn_clev);
578			index |= inc - 1;
579			do {
580				index -= inc;
581				slot--;
582				child = node->pn_child[slot];
583				if (pctrie_isleaf(child)) {
584					m = pctrie_toval(child);
585					if (*m <= index)
586						return (m);
587				} else if (child != NULL)
588					goto descend;
589			} while (slot > 0);
590		}
591		KASSERT(child == NULL || pctrie_isleaf(child),
592		    ("pctrie_lookup_le: child is radix node"));
593
594		/*
595		 * If a value or edge smaller than the search slot is not found
596		 * in the current node, ascend to the next higher-level node.
597		 */
598		goto ascend;
599descend:
600		KASSERT(node->pn_clev > 0,
601		    ("pctrie_lookup_le: pushing leaf's parent"));
602		KASSERT(tos < PCTRIE_LIMIT,
603		    ("pctrie_lookup_le: stack overflow"));
604		stack[tos++] = node;
605		node = child;
606	}
607}
608
609/*
610 * Remove the specified index from the tree.
611 * Panics if the key is not present.
612 */
613void
614pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
615{
616	struct pctrie_node *node, *parent;
617	uint64_t *m;
618	int i, slot;
619
620	node = pctrie_getroot(ptree);
621	if (pctrie_isleaf(node)) {
622		m = pctrie_toval(node);
623		if (*m != index)
624			panic("%s: invalid key found", __func__);
625		pctrie_setroot(ptree, NULL);
626		return;
627	}
628	parent = NULL;
629	for (;;) {
630		if (node == NULL)
631			panic("pctrie_remove: impossible to locate the key");
632		slot = pctrie_slot(index, node->pn_clev);
633		if (pctrie_isleaf(node->pn_child[slot])) {
634			m = pctrie_toval(node->pn_child[slot]);
635			if (*m != index)
636				panic("%s: invalid key found", __func__);
637			node->pn_child[slot] = NULL;
638			node->pn_count--;
639			if (node->pn_count > 1)
640				break;
641			for (i = 0; i < PCTRIE_COUNT; i++)
642				if (node->pn_child[i] != NULL)
643					break;
644			KASSERT(i != PCTRIE_COUNT,
645			    ("%s: invalid node configuration", __func__));
646			if (parent == NULL)
647				pctrie_setroot(ptree, node->pn_child[i]);
648			else {
649				slot = pctrie_slot(index, parent->pn_clev);
650				KASSERT(parent->pn_child[slot] == node,
651				    ("%s: invalid child value", __func__));
652				parent->pn_child[slot] = node->pn_child[i];
653			}
654			node->pn_count--;
655			node->pn_child[i] = NULL;
656			pctrie_node_put(ptree, node, freefn);
657			break;
658		}
659		parent = node;
660		node = node->pn_child[slot];
661	}
662}
663
664/*
665 * Remove and free all the nodes from the tree.
666 * This function is recursive but there is a tight control on it as the
667 * maximum depth of the tree is fixed.
668 */
669void
670pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
671{
672	struct pctrie_node *root;
673
674	root = pctrie_getroot(ptree);
675	if (root == NULL)
676		return;
677	pctrie_setroot(ptree, NULL);
678	if (!pctrie_isleaf(root))
679		pctrie_reclaim_allnodes_int(ptree, root, freefn);
680}
681
682#ifdef DDB
683/*
684 * Show details about the given node.
685 */
686DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
687{
688	struct pctrie_node *node;
689	int i;
690
691        if (!have_addr)
692                return;
693	node = (struct pctrie_node *)addr;
694	db_printf("node %p, owner %jx, children count %u, level %u:\n",
695	    (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
696	    node->pn_clev);
697	for (i = 0; i < PCTRIE_COUNT; i++)
698		if (node->pn_child[i] != NULL)
699			db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
700			    i, (void *)node->pn_child[i],
701			    pctrie_isleaf(node->pn_child[i]) ?
702			    pctrie_toval(node->pn_child[i]) : NULL,
703			    node->pn_clev);
704}
705#endif /* DDB */
706