1/*
2 * Copyright (c) 2013 EMC Corp.
3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 */
29
30/*
31 * Path-compressed radix trie implementation.
32 *
33 * The implementation takes into account the following rationale:
34 * - Size of the nodes should be as small as possible but still big enough
35 *   to avoid a large maximum depth for the trie.  This is a balance
36 *   between the necessity to not wire too much physical memory for the nodes
37 *   and the necessity to avoid too much cache pollution during the trie
38 *   operations.
39 * - There is not a huge bias toward the number of lookup operations over
40 *   the number of insert and remove operations.  This basically implies
41 *   that optimizations supposedly helping one operation but hurting the
42 *   other might be carefully evaluated.
43 * - On average not many nodes are expected to be fully populated, hence
44 *   level compression may just complicate things.
45 */
46
47#include <sys/cdefs.h>
48__FBSDID("$FreeBSD: stable/11/sys/kern/subr_pctrie.c 321977 2017-08-03 07:28:54Z kib $");
49
50#include "opt_ddb.h"
51
52#include <sys/param.h>
53#include <sys/systm.h>
54#include <sys/kernel.h>
55#include <sys/pctrie.h>
56
57#ifdef DDB
58#include <ddb/ddb.h>
59#endif
60
61#define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
62#define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
63
64/* Flag bits stored in node pointers. */
65#define	PCTRIE_ISLEAF	0x1
66#define	PCTRIE_FLAGS	0x1
67#define	PCTRIE_PAD	PCTRIE_FLAGS
68
69/* Returns one unit associated with specified level. */
70#define	PCTRIE_UNITLEVEL(lev)						\
71	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
72
73struct pctrie_node {
74	uint64_t	 pn_owner;			/* Owner of record. */
75	uint16_t	 pn_count;			/* Valid children. */
76	uint16_t	 pn_clev;			/* Current level. */
77	void		*pn_child[PCTRIE_COUNT];	/* Child nodes. */
78};
79
80/*
81 * Allocate a node.  Pre-allocation should ensure that the request
82 * will always be satisfied.
83 */
84static __inline struct pctrie_node *
85pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
86    uint16_t count, uint16_t clevel)
87{
88	struct pctrie_node *node;
89
90	node = allocfn(ptree);
91	if (node == NULL)
92		return (NULL);
93	node->pn_owner = owner;
94	node->pn_count = count;
95	node->pn_clev = clevel;
96
97	return (node);
98}
99
100/*
101 * Free radix node.
102 */
103static __inline void
104pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
105    pctrie_free_t freefn)
106{
107#ifdef INVARIANTS
108	int slot;
109
110	KASSERT(node->pn_count == 0,
111	    ("pctrie_node_put: node %p has %d children", node,
112	    node->pn_count));
113	for (slot = 0; slot < PCTRIE_COUNT; slot++)
114		KASSERT(node->pn_child[slot] == NULL,
115		    ("pctrie_node_put: node %p has a child", node));
116#endif
117	freefn(ptree, node);
118}
119
120/*
121 * Return the position in the array for a given level.
122 */
123static __inline int
124pctrie_slot(uint64_t index, uint16_t level)
125{
126
127	return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
128}
129
130/* Trims the key after the specified level. */
131static __inline uint64_t
132pctrie_trimkey(uint64_t index, uint16_t level)
133{
134	uint64_t ret;
135
136	ret = index;
137	if (level > 0) {
138		ret >>= level * PCTRIE_WIDTH;
139		ret <<= level * PCTRIE_WIDTH;
140	}
141	return (ret);
142}
143
144/*
145 * Get the root node for a tree.
146 */
147static __inline struct pctrie_node *
148pctrie_getroot(struct pctrie *ptree)
149{
150
151	return ((struct pctrie_node *)ptree->pt_root);
152}
153
154/*
155 * Set the root node for a tree.
156 */
157static __inline void
158pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
159{
160
161	ptree->pt_root = (uintptr_t)node;
162}
163
164/*
165 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
166 */
167static __inline boolean_t
168pctrie_isleaf(struct pctrie_node *node)
169{
170
171	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
172}
173
174/*
175 * Returns the associated val extracted from node.
176 */
177static __inline uint64_t *
178pctrie_toval(struct pctrie_node *node)
179{
180
181	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
182}
183
184/*
185 * Adds the val as a child of the provided node.
186 */
187static __inline void
188pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
189    uint64_t *val)
190{
191	int slot;
192
193	slot = pctrie_slot(index, clev);
194	node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
195}
196
197/*
198 * Returns the slot where two keys differ.
199 * It cannot accept 2 equal keys.
200 */
201static __inline uint16_t
202pctrie_keydiff(uint64_t index1, uint64_t index2)
203{
204	uint16_t clev;
205
206	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
207	    __func__, (uintmax_t)index1));
208
209	index1 ^= index2;
210	for (clev = PCTRIE_LIMIT;; clev--)
211		if (pctrie_slot(index1, clev) != 0)
212			return (clev);
213}
214
215/*
216 * Returns TRUE if it can be determined that key does not belong to the
217 * specified node.  Otherwise, returns FALSE.
218 */
219static __inline boolean_t
220pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
221{
222
223	if (node->pn_clev < PCTRIE_LIMIT) {
224		idx = pctrie_trimkey(idx, node->pn_clev + 1);
225		return (idx != node->pn_owner);
226	}
227	return (FALSE);
228}
229
230/*
231 * Internal helper for pctrie_reclaim_allnodes().
232 * This function is recursive.
233 */
234static void
235pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
236    pctrie_free_t freefn)
237{
238	int slot;
239
240	KASSERT(node->pn_count <= PCTRIE_COUNT,
241	    ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
242	for (slot = 0; node->pn_count != 0; slot++) {
243		if (node->pn_child[slot] == NULL)
244			continue;
245		if (!pctrie_isleaf(node->pn_child[slot]))
246			pctrie_reclaim_allnodes_int(ptree,
247			    node->pn_child[slot], freefn);
248		node->pn_child[slot] = NULL;
249		node->pn_count--;
250	}
251	pctrie_node_put(ptree, node, freefn);
252}
253
254/*
255 * pctrie node zone initializer.
256 */
257int
258pctrie_zone_init(void *mem, int size __unused, int flags __unused)
259{
260	struct pctrie_node *node;
261
262	node = mem;
263	memset(node->pn_child, 0, sizeof(node->pn_child));
264	return (0);
265}
266
267size_t
268pctrie_node_size(void)
269{
270
271	return (sizeof(struct pctrie_node));
272}
273
274/*
275 * Inserts the key-value pair into the trie.
276 * Panics if the key already exists.
277 */
278int
279pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
280{
281	uint64_t index, newind;
282	void **parentp;
283	struct pctrie_node *node, *tmp;
284	uint64_t *m;
285	int slot;
286	uint16_t clev;
287
288	index = *val;
289
290	/*
291	 * The owner of record for root is not really important because it
292	 * will never be used.
293	 */
294	node = pctrie_getroot(ptree);
295	if (node == NULL) {
296		ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
297		return (0);
298	}
299	parentp = (void **)&ptree->pt_root;
300	for (;;) {
301		if (pctrie_isleaf(node)) {
302			m = pctrie_toval(node);
303			if (*m == index)
304				panic("%s: key %jx is already present",
305				    __func__, (uintmax_t)index);
306			clev = pctrie_keydiff(*m, index);
307			tmp = pctrie_node_get(ptree, allocfn,
308			    pctrie_trimkey(index, clev + 1), 2, clev);
309			if (tmp == NULL)
310				return (ENOMEM);
311			*parentp = tmp;
312			pctrie_addval(tmp, index, clev, val);
313			pctrie_addval(tmp, *m, clev, m);
314			return (0);
315		} else if (pctrie_keybarr(node, index))
316			break;
317		slot = pctrie_slot(index, node->pn_clev);
318		if (node->pn_child[slot] == NULL) {
319			node->pn_count++;
320			pctrie_addval(node, index, node->pn_clev, val);
321			return (0);
322		}
323		parentp = &node->pn_child[slot];
324		node = node->pn_child[slot];
325	}
326
327	/*
328	 * A new node is needed because the right insertion level is reached.
329	 * Setup the new intermediate node and add the 2 children: the
330	 * new object and the older edge.
331	 */
332	newind = node->pn_owner;
333	clev = pctrie_keydiff(newind, index);
334	tmp = pctrie_node_get(ptree, allocfn,
335	    pctrie_trimkey(index, clev + 1), 2, clev);
336	if (tmp == NULL)
337		return (ENOMEM);
338	*parentp = tmp;
339	pctrie_addval(tmp, index, clev, val);
340	slot = pctrie_slot(newind, clev);
341	tmp->pn_child[slot] = node;
342
343	return (0);
344}
345
346/*
347 * Returns the value stored at the index.  If the index is not present,
348 * NULL is returned.
349 */
350uint64_t *
351pctrie_lookup(struct pctrie *ptree, uint64_t index)
352{
353	struct pctrie_node *node;
354	uint64_t *m;
355	int slot;
356
357	node = pctrie_getroot(ptree);
358	while (node != NULL) {
359		if (pctrie_isleaf(node)) {
360			m = pctrie_toval(node);
361			if (*m == index)
362				return (m);
363			else
364				break;
365		} else if (pctrie_keybarr(node, index))
366			break;
367		slot = pctrie_slot(index, node->pn_clev);
368		node = node->pn_child[slot];
369	}
370	return (NULL);
371}
372
373/*
374 * Look up the nearest entry at a position bigger than or equal to index.
375 */
376uint64_t *
377pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
378{
379	struct pctrie_node *stack[PCTRIE_LIMIT];
380	uint64_t inc;
381	uint64_t *m;
382	struct pctrie_node *child, *node;
383#ifdef INVARIANTS
384	int loops = 0;
385#endif
386	int slot, tos;
387
388	node = pctrie_getroot(ptree);
389	if (node == NULL)
390		return (NULL);
391	else if (pctrie_isleaf(node)) {
392		m = pctrie_toval(node);
393		if (*m >= index)
394			return (m);
395		else
396			return (NULL);
397	}
398	tos = 0;
399	for (;;) {
400		/*
401		 * If the keys differ before the current bisection node,
402		 * then the search key might rollback to the earliest
403		 * available bisection node or to the smallest key
404		 * in the current node (if the owner is bigger than the
405		 * search key).
406		 */
407		if (pctrie_keybarr(node, index)) {
408			if (index > node->pn_owner) {
409ascend:
410				KASSERT(++loops < 1000,
411				    ("pctrie_lookup_ge: too many loops"));
412
413				/*
414				 * Pop nodes from the stack until either the
415				 * stack is empty or a node that could have a
416				 * matching descendant is found.
417				 */
418				do {
419					if (tos == 0)
420						return (NULL);
421					node = stack[--tos];
422				} while (pctrie_slot(index,
423				    node->pn_clev) == (PCTRIE_COUNT - 1));
424
425				/*
426				 * The following computation cannot overflow
427				 * because index's slot at the current level
428				 * is less than PCTRIE_COUNT - 1.
429				 */
430				index = pctrie_trimkey(index,
431				    node->pn_clev);
432				index += PCTRIE_UNITLEVEL(node->pn_clev);
433			} else
434				index = node->pn_owner;
435			KASSERT(!pctrie_keybarr(node, index),
436			    ("pctrie_lookup_ge: keybarr failed"));
437		}
438		slot = pctrie_slot(index, node->pn_clev);
439		child = node->pn_child[slot];
440		if (pctrie_isleaf(child)) {
441			m = pctrie_toval(child);
442			if (*m >= index)
443				return (m);
444		} else if (child != NULL)
445			goto descend;
446
447		/*
448		 * Look for an available edge or val within the current
449		 * bisection node.
450		 */
451                if (slot < (PCTRIE_COUNT - 1)) {
452			inc = PCTRIE_UNITLEVEL(node->pn_clev);
453			index = pctrie_trimkey(index, node->pn_clev);
454			do {
455				index += inc;
456				slot++;
457				child = node->pn_child[slot];
458				if (pctrie_isleaf(child)) {
459					m = pctrie_toval(child);
460					if (*m >= index)
461						return (m);
462				} else if (child != NULL)
463					goto descend;
464			} while (slot < (PCTRIE_COUNT - 1));
465		}
466		KASSERT(child == NULL || pctrie_isleaf(child),
467		    ("pctrie_lookup_ge: child is radix node"));
468
469		/*
470		 * If a value or edge bigger than the search slot is not found
471		 * in the current node, ascend to the next higher-level node.
472		 */
473		goto ascend;
474descend:
475		KASSERT(node->pn_clev > 0,
476		    ("pctrie_lookup_ge: pushing leaf's parent"));
477		KASSERT(tos < PCTRIE_LIMIT,
478		    ("pctrie_lookup_ge: stack overflow"));
479		stack[tos++] = node;
480		node = child;
481	}
482}
483
484/*
485 * Look up the nearest entry at a position less than or equal to index.
486 */
487uint64_t *
488pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
489{
490	struct pctrie_node *stack[PCTRIE_LIMIT];
491	uint64_t inc;
492	uint64_t *m;
493	struct pctrie_node *child, *node;
494#ifdef INVARIANTS
495	int loops = 0;
496#endif
497	int slot, tos;
498
499	node = pctrie_getroot(ptree);
500	if (node == NULL)
501		return (NULL);
502	else if (pctrie_isleaf(node)) {
503		m = pctrie_toval(node);
504		if (*m <= index)
505			return (m);
506		else
507			return (NULL);
508	}
509	tos = 0;
510	for (;;) {
511		/*
512		 * If the keys differ before the current bisection node,
513		 * then the search key might rollback to the earliest
514		 * available bisection node or to the largest key
515		 * in the current node (if the owner is smaller than the
516		 * search key).
517		 */
518		if (pctrie_keybarr(node, index)) {
519			if (index > node->pn_owner) {
520				index = node->pn_owner + PCTRIE_COUNT *
521				    PCTRIE_UNITLEVEL(node->pn_clev);
522			} else {
523ascend:
524				KASSERT(++loops < 1000,
525				    ("pctrie_lookup_le: too many loops"));
526
527				/*
528				 * Pop nodes from the stack until either the
529				 * stack is empty or a node that could have a
530				 * matching descendant is found.
531				 */
532				do {
533					if (tos == 0)
534						return (NULL);
535					node = stack[--tos];
536				} while (pctrie_slot(index,
537				    node->pn_clev) == 0);
538
539				/*
540				 * The following computation cannot overflow
541				 * because index's slot at the current level
542				 * is greater than 0.
543				 */
544				index = pctrie_trimkey(index,
545				    node->pn_clev);
546			}
547			index--;
548			KASSERT(!pctrie_keybarr(node, index),
549			    ("pctrie_lookup_le: keybarr failed"));
550		}
551		slot = pctrie_slot(index, node->pn_clev);
552		child = node->pn_child[slot];
553		if (pctrie_isleaf(child)) {
554			m = pctrie_toval(child);
555			if (*m <= index)
556				return (m);
557		} else if (child != NULL)
558			goto descend;
559
560		/*
561		 * Look for an available edge or value within the current
562		 * bisection node.
563		 */
564		if (slot > 0) {
565			inc = PCTRIE_UNITLEVEL(node->pn_clev);
566			index |= inc - 1;
567			do {
568				index -= inc;
569				slot--;
570				child = node->pn_child[slot];
571				if (pctrie_isleaf(child)) {
572					m = pctrie_toval(child);
573					if (*m <= index)
574						return (m);
575				} else if (child != NULL)
576					goto descend;
577			} while (slot > 0);
578		}
579		KASSERT(child == NULL || pctrie_isleaf(child),
580		    ("pctrie_lookup_le: child is radix node"));
581
582		/*
583		 * If a value or edge smaller than the search slot is not found
584		 * in the current node, ascend to the next higher-level node.
585		 */
586		goto ascend;
587descend:
588		KASSERT(node->pn_clev > 0,
589		    ("pctrie_lookup_le: pushing leaf's parent"));
590		KASSERT(tos < PCTRIE_LIMIT,
591		    ("pctrie_lookup_le: stack overflow"));
592		stack[tos++] = node;
593		node = child;
594	}
595}
596
597/*
598 * Remove the specified index from the tree.
599 * Panics if the key is not present.
600 */
601void
602pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
603{
604	struct pctrie_node *node, *parent;
605	uint64_t *m;
606	int i, slot;
607
608	node = pctrie_getroot(ptree);
609	if (pctrie_isleaf(node)) {
610		m = pctrie_toval(node);
611		if (*m != index)
612			panic("%s: invalid key found", __func__);
613		pctrie_setroot(ptree, NULL);
614		return;
615	}
616	parent = NULL;
617	for (;;) {
618		if (node == NULL)
619			panic("pctrie_remove: impossible to locate the key");
620		slot = pctrie_slot(index, node->pn_clev);
621		if (pctrie_isleaf(node->pn_child[slot])) {
622			m = pctrie_toval(node->pn_child[slot]);
623			if (*m != index)
624				panic("%s: invalid key found", __func__);
625			node->pn_child[slot] = NULL;
626			node->pn_count--;
627			if (node->pn_count > 1)
628				break;
629			for (i = 0; i < PCTRIE_COUNT; i++)
630				if (node->pn_child[i] != NULL)
631					break;
632			KASSERT(i != PCTRIE_COUNT,
633			    ("%s: invalid node configuration", __func__));
634			if (parent == NULL)
635				pctrie_setroot(ptree, node->pn_child[i]);
636			else {
637				slot = pctrie_slot(index, parent->pn_clev);
638				KASSERT(parent->pn_child[slot] == node,
639				    ("%s: invalid child value", __func__));
640				parent->pn_child[slot] = node->pn_child[i];
641			}
642			node->pn_count--;
643			node->pn_child[i] = NULL;
644			pctrie_node_put(ptree, node, freefn);
645			break;
646		}
647		parent = node;
648		node = node->pn_child[slot];
649	}
650}
651
652/*
653 * Remove and free all the nodes from the tree.
654 * This function is recursive but there is a tight control on it as the
655 * maximum depth of the tree is fixed.
656 */
657void
658pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
659{
660	struct pctrie_node *root;
661
662	root = pctrie_getroot(ptree);
663	if (root == NULL)
664		return;
665	pctrie_setroot(ptree, NULL);
666	if (!pctrie_isleaf(root))
667		pctrie_reclaim_allnodes_int(ptree, root, freefn);
668}
669
670#ifdef DDB
671/*
672 * Show details about the given node.
673 */
674DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
675{
676	struct pctrie_node *node;
677	int i;
678
679        if (!have_addr)
680                return;
681	node = (struct pctrie_node *)addr;
682	db_printf("node %p, owner %jx, children count %u, level %u:\n",
683	    (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
684	    node->pn_clev);
685	for (i = 0; i < PCTRIE_COUNT; i++)
686		if (node->pn_child[i] != NULL)
687			db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
688			    i, (void *)node->pn_child[i],
689			    pctrie_isleaf(node->pn_child[i]) ?
690			    pctrie_toval(node->pn_child[i]) : NULL,
691			    node->pn_clev);
692}
693#endif /* DDB */
694