vm_radix.c revision 246840
1/*
2 * Copyright (c) 2013 EMC Corp.
3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 */
29
30/*
31 * Path-compressed radix trie implementation.
32 * The following code is not generalized into a general purpose library
33 * because there are way too many parameters embedded that should really
34 * be decided by the library consumers.  At the same time, consumers
35 * of this code must achieve highest possible performance.
36 *
37 * The implementation takes into account the following rationale:
38 * - Size of the nodes might be as small as possible.
39 * - There is no bias toward lookup operations over inserts or removes,
40 *   and vice-versa.
41 * - In average there are not many complete levels, than level
42 *   compression may just complicate things.
43 */
44
45#include <sys/cdefs.h>
46
47#include "opt_ddb.h"
48#include "opt_vm.h"
49
50#include <sys/param.h>
51#include <sys/systm.h>
52#include <sys/kernel.h>
53#include <sys/vmmeter.h>
54
55#include <vm/uma.h>
56#include <vm/vm.h>
57#include <vm/vm_param.h>
58#include <vm/vm_page.h>
59#include <vm/vm_radix.h>
60
61#ifdef DDB
62#include <ddb/ddb.h>
63#endif
64
65#ifndef VM_RADIX_BOOT_CACHE
66#define	VM_RADIX_BOOT_CACHE	1500
67#endif
68
69/*
70 * Such sizes should permit to keep node children contained into a single
71 * cache-line, or to at least not span many of those.
72 * In particular, sparse tries should however be compressed properly and
73 * then make some extra-levels not a big deal.
74 */
75#ifdef __LP64__
76#define	VM_RADIX_WIDTH	4
77#else
78#define	VM_RADIX_WIDTH	3
79#endif
80
81#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
82#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
83#define	VM_RADIX_LIMIT							\
84	(howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH) - 1)
85
86/* Flag bits stored in node pointers. */
87#define	VM_RADIX_ISLEAF	0x1
88#define	VM_RADIX_FLAGS	0x1
89#define	VM_RADIX_PAD	VM_RADIX_FLAGS
90
91/* Returns one unit associated with specified level. */
92#define	VM_RADIX_UNITLEVEL(lev)						\
93	((vm_pindex_t)1 << ((VM_RADIX_LIMIT - (lev)) * VM_RADIX_WIDTH))
94
95struct vm_radix_node {
96	void		*rn_child[VM_RADIX_COUNT];	/* Child nodes. */
97	vm_pindex_t	 rn_owner;			/* Owner of record. */
98	uint16_t	 rn_count;			/* Valid children. */
99	uint16_t	 rn_clev;			/* Current level. */
100};
101
102static uma_zone_t vm_radix_node_zone;
103
104/*
105 * Boot-time cache of struct vm_radix_node objects.
106 * This cache is used to cater page allocations before the UMA zone is
107 * actually setup and pre-allocated (ie. pmap_init()).
108 */
109static u_int boot_cache_cnt;
110static struct vm_radix_node boot_cache[VM_RADIX_BOOT_CACHE];
111
112static struct vm_radix_node *
113vm_radix_carve_bootcache(void)
114{
115	struct vm_radix_node *rnode;
116
117	if (boot_cache_cnt == VM_RADIX_BOOT_CACHE)
118		panic("%s: Increase VM_RADIX_BOOT_CACHE (%u)", __func__,
119		    VM_RADIX_BOOT_CACHE);
120	rnode = &boot_cache[boot_cache_cnt];
121	boot_cache_cnt++;
122	return (rnode);
123}
124
125/*
126 * Allocate a radix node.  Pre-allocation ensures that the request will be
127 * always successfully satisfied.
128 */
129static __inline struct vm_radix_node *
130vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
131{
132	struct vm_radix_node *rnode;
133
134	if (__predict_false(boot_cache_cnt <= VM_RADIX_BOOT_CACHE))
135		rnode = vm_radix_carve_bootcache();
136	else {
137		rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
138
139		/*
140		 * The required number of nodes might be already correctly
141		 * pre-allocated in vm_radix_init().  However, UMA can reserve
142		 * few nodes on per-cpu specific buckets, which will not be
143		 * accessible from the curcpu.  The allocation could then
144		 * return NULL when the pre-allocation pool is close to be
145		 * exhausted.  Anyway, in practice this should never be a
146		 * problem because a new node is not always required for
147		 * insert, thus the pre-allocation pool should already have
148		 * some extra-pages that indirectly deal with this situation.
149		 */
150		if (rnode == NULL)
151			panic("%s: uma_zalloc() returned NULL for a new node",
152			    __func__);
153	}
154	rnode->rn_owner = owner;
155	rnode->rn_count = count;
156	rnode->rn_clev = clevel;
157	return (rnode);
158}
159
160/*
161 * Free radix node.
162 */
163static __inline void
164vm_radix_node_put(struct vm_radix_node *rnode)
165{
166
167	if (__predict_false(rnode > boot_cache &&
168	    rnode <= &boot_cache[VM_RADIX_BOOT_CACHE]))
169		return;
170	uma_zfree(vm_radix_node_zone, rnode);
171}
172
173/*
174 * Return the position in the array for a given level.
175 */
176static __inline int
177vm_radix_slot(vm_pindex_t index, uint16_t level)
178{
179
180	return ((index >> ((VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH)) &
181	    VM_RADIX_MASK);
182}
183
184/* Trims the key after the specified level. */
185static __inline vm_pindex_t
186vm_radix_trimkey(vm_pindex_t index, uint16_t level)
187{
188	vm_pindex_t ret;
189
190	ret = index;
191	if (level < VM_RADIX_LIMIT) {
192		ret >>= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH;
193		ret <<= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH;
194	}
195	return (ret);
196}
197
198/*
199 * Get the root node for a radix tree.
200 */
201static __inline struct vm_radix_node *
202vm_radix_getroot(struct vm_radix *rtree)
203{
204
205	return ((struct vm_radix_node *)(rtree->rt_root & ~VM_RADIX_FLAGS));
206}
207
208/*
209 * Set the root node for a radix tree.
210 */
211static __inline void
212vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
213{
214
215	rtree->rt_root = (uintptr_t)rnode;
216}
217
218/*
219 * Returns the associated page extracted from rnode if available,
220 * NULL otherwise.
221 */
222static __inline vm_page_t
223vm_radix_node_page(struct vm_radix_node *rnode)
224{
225
226	return ((((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0) ?
227	    (vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS) : NULL);
228}
229
230/*
231 * Adds the page as a child of provided node.
232 */
233static __inline void
234vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
235    vm_page_t page)
236{
237	int slot;
238
239	slot = vm_radix_slot(index, clev);
240	rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
241}
242
243/*
244 * Returns the slot where two keys differ.
245 * It cannot accept 2 equal keys.
246 */
247static __inline uint16_t
248vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
249{
250	uint16_t clev;
251
252	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
253	    __func__, (uintmax_t)index1));
254
255	index1 ^= index2;
256	for (clev = 0; clev <= VM_RADIX_LIMIT ; clev++)
257		if (vm_radix_slot(index1, clev))
258			return (clev);
259	panic("%s: it might have not reached this point", __func__);
260	return (0);
261}
262
263/*
264 * Returns TRUE if it can be determined that key does not belong to the
265 * specified rnode. FALSE otherwise.
266 */
267static __inline boolean_t
268vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
269{
270
271	if (rnode->rn_clev > 0) {
272		idx = vm_radix_trimkey(idx, rnode->rn_clev - 1);
273		idx -= rnode->rn_owner;
274		if (idx != 0)
275			return (TRUE);
276	}
277	return (FALSE);
278}
279
280/*
281 * Adjusts the idx key to the first upper level available, based on a valid
282 * initial level and map of available levels.
283 * Returns a value bigger than 0 to signal that there are not valid levels
284 * available.
285 */
286static __inline int
287vm_radix_addlev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
288{
289	vm_pindex_t wrapidx;
290
291	for (; levels[ilev] == FALSE ||
292	    vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1); ilev--)
293		if (ilev == 0)
294			break;
295	KASSERT(ilev > 0 || levels[0] == TRUE,
296	    ("%s: levels back-scanning problem", __func__));
297	if (ilev == 0 && vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1))
298		return (1);
299	wrapidx = *idx;
300	*idx = vm_radix_trimkey(*idx, ilev);
301	*idx += VM_RADIX_UNITLEVEL(ilev);
302	if (*idx < wrapidx)
303		return (1);
304	return (0);
305}
306
307/*
308 * Adjusts the idx key to the first lower level available, based on a valid
309 * initial level and map of available levels.
310 * Returns a value bigger than 0 to signal that there are not valid levels
311 * available.
312 */
313static __inline int
314vm_radix_declev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
315{
316	vm_pindex_t wrapidx;
317
318	for (; levels[ilev] == FALSE ||
319	    vm_radix_slot(*idx, ilev) == 0; ilev--)
320		if (ilev == 0)
321			break;
322	KASSERT(ilev > 0 || levels[0] == TRUE,
323	    ("%s: levels back-scanning problem", __func__));
324	if (ilev == 0 && vm_radix_slot(*idx, ilev) == 0)
325		return (1);
326	wrapidx = *idx;
327	*idx = vm_radix_trimkey(*idx, ilev);
328	*idx |= VM_RADIX_UNITLEVEL(ilev) - 1;
329	*idx -= VM_RADIX_UNITLEVEL(ilev);
330	if (*idx < wrapidx)
331		return (1);
332	return (0);
333}
334
335/*
336 * Internal handwork for vm_radix_reclaim_allonodes() primitive.
337 * This function is recrusive.
338 */
339static void
340vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
341{
342	int slot;
343
344	for (slot = 0; slot < VM_RADIX_COUNT && rnode->rn_count != 0; slot++) {
345		if (rnode->rn_child[slot] == NULL)
346			continue;
347		if (vm_radix_node_page(rnode->rn_child[slot]) == NULL)
348			vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]);
349		rnode->rn_count--;
350	}
351	vm_radix_node_put(rnode);
352}
353
354#ifdef INVARIANTS
355/*
356 * Radix node zone destructor.
357 */
358static void
359vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
360{
361	struct vm_radix_node *rnode;
362
363	rnode = mem;
364	KASSERT(rnode->rn_count == 0,
365	    ("vm_radix_node_put: Freeing node %p with %d children\n", mem,
366	    rnode->rn_count));
367}
368#endif
369
370/*
371 * Pre-allocate intermediate nodes from the UMA slab zone.
372 */
373static void
374vm_radix_init(void *arg __unused)
375{
376	int nitems;
377
378	vm_radix_node_zone = uma_zcreate("RADIX NODE",
379	    sizeof(struct vm_radix_node), NULL,
380#ifdef INVARIANTS
381	    vm_radix_node_zone_dtor,
382#else
383	    NULL,
384#endif
385	    NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_NOFREE);
386	nitems = uma_zone_set_max(vm_radix_node_zone, cnt.v_page_count);
387	if (nitems < cnt.v_page_count)
388		panic("%s: unexpected requested number of items", __func__);
389	uma_prealloc(vm_radix_node_zone, nitems);
390	boot_cache_cnt = VM_RADIX_BOOT_CACHE + 1;
391}
392SYSINIT(vm_radix_init, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_init, NULL);
393
394/*
395 * Inserts the key-value pair in to the trie.
396 * Panics if the key already exists.
397 */
398void
399vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, vm_page_t page)
400{
401	vm_pindex_t newind;
402	struct vm_radix_node *rnode, *tmp, *tmp2;
403	vm_page_t m;
404	int slot;
405	uint16_t clev;
406
407	/*
408	 * The owner of record for root is not really important because it
409	 * will never be used.
410	 */
411	rnode = vm_radix_getroot(rtree);
412	if (rnode == NULL) {
413		rnode = vm_radix_node_get(0, 1, 0);
414		vm_radix_setroot(rtree, rnode);
415		vm_radix_addpage(rnode, index, 0, page);
416		return;
417	}
418	while (rnode != NULL) {
419		if (vm_radix_keybarr(rnode, index) == TRUE)
420			break;
421		slot = vm_radix_slot(index, rnode->rn_clev);
422		m = vm_radix_node_page(rnode->rn_child[slot]);
423		if (m != NULL) {
424			if (m->pindex == index)
425				panic("%s: key %jx is already present",
426				    __func__, (uintmax_t)index);
427			clev = vm_radix_keydiff(m->pindex, index);
428			tmp = vm_radix_node_get(vm_radix_trimkey(index,
429			    clev - 1), 2, clev);
430			rnode->rn_child[slot] = tmp;
431			vm_radix_addpage(tmp, index, clev, page);
432			vm_radix_addpage(tmp, m->pindex, clev, m);
433			return;
434		}
435		if (rnode->rn_child[slot] == NULL) {
436			rnode->rn_count++;
437			vm_radix_addpage(rnode, index, rnode->rn_clev, page);
438			return;
439		}
440		rnode = rnode->rn_child[slot];
441	}
442	if (rnode == NULL)
443		panic("%s: path traversal ended unexpectedly", __func__);
444
445	/*
446	 * Scan the trie from the top and find the parent to insert
447	 * the new object.
448	 */
449	newind = rnode->rn_owner;
450	clev = vm_radix_keydiff(newind, index);
451	slot = VM_RADIX_COUNT;
452	for (rnode = vm_radix_getroot(rtree); ; rnode = tmp) {
453		KASSERT(rnode != NULL, ("%s: edge cannot be NULL in the scan",
454		    __func__));
455		KASSERT(clev >= rnode->rn_clev,
456		    ("%s: unexpected trie depth: clev: %d, rnode->rn_clev: %d",
457		    __func__, clev, rnode->rn_clev));
458		slot = vm_radix_slot(index, rnode->rn_clev);
459		tmp = rnode->rn_child[slot];
460		KASSERT(tmp != NULL && vm_radix_node_page(tmp) == NULL,
461		    ("%s: unexpected lookup interruption", __func__));
462		if (tmp->rn_clev > clev)
463			break;
464	}
465	KASSERT(rnode != NULL && tmp != NULL && slot < VM_RADIX_COUNT,
466	    ("%s: invalid scan parameters rnode: %p, tmp: %p, slot: %d",
467	    __func__, (void *)rnode, (void *)tmp, slot));
468
469	/*
470	 * A new node is needed because the right insertion level is reached.
471	 * Setup the new intermediate node and add the 2 children: the
472	 * new object and the older edge.
473	 */
474	tmp2 = vm_radix_node_get(vm_radix_trimkey(page->pindex, clev - 1), 2,
475	    clev);
476	rnode->rn_child[slot] = tmp2;
477	vm_radix_addpage(tmp2, index, clev, page);
478	slot = vm_radix_slot(newind, clev);
479	tmp2->rn_child[slot] = tmp;
480}
481
482/*
483 * Returns the value stored at the index.  If the index is not present
484 * NULL is returned.
485 */
486vm_page_t
487vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
488{
489	struct vm_radix_node *rnode;
490	vm_page_t m;
491	int slot;
492
493	rnode = vm_radix_getroot(rtree);
494	while (rnode != NULL) {
495		if (vm_radix_keybarr(rnode, index) == TRUE)
496			return (NULL);
497		slot = vm_radix_slot(index, rnode->rn_clev);
498		rnode = rnode->rn_child[slot];
499		m = vm_radix_node_page(rnode);
500		if (m != NULL) {
501			if (m->pindex == index)
502				return (m);
503			else
504				return (NULL);
505		}
506	}
507	return (NULL);
508}
509
510/*
511 * Look up any entry at a position bigger than or equal to index.
512 */
513vm_page_t
514vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
515{
516	vm_pindex_t inc;
517	vm_page_t m;
518	struct vm_radix_node *rnode;
519	int slot;
520	uint16_t difflev;
521	boolean_t maplevels[VM_RADIX_LIMIT + 1];
522#ifdef INVARIANTS
523	int loops = 0;
524#endif
525
526restart:
527	KASSERT(++loops < 1000, ("%s: too many loops", __func__));
528	for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
529		maplevels[difflev] = FALSE;
530	rnode = vm_radix_getroot(rtree);
531	while (rnode != NULL) {
532		maplevels[rnode->rn_clev] = TRUE;
533
534		/*
535		 * If the keys differ before the current bisection node
536		 * the search key might rollback to the earlierst
537		 * available bisection node, or to the smaller value
538		 * in the current domain (if the owner is bigger than the
539		 * search key).
540		 * The search for a valid bisection node is helped through
541		 * the use of maplevels array which should bring immediately
542		 * a lower useful level, skipping holes.
543		 */
544		if (vm_radix_keybarr(rnode, index) == TRUE) {
545			difflev = vm_radix_keydiff(index, rnode->rn_owner);
546			if (index > rnode->rn_owner) {
547				if (vm_radix_addlev(&index, maplevels,
548				    difflev) > 0)
549					break;
550			} else
551				index = vm_radix_trimkey(rnode->rn_owner,
552				    difflev);
553			goto restart;
554		}
555		slot = vm_radix_slot(index, rnode->rn_clev);
556		m = vm_radix_node_page(rnode->rn_child[slot]);
557		if (m != NULL && m->pindex >= index)
558			return (m);
559		if (rnode->rn_child[slot] != NULL && m == NULL) {
560			rnode = rnode->rn_child[slot];
561			continue;
562		}
563
564		/*
565		 * Look for an available edge or page within the current
566		 * bisection node.
567		 */
568                if (slot < (VM_RADIX_COUNT - 1)) {
569			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
570			index = vm_radix_trimkey(index, rnode->rn_clev);
571			index += inc;
572			slot++;
573			for (;; index += inc, slot++) {
574				m = vm_radix_node_page(rnode->rn_child[slot]);
575				if (m != NULL && m->pindex >= index)
576					return (m);
577				if ((rnode->rn_child[slot] != NULL &&
578				    m == NULL) || slot == (VM_RADIX_COUNT - 1))
579					break;
580			}
581		}
582
583		/*
584		 * If a valid page or edge, bigger than the search slot, is
585		 * found in the traversal, skip to the next higher-level key.
586		 */
587		if (slot == (VM_RADIX_COUNT - 1) &&
588		    (rnode->rn_child[slot] == NULL || m != NULL)) {
589			if (rnode->rn_clev == 0  || vm_radix_addlev(&index,
590			    maplevels, rnode->rn_clev - 1) > 0)
591				break;
592			goto restart;
593		}
594		rnode = rnode->rn_child[slot];
595	}
596	return (NULL);
597}
598
599/*
600 * Look up any entry at a position less than or equal to index.
601 */
602vm_page_t
603vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
604{
605	vm_pindex_t inc;
606	vm_page_t m;
607	struct vm_radix_node *rnode;
608	int slot;
609	uint16_t difflev;
610	boolean_t maplevels[VM_RADIX_LIMIT + 1];
611#ifdef INVARIANTS
612	int loops = 0;
613#endif
614
615restart:
616	KASSERT(++loops < 1000, ("%s: too many loops", __func__));
617	for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
618		maplevels[difflev] = FALSE;
619	rnode = vm_radix_getroot(rtree);
620	while (rnode != NULL) {
621		maplevels[rnode->rn_clev] = TRUE;
622
623		/*
624		 * If the keys differ before the current bisection node
625		 * the search key might rollback to the earlierst
626		 * available bisection node, or to the higher value
627		 * in the current domain (if the owner is smaller than the
628		 * search key).
629		 * The search for a valid bisection node is helped through
630		 * the use of maplevels array which should bring immediately
631		 * a lower useful level, skipping holes.
632		 */
633		if (vm_radix_keybarr(rnode, index) == TRUE) {
634			difflev = vm_radix_keydiff(index, rnode->rn_owner);
635			if (index > rnode->rn_owner) {
636				index = vm_radix_trimkey(rnode->rn_owner,
637				    difflev);
638				index |= VM_RADIX_UNITLEVEL(difflev) - 1;
639			} else if (vm_radix_declev(&index, maplevels,
640			    difflev) > 0)
641				break;
642			goto restart;
643		}
644		slot = vm_radix_slot(index, rnode->rn_clev);
645		m = vm_radix_node_page(rnode->rn_child[slot]);
646		if (m != NULL && m->pindex <= index)
647			return (m);
648		if (rnode->rn_child[slot] != NULL && m == NULL) {
649			rnode = rnode->rn_child[slot];
650			continue;
651		}
652
653		/*
654		 * Look for an available edge or page within the current
655		 * bisection node.
656		 */
657		if (slot > 0) {
658			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
659			index = vm_radix_trimkey(index, rnode->rn_clev);
660			index |= inc - 1;
661			index -= inc;
662			slot--;
663			for (;; index -= inc, slot--) {
664				m = vm_radix_node_page(rnode->rn_child[slot]);
665				if (m != NULL && m->pindex <= index)
666					return (m);
667				if ((rnode->rn_child[slot] != NULL &&
668				    m == NULL) || slot == 0)
669					break;
670			}
671		}
672
673		/*
674		 * If a valid page or edge, smaller than the search slot, is
675		 * found in the traversal, skip to the next higher-level key.
676		 */
677		if (slot == 0 && (rnode->rn_child[slot] == NULL || m != NULL)) {
678			if (rnode->rn_clev == 0 || vm_radix_declev(&index,
679			    maplevels, rnode->rn_clev - 1) > 0)
680				break;
681			goto restart;
682		}
683		rnode = rnode->rn_child[slot];
684	}
685	return (NULL);
686}
687
688/*
689 * Remove the specified index from the tree.
690 * Panics if the key is not present.
691 */
692void
693vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
694{
695	struct vm_radix_node *rnode, *parent;
696	vm_page_t m;
697	int i, slot;
698
699	parent = NULL;
700	rnode = vm_radix_getroot(rtree);
701	for (;;) {
702		if (rnode == NULL)
703			panic("vm_radix_remove: impossible to locate the key");
704		slot = vm_radix_slot(index, rnode->rn_clev);
705		m = vm_radix_node_page(rnode->rn_child[slot]);
706		if (m != NULL && m->pindex == index) {
707			rnode->rn_child[slot] = NULL;
708			rnode->rn_count--;
709			if (rnode->rn_count > 1)
710				break;
711			if (parent == NULL) {
712				if (rnode->rn_count == 0) {
713					vm_radix_node_put(rnode);
714					vm_radix_setroot(rtree, NULL);
715				}
716				break;
717			}
718			for (i = 0; i < VM_RADIX_COUNT; i++)
719				if (rnode->rn_child[i] != NULL)
720					break;
721			KASSERT(i != VM_RADIX_COUNT,
722			    ("%s: invalid node configuration", __func__));
723			slot = vm_radix_slot(index, parent->rn_clev);
724			KASSERT(parent->rn_child[slot] == rnode,
725			    ("%s: invalid child value", __func__));
726			parent->rn_child[slot] = rnode->rn_child[i];
727			rnode->rn_count--;
728			rnode->rn_child[i] = NULL;
729			vm_radix_node_put(rnode);
730			break;
731		}
732		if (m != NULL && m->pindex != index)
733			panic("%s: invalid key found", __func__);
734		parent = rnode;
735		rnode = rnode->rn_child[slot];
736	}
737}
738
739/*
740 * Remove and free all the nodes from the radix tree.
741 * This function is recrusive but there is a tight control on it as the
742 * maximum depth of the tree is fixed.
743 */
744void
745vm_radix_reclaim_allnodes(struct vm_radix *rtree)
746{
747	struct vm_radix_node *root;
748
749	root = vm_radix_getroot(rtree);
750	if (root == NULL)
751		return;
752	vm_radix_reclaim_allnodes_int(root);
753	vm_radix_setroot(rtree, NULL);
754}
755
756#ifdef DDB
757/*
758 * Show details about the given radix node.
759 */
760DB_SHOW_COMMAND(radixnode, db_show_radixnode)
761{
762	struct vm_radix_node *rnode;
763	int i;
764
765        if (!have_addr)
766                return;
767	rnode = (struct vm_radix_node *)addr;
768	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
769	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
770	    rnode->rn_clev);
771	for (i = 0; i < VM_RADIX_COUNT; i++)
772		if (rnode->rn_child[i] != NULL)
773			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
774			    i, (void *)rnode->rn_child[i],
775			    (void *)vm_radix_node_page(rnode->rn_child[i]),
776			    rnode->rn_clev);
777}
778#endif /* DDB */
779