vm_radix.c revision 249427
1226645Sattilio/*
2246726Sattilio * Copyright (c) 2013 EMC Corp.
3226930Sjeff * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4226645Sattilio * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5226645Sattilio * All rights reserved.
6226645Sattilio *
7226645Sattilio * Redistribution and use in source and binary forms, with or without
8226645Sattilio * modification, are permitted provided that the following conditions
9226645Sattilio * are met:
10226645Sattilio * 1. Redistributions of source code must retain the above copyright
11226645Sattilio *    notice, this list of conditions and the following disclaimer.
12226645Sattilio * 2. Redistributions in binary form must reproduce the above copyright
13226645Sattilio *    notice, this list of conditions and the following disclaimer in the
14226645Sattilio *    documentation and/or other materials provided with the distribution.
15226645Sattilio *
16226645Sattilio * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17226645Sattilio * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18226645Sattilio * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19226645Sattilio * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20226645Sattilio * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21226645Sattilio * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22226645Sattilio * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23226645Sattilio * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24226645Sattilio * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25226645Sattilio * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26226645Sattilio * SUCH DAMAGE.
27226645Sattilio *
28226645Sattilio */
29226645Sattilio
30226645Sattilio/*
31246726Sattilio * Path-compressed radix trie implementation.
32246726Sattilio * The following code is not generalized into a general purpose library
33246726Sattilio * because there are way too many parameters embedded that should really
34246726Sattilio * be decided by the library consumers.  At the same time, consumers
35246726Sattilio * of this code must achieve highest possible performance.
36246726Sattilio *
37246726Sattilio * The implementation takes into account the following rationale:
38248424Sattilio * - Size of the nodes should be as small as possible but still big enough
39248424Sattilio *   to avoid a large maximum depth for the trie.  This is a balance
40248424Sattilio *   between the necessity to not wire too much physical memory for the nodes
41248424Sattilio *   and the necessity to avoid too much cache pollution during the trie
42248424Sattilio *   operations.
43248424Sattilio * - There is not a huge bias toward the number of lookup operations over
44248424Sattilio *   the number of insert and remove operations.  This basically implies
45248424Sattilio *   that optimizations supposedly helping one operation but hurting the
46248424Sattilio *   other might be carefully evaluated.
47248424Sattilio * - On average not many nodes are expected to be fully populated, hence
48248424Sattilio *   level compression may just complicate things.
49226645Sattilio */
50226645Sattilio
51226645Sattilio#include <sys/cdefs.h>
52248448Sattilio__FBSDID("$FreeBSD: head/sys/vm/vm_radix.c 249427 2013-04-12 20:21:28Z alc $");
53226645Sattilio
54246726Sattilio#include "opt_ddb.h"
55246726Sattilio
56226645Sattilio#include <sys/param.h>
57226645Sattilio#include <sys/systm.h>
58226645Sattilio#include <sys/kernel.h>
59246840Sattilio#include <sys/vmmeter.h>
60246726Sattilio
61226873Sattilio#include <vm/uma.h>
62226645Sattilio#include <vm/vm.h>
63226876Sjeff#include <vm/vm_param.h>
64226873Sattilio#include <vm/vm_page.h>
65226645Sattilio#include <vm/vm_radix.h>
66226645Sattilio
67246726Sattilio#ifdef DDB
68246726Sattilio#include <ddb/ddb.h>
69246726Sattilio#endif
70226645Sattilio
71233034Sattilio/*
72247771Salc * These widths should allow the pointers to a node's children to fit within
73247771Salc * a single cache line.  The extra levels from a narrow width should not be
74247771Salc * a problem thanks to path compression.
75233034Sattilio */
76246726Sattilio#ifdef __LP64__
77246726Sattilio#define	VM_RADIX_WIDTH	4
78233034Sattilio#else
79246726Sattilio#define	VM_RADIX_WIDTH	3
80233034Sattilio#endif
81233034Sattilio
82232326Sattilio#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
83232326Sattilio#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
84246726Sattilio#define	VM_RADIX_LIMIT							\
85246726Sattilio	(howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH) - 1)
86232326Sattilio
87232326Sattilio/* Flag bits stored in node pointers. */
88246726Sattilio#define	VM_RADIX_ISLEAF	0x1
89246726Sattilio#define	VM_RADIX_FLAGS	0x1
90246726Sattilio#define	VM_RADIX_PAD	VM_RADIX_FLAGS
91232326Sattilio
92246726Sattilio/* Returns one unit associated with specified level. */
93246726Sattilio#define	VM_RADIX_UNITLEVEL(lev)						\
94246726Sattilio	((vm_pindex_t)1 << ((VM_RADIX_LIMIT - (lev)) * VM_RADIX_WIDTH))
95232326Sattilio
96232326Sattiliostruct vm_radix_node {
97246726Sattilio	vm_pindex_t	 rn_owner;			/* Owner of record. */
98246726Sattilio	uint16_t	 rn_count;			/* Valid children. */
99246726Sattilio	uint16_t	 rn_clev;			/* Current level. */
100249221Salc	void		*rn_child[VM_RADIX_COUNT];	/* Child nodes. */
101232326Sattilio};
102232326Sattilio
103226873Sattiliostatic uma_zone_t vm_radix_node_zone;
104226645Sattilio
105246794Sattilio/*
106247771Salc * Allocate a radix node.  Pre-allocation should ensure that the request
107247771Salc * will always be satisfied.
108246726Sattilio */
109246726Sattiliostatic __inline struct vm_radix_node *
110246726Sattiliovm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
111226873Sattilio{
112246726Sattilio	struct vm_radix_node *rnode;
113226645Sattilio
114248431Salc	rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT);
115226873Sattilio
116247742Sattilio	/*
117247771Salc	 * The required number of nodes should already be pre-allocated
118247771Salc	 * by vm_radix_prealloc().  However, UMA can hold a few nodes
119247771Salc	 * in per-CPU buckets, which will not be accessible by the
120247771Salc	 * current CPU.  Thus, the allocation could return NULL when
121247771Salc	 * the pre-allocated pool is close to exhaustion.  Anyway,
122247771Salc	 * in practice this should never occur because a new node
123247771Salc	 * is not always required for insert.  Thus, the pre-allocated
124247771Salc	 * pool should have some extra pages that prevent this from
125247771Salc	 * becoming a problem.
126247742Sattilio	 */
127247742Sattilio	if (rnode == NULL)
128247742Sattilio		panic("%s: uma_zalloc() returned NULL for a new node",
129247742Sattilio		    __func__);
130246726Sattilio	rnode->rn_owner = owner;
131246726Sattilio	rnode->rn_count = count;
132246726Sattilio	rnode->rn_clev = clevel;
133246726Sattilio	return (rnode);
134246726Sattilio}
135226873Sattilio
136246726Sattilio/*
137246726Sattilio * Free radix node.
138246726Sattilio */
139246726Sattiliostatic __inline void
140246726Sattiliovm_radix_node_put(struct vm_radix_node *rnode)
141246726Sattilio{
142246726Sattilio
143246726Sattilio	uma_zfree(vm_radix_node_zone, rnode);
144226873Sattilio}
145226873Sattilio
146246726Sattilio/*
147246726Sattilio * Return the position in the array for a given level.
148246726Sattilio */
149246726Sattiliostatic __inline int
150246726Sattiliovm_radix_slot(vm_pindex_t index, uint16_t level)
151226873Sattilio{
152226873Sattilio
153246726Sattilio	return ((index >> ((VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH)) &
154246726Sattilio	    VM_RADIX_MASK);
155226873Sattilio}
156226873Sattilio
157246726Sattilio/* Trims the key after the specified level. */
158246726Sattiliostatic __inline vm_pindex_t
159246726Sattiliovm_radix_trimkey(vm_pindex_t index, uint16_t level)
160226873Sattilio{
161246726Sattilio	vm_pindex_t ret;
162226873Sattilio
163246726Sattilio	ret = index;
164246726Sattilio	if (level < VM_RADIX_LIMIT) {
165246726Sattilio		ret >>= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH;
166246726Sattilio		ret <<= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH;
167246726Sattilio	}
168246726Sattilio	return (ret);
169226873Sattilio}
170226873Sattilio
171226645Sattilio/*
172246726Sattilio * Get the root node for a radix tree.
173226873Sattilio */
174246726Sattiliostatic __inline struct vm_radix_node *
175246726Sattiliovm_radix_getroot(struct vm_radix *rtree)
176226873Sattilio{
177226873Sattilio
178249221Salc	return ((struct vm_radix_node *)rtree->rt_root);
179226873Sattilio}
180226873Sattilio
181226873Sattilio/*
182246726Sattilio * Set the root node for a radix tree.
183226645Sattilio */
184246726Sattiliostatic __inline void
185246726Sattiliovm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
186226645Sattilio{
187226645Sattilio
188246726Sattilio	rtree->rt_root = (uintptr_t)rnode;
189226645Sattilio}
190226645Sattilio
191226645Sattilio/*
192248728Salc * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
193248728Salc */
194248728Salcstatic __inline boolean_t
195248728Salcvm_radix_isleaf(struct vm_radix_node *rnode)
196248728Salc{
197248728Salc
198248728Salc	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
199248728Salc}
200248728Salc
201248728Salc/*
202249038Salc * Returns the associated page extracted from rnode.
203226645Sattilio */
204246726Sattiliostatic __inline vm_page_t
205249038Salcvm_radix_topage(struct vm_radix_node *rnode)
206226645Sattilio{
207226645Sattilio
208249038Salc	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
209226645Sattilio}
210226645Sattilio
211226645Sattilio/*
212247771Salc * Adds the page as a child of the provided node.
213226645Sattilio */
214246726Sattiliostatic __inline void
215246726Sattiliovm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
216246726Sattilio    vm_page_t page)
217226645Sattilio{
218246726Sattilio	int slot;
219226645Sattilio
220246726Sattilio	slot = vm_radix_slot(index, clev);
221246726Sattilio	rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
222226645Sattilio}
223226645Sattilio
224233034Sattilio/*
225246726Sattilio * Returns the slot where two keys differ.
226246726Sattilio * It cannot accept 2 equal keys.
227233034Sattilio */
228246726Sattiliostatic __inline uint16_t
229246726Sattiliovm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
230226873Sattilio{
231246726Sattilio	uint16_t clev;
232226873Sattilio
233246726Sattilio	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
234246726Sattilio	    __func__, (uintmax_t)index1));
235233034Sattilio
236246726Sattilio	index1 ^= index2;
237246726Sattilio	for (clev = 0; clev <= VM_RADIX_LIMIT ; clev++)
238246726Sattilio		if (vm_radix_slot(index1, clev))
239246726Sattilio			return (clev);
240247969Sattilio	panic("%s: cannot reach this point", __func__);
241246726Sattilio	return (0);
242226873Sattilio}
243226873Sattilio
244226645Sattilio/*
245246726Sattilio * Returns TRUE if it can be determined that key does not belong to the
246247771Salc * specified rnode.  Otherwise, returns FALSE.
247226876Sjeff */
248246726Sattiliostatic __inline boolean_t
249246726Sattiliovm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
250226876Sjeff{
251226876Sjeff
252246726Sattilio	if (rnode->rn_clev > 0) {
253246726Sattilio		idx = vm_radix_trimkey(idx, rnode->rn_clev - 1);
254249211Salc		return (idx != rnode->rn_owner);
255246726Sattilio	}
256246726Sattilio	return (FALSE);
257226876Sjeff}
258226876Sjeff
259226876Sjeff/*
260246726Sattilio * Adjusts the idx key to the first upper level available, based on a valid
261246726Sattilio * initial level and map of available levels.
262246726Sattilio * Returns a value bigger than 0 to signal that there are not valid levels
263246726Sattilio * available.
264226876Sjeff */
265246726Sattiliostatic __inline int
266246726Sattiliovm_radix_addlev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
267226876Sjeff{
268246726Sattilio	vm_pindex_t wrapidx;
269226876Sjeff
270246726Sattilio	for (; levels[ilev] == FALSE ||
271246726Sattilio	    vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1); ilev--)
272246726Sattilio		if (ilev == 0)
273249427Salc			return (1);
274246726Sattilio	wrapidx = *idx;
275246726Sattilio	*idx = vm_radix_trimkey(*idx, ilev);
276246726Sattilio	*idx += VM_RADIX_UNITLEVEL(ilev);
277247232Sattilio	return (*idx < wrapidx);
278226876Sjeff}
279226876Sjeff
280246726Sattilio/*
281246726Sattilio * Adjusts the idx key to the first lower level available, based on a valid
282246726Sattilio * initial level and map of available levels.
283246726Sattilio * Returns a value bigger than 0 to signal that there are not valid levels
284246726Sattilio * available.
285246726Sattilio */
286246726Sattiliostatic __inline int
287246726Sattiliovm_radix_declev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
288226930Sjeff{
289246726Sattilio	vm_pindex_t wrapidx;
290226930Sjeff
291246726Sattilio	for (; levels[ilev] == FALSE ||
292246726Sattilio	    vm_radix_slot(*idx, ilev) == 0; ilev--)
293246726Sattilio		if (ilev == 0)
294249427Salc			return (1);
295246726Sattilio	wrapidx = *idx;
296246726Sattilio	*idx = vm_radix_trimkey(*idx, ilev);
297246726Sattilio	*idx |= VM_RADIX_UNITLEVEL(ilev) - 1;
298246726Sattilio	*idx -= VM_RADIX_UNITLEVEL(ilev);
299247233Sattilio	return (*idx > wrapidx);
300226930Sjeff}
301226930Sjeff
302246726Sattilio/*
303247773Salc * Internal helper for vm_radix_reclaim_allnodes().
304247769Salc * This function is recursive.
305246726Sattilio */
306226980Sattiliostatic void
307246726Sattiliovm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
308226980Sattilio{
309226980Sattilio	int slot;
310226980Sattilio
311248684Salc	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
312248684Salc	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
313248684Salc	for (slot = 0; rnode->rn_count != 0; slot++) {
314226980Sattilio		if (rnode->rn_child[slot] == NULL)
315226980Sattilio			continue;
316248728Salc		if (!vm_radix_isleaf(rnode->rn_child[slot]))
317246726Sattilio			vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]);
318248431Salc		rnode->rn_child[slot] = NULL;
319226980Sattilio		rnode->rn_count--;
320226980Sattilio	}
321226980Sattilio	vm_radix_node_put(rnode);
322226980Sattilio}
323226980Sattilio
324246836Sattilio#ifdef INVARIANTS
325226876Sjeff/*
326246836Sattilio * Radix node zone destructor.
327246836Sattilio */
328246836Sattiliostatic void
329246836Sattiliovm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
330246836Sattilio{
331246836Sattilio	struct vm_radix_node *rnode;
332248431Salc	int slot;
333246836Sattilio
334246836Sattilio	rnode = mem;
335246836Sattilio	KASSERT(rnode->rn_count == 0,
336248431Salc	    ("vm_radix_node_put: rnode %p has %d children", rnode,
337246836Sattilio	    rnode->rn_count));
338248431Salc	for (slot = 0; slot < VM_RADIX_COUNT; slot++)
339248431Salc		KASSERT(rnode->rn_child[slot] == NULL,
340248431Salc		    ("vm_radix_node_put: rnode %p has a child", rnode));
341246836Sattilio}
342246836Sattilio#endif
343246836Sattilio
344246836Sattilio/*
345248431Salc * Radix node zone initializer.
346248431Salc */
347248431Salcstatic int
348248431Salcvm_radix_node_zone_init(void *mem, int size __unused, int flags __unused)
349248431Salc{
350248431Salc	struct vm_radix_node *rnode;
351248431Salc
352248431Salc	rnode = mem;
353248431Salc	memset(rnode->rn_child, 0, sizeof(rnode->rn_child));
354248431Salc	return (0);
355248431Salc}
356248431Salc
357248431Salc/*
358246726Sattilio * Pre-allocate intermediate nodes from the UMA slab zone.
359246726Sattilio */
360246794Sattiliostatic void
361247742Sattiliovm_radix_prealloc(void *arg __unused)
362246726Sattilio{
363246726Sattilio
364247742Sattilio	if (!uma_zone_reserve_kva(vm_radix_node_zone, cnt.v_page_count))
365247742Sattilio		panic("%s: unable to create new zone", __func__);
366247742Sattilio	uma_prealloc(vm_radix_node_zone, cnt.v_page_count);
367247742Sattilio}
368247742SattilioSYSINIT(vm_radix_prealloc, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_prealloc,
369247742Sattilio    NULL);
370247742Sattilio
371247742Sattilio/*
372247742Sattilio * Initialize the UMA slab zone.
373247742Sattilio * Until vm_radix_prealloc() is called, the zone will be served by the
374247742Sattilio * UMA boot-time pre-allocated pool of pages.
375247742Sattilio */
376247742Sattiliovoid
377247742Sattiliovm_radix_init(void)
378247742Sattilio{
379247742Sattilio
380246726Sattilio	vm_radix_node_zone = uma_zcreate("RADIX NODE",
381246726Sattilio	    sizeof(struct vm_radix_node), NULL,
382246726Sattilio#ifdef INVARIANTS
383246726Sattilio	    vm_radix_node_zone_dtor,
384246726Sattilio#else
385246726Sattilio	    NULL,
386246726Sattilio#endif
387248431Salc	    vm_radix_node_zone_init, NULL, VM_RADIX_PAD, UMA_ZONE_VM |
388248431Salc	    UMA_ZONE_NOFREE);
389246726Sattilio}
390246726Sattilio
391246726Sattilio/*
392247771Salc * Inserts the key-value pair into the trie.
393226645Sattilio * Panics if the key already exists.
394226645Sattilio */
395246430Sattiliovoid
396248428Salcvm_radix_insert(struct vm_radix *rtree, vm_page_t page)
397226645Sattilio{
398248428Salc	vm_pindex_t index, newind;
399249182Salc	struct vm_radix_node *parent, *rnode, *tmp;
400246726Sattilio	vm_page_t m;
401236763Sattilio	int slot;
402246726Sattilio	uint16_t clev;
403226645Sattilio
404248428Salc	index = page->pindex;
405248222Sattilio
406226645Sattilio	/*
407246726Sattilio	 * The owner of record for root is not really important because it
408246726Sattilio	 * will never be used.
409226645Sattilio	 */
410246726Sattilio	rnode = vm_radix_getroot(rtree);
411246726Sattilio	if (rnode == NULL) {
412246726Sattilio		rnode = vm_radix_node_get(0, 1, 0);
413246726Sattilio		vm_radix_setroot(rtree, rnode);
414246726Sattilio		vm_radix_addpage(rnode, index, 0, page);
415246726Sattilio		return;
416246726Sattilio	}
417248684Salc	do {
418246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
419249038Salc		if (vm_radix_isleaf(rnode->rn_child[slot])) {
420249038Salc			m = vm_radix_topage(rnode->rn_child[slot]);
421246726Sattilio			if (m->pindex == index)
422246726Sattilio				panic("%s: key %jx is already present",
423246726Sattilio				    __func__, (uintmax_t)index);
424246726Sattilio			clev = vm_radix_keydiff(m->pindex, index);
425246726Sattilio			tmp = vm_radix_node_get(vm_radix_trimkey(index,
426246726Sattilio			    clev - 1), 2, clev);
427246726Sattilio			rnode->rn_child[slot] = tmp;
428246726Sattilio			vm_radix_addpage(tmp, index, clev, page);
429246726Sattilio			vm_radix_addpage(tmp, m->pindex, clev, m);
430246726Sattilio			return;
431226645Sattilio		}
432226645Sattilio		if (rnode->rn_child[slot] == NULL) {
433236760Sattilio			rnode->rn_count++;
434246726Sattilio			vm_radix_addpage(rnode, index, rnode->rn_clev, page);
435246726Sattilio			return;
436246726Sattilio		}
437249182Salc		parent = rnode;
438226645Sattilio		rnode = rnode->rn_child[slot];
439248684Salc	} while (!vm_radix_keybarr(rnode, index));
440226645Sattilio
441246726Sattilio	/*
442246726Sattilio	 * A new node is needed because the right insertion level is reached.
443246726Sattilio	 * Setup the new intermediate node and add the 2 children: the
444246726Sattilio	 * new object and the older edge.
445246726Sattilio	 */
446249182Salc	newind = rnode->rn_owner;
447249182Salc	clev = vm_radix_keydiff(newind, index);
448249182Salc	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev - 1), 2,
449246726Sattilio	    clev);
450249182Salc	parent->rn_child[slot] = tmp;
451249182Salc	vm_radix_addpage(tmp, index, clev, page);
452246726Sattilio	slot = vm_radix_slot(newind, clev);
453249182Salc	tmp->rn_child[slot] = rnode;
454226645Sattilio}
455226645Sattilio
456226645Sattilio/*
457247771Salc * Returns the value stored at the index.  If the index is not present,
458226645Sattilio * NULL is returned.
459226645Sattilio */
460246430Sattiliovm_page_t
461238245Sattiliovm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
462226645Sattilio{
463226645Sattilio	struct vm_radix_node *rnode;
464246726Sattilio	vm_page_t m;
465226645Sattilio	int slot;
466226645Sattilio
467246726Sattilio	rnode = vm_radix_getroot(rtree);
468246795Sattilio	while (rnode != NULL) {
469249038Salc		if (vm_radix_isleaf(rnode)) {
470249038Salc			m = vm_radix_topage(rnode);
471246726Sattilio			if (m->pindex == index)
472246726Sattilio				return (m);
473246726Sattilio			else
474249427Salc				break;
475249427Salc		} else if (vm_radix_keybarr(rnode, index))
476249427Salc			break;
477249427Salc		slot = vm_radix_slot(index, rnode->rn_clev);
478249427Salc		rnode = rnode->rn_child[slot];
479226645Sattilio	}
480246726Sattilio	return (NULL);
481226645Sattilio}
482226645Sattilio
483226645Sattilio/*
484247771Salc * Look up the nearest entry at a position bigger than or equal to index.
485226645Sattilio */
486246726Sattiliovm_page_t
487246726Sattiliovm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
488226645Sattilio{
489246726Sattilio	vm_pindex_t inc;
490246726Sattilio	vm_page_t m;
491249038Salc	struct vm_radix_node *child, *rnode;
492226645Sattilio	int slot;
493246726Sattilio	uint16_t difflev;
494246726Sattilio	boolean_t maplevels[VM_RADIX_LIMIT + 1];
495246726Sattilio#ifdef INVARIANTS
496246726Sattilio	int loops = 0;
497246726Sattilio#endif
498226645Sattilio
499249427Salc	rnode = vm_radix_getroot(rtree);
500249427Salc	if (rnode == NULL)
501249427Salc		return (NULL);
502249427Salc	else if (vm_radix_isleaf(rnode)) {
503249427Salc		m = vm_radix_topage(rnode);
504249427Salc		if (m->pindex >= index)
505249427Salc			return (m);
506249427Salc		else
507249427Salc			return (NULL);
508249427Salc	}
509226876Sjeffrestart:
510246726Sattilio	KASSERT(++loops < 1000, ("%s: too many loops", __func__));
511246726Sattilio	for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
512246726Sattilio		maplevels[difflev] = FALSE;
513249427Salc	for (;;) {
514246726Sattilio		maplevels[rnode->rn_clev] = TRUE;
515246726Sattilio
516246726Sattilio		/*
517246726Sattilio		 * If the keys differ before the current bisection node
518248444Salc		 * the search key might rollback to the earliest
519246726Sattilio		 * available bisection node, or to the smaller value
520246726Sattilio		 * in the current domain (if the owner is bigger than the
521246726Sattilio		 * search key).
522247969Sattilio		 * The maplevels array records any node has been seen
523247969Sattilio		 * at a given level.  This aids the search for a valid
524247969Sattilio		 * bisection node.
525246726Sattilio		 */
526247770Salc		if (vm_radix_keybarr(rnode, index)) {
527246726Sattilio			difflev = vm_radix_keydiff(index, rnode->rn_owner);
528246726Sattilio			if (index > rnode->rn_owner) {
529246726Sattilio				if (vm_radix_addlev(&index, maplevels,
530246726Sattilio				    difflev) > 0)
531246726Sattilio					break;
532246726Sattilio			} else
533246726Sattilio				index = vm_radix_trimkey(rnode->rn_owner,
534246726Sattilio				    difflev);
535249427Salc			rnode = vm_radix_getroot(rtree);
536246726Sattilio			goto restart;
537246726Sattilio		}
538246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
539249038Salc		child = rnode->rn_child[slot];
540249038Salc		if (vm_radix_isleaf(child)) {
541249038Salc			m = vm_radix_topage(child);
542249038Salc			if (m->pindex >= index)
543249038Salc				return (m);
544249038Salc		} else if (child != NULL)
545249038Salc			goto descend;
546246726Sattilio
547226645Sattilio		/*
548246726Sattilio		 * Look for an available edge or page within the current
549246726Sattilio		 * bisection node.
550246726Sattilio		 */
551246726Sattilio                if (slot < (VM_RADIX_COUNT - 1)) {
552246726Sattilio			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
553246726Sattilio			index = vm_radix_trimkey(index, rnode->rn_clev);
554249038Salc			do {
555249038Salc				index += inc;
556249038Salc				slot++;
557249038Salc				child = rnode->rn_child[slot];
558249038Salc				if (vm_radix_isleaf(child)) {
559249038Salc					m = vm_radix_topage(child);
560249038Salc					if (m->pindex >= index)
561249038Salc						return (m);
562249038Salc				} else if (child != NULL)
563249038Salc					goto descend;
564249038Salc			} while (slot < (VM_RADIX_COUNT - 1));
565246726Sattilio		}
566249038Salc		KASSERT(child == NULL || vm_radix_isleaf(child),
567249038Salc		    ("vm_radix_lookup_ge: child is radix node"));
568230750Sattilio
569246726Sattilio		/*
570247771Salc		 * If a valid page or edge bigger than the search slot is
571246726Sattilio		 * found in the traversal, skip to the next higher-level key.
572246726Sattilio		 */
573249038Salc		if (rnode->rn_clev == 0 || vm_radix_addlev(&index, maplevels,
574249038Salc		    rnode->rn_clev - 1) > 0)
575249038Salc			break;
576249427Salc		rnode = vm_radix_getroot(rtree);
577249038Salc		goto restart;
578249038Salcdescend:
579249038Salc		rnode = child;
580226645Sattilio	}
581245254Sattilio	return (NULL);
582226645Sattilio}
583226645Sattilio
584226645Sattilio/*
585247771Salc * Look up the nearest entry at a position less than or equal to index.
586226645Sattilio */
587246430Sattiliovm_page_t
588238245Sattiliovm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
589226645Sattilio{
590246726Sattilio	vm_pindex_t inc;
591246726Sattilio	vm_page_t m;
592249038Salc	struct vm_radix_node *child, *rnode;
593226646Sjeff	int slot;
594246726Sattilio	uint16_t difflev;
595246726Sattilio	boolean_t maplevels[VM_RADIX_LIMIT + 1];
596246726Sattilio#ifdef INVARIANTS
597246726Sattilio	int loops = 0;
598246726Sattilio#endif
599226645Sattilio
600249427Salc	rnode = vm_radix_getroot(rtree);
601249427Salc	if (rnode == NULL)
602249427Salc		return (NULL);
603249427Salc	else if (vm_radix_isleaf(rnode)) {
604249427Salc		m = vm_radix_topage(rnode);
605249427Salc		if (m->pindex <= index)
606249427Salc			return (m);
607249427Salc		else
608249427Salc			return (NULL);
609249427Salc	}
610226876Sjeffrestart:
611246726Sattilio	KASSERT(++loops < 1000, ("%s: too many loops", __func__));
612246726Sattilio	for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
613246726Sattilio		maplevels[difflev] = FALSE;
614249427Salc	for (;;) {
615246726Sattilio		maplevels[rnode->rn_clev] = TRUE;
616246726Sattilio
617226646Sjeff		/*
618246726Sattilio		 * If the keys differ before the current bisection node
619248444Salc		 * the search key might rollback to the earliest
620246726Sattilio		 * available bisection node, or to the higher value
621246726Sattilio		 * in the current domain (if the owner is smaller than the
622246726Sattilio		 * search key).
623247969Sattilio		 * The maplevels array records any node has been seen
624247969Sattilio		 * at a given level.  This aids the search for a valid
625247969Sattilio		 * bisection node.
626226646Sjeff		 */
627247770Salc		if (vm_radix_keybarr(rnode, index)) {
628246726Sattilio			difflev = vm_radix_keydiff(index, rnode->rn_owner);
629246726Sattilio			if (index > rnode->rn_owner) {
630246726Sattilio				index = vm_radix_trimkey(rnode->rn_owner,
631246726Sattilio				    difflev);
632246726Sattilio				index |= VM_RADIX_UNITLEVEL(difflev) - 1;
633246726Sattilio			} else if (vm_radix_declev(&index, maplevels,
634246726Sattilio			    difflev) > 0)
635246726Sattilio				break;
636249427Salc			rnode = vm_radix_getroot(rtree);
637246726Sattilio			goto restart;
638246726Sattilio		}
639246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
640249038Salc		child = rnode->rn_child[slot];
641249038Salc		if (vm_radix_isleaf(child)) {
642249038Salc			m = vm_radix_topage(child);
643249038Salc			if (m->pindex <= index)
644249038Salc				return (m);
645249038Salc		} else if (child != NULL)
646249038Salc			goto descend;
647246726Sattilio
648246726Sattilio		/*
649246726Sattilio		 * Look for an available edge or page within the current
650246726Sattilio		 * bisection node.
651246726Sattilio		 */
652246726Sattilio		if (slot > 0) {
653246726Sattilio			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
654246726Sattilio			index = vm_radix_trimkey(index, rnode->rn_clev);
655246726Sattilio			index |= inc - 1;
656249038Salc			do {
657249038Salc				index -= inc;
658249038Salc				slot--;
659249038Salc				child = rnode->rn_child[slot];
660249038Salc				if (vm_radix_isleaf(child)) {
661249038Salc					m = vm_radix_topage(child);
662249038Salc					if (m->pindex <= index)
663249038Salc						return (m);
664249038Salc				} else if (child != NULL)
665249038Salc					goto descend;
666249038Salc			} while (slot > 0);
667226646Sjeff		}
668249038Salc		KASSERT(child == NULL || vm_radix_isleaf(child),
669249038Salc		    ("vm_radix_lookup_le: child is radix node"));
670246726Sattilio
671246726Sattilio		/*
672247771Salc		 * If a valid page or edge smaller than the search slot is
673246726Sattilio		 * found in the traversal, skip to the next higher-level key.
674246726Sattilio		 */
675249038Salc		if (rnode->rn_clev == 0 || vm_radix_declev(&index, maplevels,
676249038Salc		    rnode->rn_clev - 1) > 0)
677249038Salc			break;
678249427Salc		rnode = vm_radix_getroot(rtree);
679249038Salc		goto restart;
680249038Salcdescend:
681249038Salc		rnode = child;
682226646Sjeff	}
683226645Sattilio	return (NULL);
684226645Sattilio}
685226645Sattilio
686226645Sattilio/*
687246726Sattilio * Remove the specified index from the tree.
688246726Sattilio * Panics if the key is not present.
689226645Sattilio */
690236763Sattiliovoid
691238245Sattiliovm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
692226645Sattilio{
693246726Sattilio	struct vm_radix_node *rnode, *parent;
694246726Sattilio	vm_page_t m;
695246726Sattilio	int i, slot;
696226645Sattilio
697246726Sattilio	parent = NULL;
698246726Sattilio	rnode = vm_radix_getroot(rtree);
699236760Sattilio	for (;;) {
700246726Sattilio		if (rnode == NULL)
701246726Sattilio			panic("vm_radix_remove: impossible to locate the key");
702246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
703249038Salc		if (vm_radix_isleaf(rnode->rn_child[slot])) {
704249038Salc			m = vm_radix_topage(rnode->rn_child[slot]);
705249038Salc			if (m->pindex != index)
706249038Salc				panic("%s: invalid key found", __func__);
707246726Sattilio			rnode->rn_child[slot] = NULL;
708246726Sattilio			rnode->rn_count--;
709246726Sattilio			if (rnode->rn_count > 1)
710246726Sattilio				break;
711246726Sattilio			if (parent == NULL) {
712246726Sattilio				if (rnode->rn_count == 0) {
713246726Sattilio					vm_radix_node_put(rnode);
714246726Sattilio					vm_radix_setroot(rtree, NULL);
715246726Sattilio				}
716246726Sattilio				break;
717246726Sattilio			}
718246726Sattilio			for (i = 0; i < VM_RADIX_COUNT; i++)
719246726Sattilio				if (rnode->rn_child[i] != NULL)
720246726Sattilio					break;
721246726Sattilio			KASSERT(i != VM_RADIX_COUNT,
722246726Sattilio			    ("%s: invalid node configuration", __func__));
723246726Sattilio			slot = vm_radix_slot(index, parent->rn_clev);
724246726Sattilio			KASSERT(parent->rn_child[slot] == rnode,
725246726Sattilio			    ("%s: invalid child value", __func__));
726246726Sattilio			parent->rn_child[slot] = rnode->rn_child[i];
727246726Sattilio			rnode->rn_count--;
728246726Sattilio			rnode->rn_child[i] = NULL;
729246726Sattilio			vm_radix_node_put(rnode);
730236760Sattilio			break;
731236760Sattilio		}
732246726Sattilio		parent = rnode;
733246726Sattilio		rnode = rnode->rn_child[slot];
734236760Sattilio	}
735226645Sattilio}
736226645Sattilio
737226645Sattilio/*
738226980Sattilio * Remove and free all the nodes from the radix tree.
739247771Salc * This function is recursive but there is a tight control on it as the
740226980Sattilio * maximum depth of the tree is fixed.
741226980Sattilio */
742226980Sattiliovoid
743226980Sattiliovm_radix_reclaim_allnodes(struct vm_radix *rtree)
744226980Sattilio{
745226980Sattilio	struct vm_radix_node *root;
746226980Sattilio
747246726Sattilio	root = vm_radix_getroot(rtree);
748246726Sattilio	if (root == NULL)
749226980Sattilio		return;
750248684Salc	vm_radix_setroot(rtree, NULL);
751246726Sattilio	vm_radix_reclaim_allnodes_int(root);
752226980Sattilio}
753246726Sattilio
754246726Sattilio#ifdef DDB
755246726Sattilio/*
756246837Sattilio * Show details about the given radix node.
757246726Sattilio */
758246726SattilioDB_SHOW_COMMAND(radixnode, db_show_radixnode)
759246726Sattilio{
760246726Sattilio	struct vm_radix_node *rnode;
761246726Sattilio	int i;
762246726Sattilio
763246726Sattilio        if (!have_addr)
764246726Sattilio                return;
765246726Sattilio	rnode = (struct vm_radix_node *)addr;
766246726Sattilio	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
767246726Sattilio	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
768246726Sattilio	    rnode->rn_clev);
769246726Sattilio	for (i = 0; i < VM_RADIX_COUNT; i++)
770246726Sattilio		if (rnode->rn_child[i] != NULL)
771246726Sattilio			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
772246726Sattilio			    i, (void *)rnode->rn_child[i],
773249038Salc			    vm_radix_isleaf(rnode->rn_child[i]) ?
774249038Salc			    vm_radix_topage(rnode->rn_child[i]) : NULL,
775246726Sattilio			    rnode->rn_clev);
776246726Sattilio}
777246726Sattilio#endif /* DDB */
778