vm_radix.c revision 254141
1226645Sattilio/*
2246726Sattilio * Copyright (c) 2013 EMC Corp.
3226930Sjeff * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4226645Sattilio * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5226645Sattilio * All rights reserved.
6226645Sattilio *
7226645Sattilio * Redistribution and use in source and binary forms, with or without
8226645Sattilio * modification, are permitted provided that the following conditions
9226645Sattilio * are met:
10226645Sattilio * 1. Redistributions of source code must retain the above copyright
11226645Sattilio *    notice, this list of conditions and the following disclaimer.
12226645Sattilio * 2. Redistributions in binary form must reproduce the above copyright
13226645Sattilio *    notice, this list of conditions and the following disclaimer in the
14226645Sattilio *    documentation and/or other materials provided with the distribution.
15226645Sattilio *
16226645Sattilio * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17226645Sattilio * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18226645Sattilio * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19226645Sattilio * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20226645Sattilio * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21226645Sattilio * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22226645Sattilio * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23226645Sattilio * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24226645Sattilio * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25226645Sattilio * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26226645Sattilio * SUCH DAMAGE.
27226645Sattilio *
28226645Sattilio */
29226645Sattilio
30226645Sattilio/*
31246726Sattilio * Path-compressed radix trie implementation.
32246726Sattilio * The following code is not generalized into a general purpose library
33246726Sattilio * because there are way too many parameters embedded that should really
34246726Sattilio * be decided by the library consumers.  At the same time, consumers
35246726Sattilio * of this code must achieve highest possible performance.
36246726Sattilio *
37246726Sattilio * The implementation takes into account the following rationale:
38248424Sattilio * - Size of the nodes should be as small as possible but still big enough
39248424Sattilio *   to avoid a large maximum depth for the trie.  This is a balance
40248424Sattilio *   between the necessity to not wire too much physical memory for the nodes
41248424Sattilio *   and the necessity to avoid too much cache pollution during the trie
42248424Sattilio *   operations.
43248424Sattilio * - There is not a huge bias toward the number of lookup operations over
44248424Sattilio *   the number of insert and remove operations.  This basically implies
45248424Sattilio *   that optimizations supposedly helping one operation but hurting the
46248424Sattilio *   other might be carefully evaluated.
47248424Sattilio * - On average not many nodes are expected to be fully populated, hence
48248424Sattilio *   level compression may just complicate things.
49226645Sattilio */
50226645Sattilio
51226645Sattilio#include <sys/cdefs.h>
52248448Sattilio__FBSDID("$FreeBSD: head/sys/vm/vm_radix.c 254141 2013-08-09 11:28:55Z attilio $");
53226645Sattilio
54246726Sattilio#include "opt_ddb.h"
55246726Sattilio
56226645Sattilio#include <sys/param.h>
57226645Sattilio#include <sys/systm.h>
58226645Sattilio#include <sys/kernel.h>
59246840Sattilio#include <sys/vmmeter.h>
60246726Sattilio
61226873Sattilio#include <vm/uma.h>
62226645Sattilio#include <vm/vm.h>
63226876Sjeff#include <vm/vm_param.h>
64226873Sattilio#include <vm/vm_page.h>
65226645Sattilio#include <vm/vm_radix.h>
66226645Sattilio
67246726Sattilio#ifdef DDB
68246726Sattilio#include <ddb/ddb.h>
69246726Sattilio#endif
70226645Sattilio
71233034Sattilio/*
72247771Salc * These widths should allow the pointers to a node's children to fit within
73247771Salc * a single cache line.  The extra levels from a narrow width should not be
74247771Salc * a problem thanks to path compression.
75233034Sattilio */
76246726Sattilio#ifdef __LP64__
77246726Sattilio#define	VM_RADIX_WIDTH	4
78233034Sattilio#else
79246726Sattilio#define	VM_RADIX_WIDTH	3
80233034Sattilio#endif
81233034Sattilio
82232326Sattilio#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
83232326Sattilio#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
84246726Sattilio#define	VM_RADIX_LIMIT							\
85246726Sattilio	(howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH) - 1)
86232326Sattilio
87232326Sattilio/* Flag bits stored in node pointers. */
88246726Sattilio#define	VM_RADIX_ISLEAF	0x1
89246726Sattilio#define	VM_RADIX_FLAGS	0x1
90246726Sattilio#define	VM_RADIX_PAD	VM_RADIX_FLAGS
91232326Sattilio
92246726Sattilio/* Returns one unit associated with specified level. */
93246726Sattilio#define	VM_RADIX_UNITLEVEL(lev)						\
94250520Salc	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
95232326Sattilio
96232326Sattiliostruct vm_radix_node {
97246726Sattilio	vm_pindex_t	 rn_owner;			/* Owner of record. */
98246726Sattilio	uint16_t	 rn_count;			/* Valid children. */
99246726Sattilio	uint16_t	 rn_clev;			/* Current level. */
100249221Salc	void		*rn_child[VM_RADIX_COUNT];	/* Child nodes. */
101232326Sattilio};
102232326Sattilio
103226873Sattiliostatic uma_zone_t vm_radix_node_zone;
104226645Sattilio
105246794Sattilio/*
106254141Sattilio * Allocate a radix node.
107246726Sattilio */
108246726Sattiliostatic __inline struct vm_radix_node *
109246726Sattiliovm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
110226873Sattilio{
111246726Sattilio	struct vm_radix_node *rnode;
112226645Sattilio
113254141Sattilio	rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
114247742Sattilio	if (rnode == NULL)
115254141Sattilio		return (NULL);
116246726Sattilio	rnode->rn_owner = owner;
117246726Sattilio	rnode->rn_count = count;
118246726Sattilio	rnode->rn_clev = clevel;
119246726Sattilio	return (rnode);
120246726Sattilio}
121226873Sattilio
122246726Sattilio/*
123246726Sattilio * Free radix node.
124246726Sattilio */
125246726Sattiliostatic __inline void
126246726Sattiliovm_radix_node_put(struct vm_radix_node *rnode)
127246726Sattilio{
128246726Sattilio
129246726Sattilio	uma_zfree(vm_radix_node_zone, rnode);
130226873Sattilio}
131226873Sattilio
132246726Sattilio/*
133246726Sattilio * Return the position in the array for a given level.
134246726Sattilio */
135246726Sattiliostatic __inline int
136246726Sattiliovm_radix_slot(vm_pindex_t index, uint16_t level)
137226873Sattilio{
138226873Sattilio
139250520Salc	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
140226873Sattilio}
141226873Sattilio
142246726Sattilio/* Trims the key after the specified level. */
143246726Sattiliostatic __inline vm_pindex_t
144246726Sattiliovm_radix_trimkey(vm_pindex_t index, uint16_t level)
145226873Sattilio{
146246726Sattilio	vm_pindex_t ret;
147226873Sattilio
148246726Sattilio	ret = index;
149250520Salc	if (level > 0) {
150250520Salc		ret >>= level * VM_RADIX_WIDTH;
151250520Salc		ret <<= level * VM_RADIX_WIDTH;
152246726Sattilio	}
153246726Sattilio	return (ret);
154226873Sattilio}
155226873Sattilio
156226645Sattilio/*
157246726Sattilio * Get the root node for a radix tree.
158226873Sattilio */
159246726Sattiliostatic __inline struct vm_radix_node *
160246726Sattiliovm_radix_getroot(struct vm_radix *rtree)
161226873Sattilio{
162226873Sattilio
163249221Salc	return ((struct vm_radix_node *)rtree->rt_root);
164226873Sattilio}
165226873Sattilio
166226873Sattilio/*
167246726Sattilio * Set the root node for a radix tree.
168226645Sattilio */
169246726Sattiliostatic __inline void
170246726Sattiliovm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
171226645Sattilio{
172226645Sattilio
173246726Sattilio	rtree->rt_root = (uintptr_t)rnode;
174226645Sattilio}
175226645Sattilio
176226645Sattilio/*
177248728Salc * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
178248728Salc */
179248728Salcstatic __inline boolean_t
180248728Salcvm_radix_isleaf(struct vm_radix_node *rnode)
181248728Salc{
182248728Salc
183248728Salc	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
184248728Salc}
185248728Salc
186248728Salc/*
187249038Salc * Returns the associated page extracted from rnode.
188226645Sattilio */
189246726Sattiliostatic __inline vm_page_t
190249038Salcvm_radix_topage(struct vm_radix_node *rnode)
191226645Sattilio{
192226645Sattilio
193249038Salc	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
194226645Sattilio}
195226645Sattilio
196226645Sattilio/*
197247771Salc * Adds the page as a child of the provided node.
198226645Sattilio */
199246726Sattiliostatic __inline void
200246726Sattiliovm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
201246726Sattilio    vm_page_t page)
202226645Sattilio{
203246726Sattilio	int slot;
204226645Sattilio
205246726Sattilio	slot = vm_radix_slot(index, clev);
206246726Sattilio	rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
207226645Sattilio}
208226645Sattilio
209233034Sattilio/*
210246726Sattilio * Returns the slot where two keys differ.
211246726Sattilio * It cannot accept 2 equal keys.
212233034Sattilio */
213246726Sattiliostatic __inline uint16_t
214246726Sattiliovm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
215226873Sattilio{
216246726Sattilio	uint16_t clev;
217226873Sattilio
218246726Sattilio	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
219246726Sattilio	    __func__, (uintmax_t)index1));
220233034Sattilio
221246726Sattilio	index1 ^= index2;
222250520Salc	for (clev = VM_RADIX_LIMIT;; clev--)
223250334Salc		if (vm_radix_slot(index1, clev) != 0)
224246726Sattilio			return (clev);
225226873Sattilio}
226226873Sattilio
227226645Sattilio/*
228246726Sattilio * Returns TRUE if it can be determined that key does not belong to the
229247771Salc * specified rnode.  Otherwise, returns FALSE.
230226876Sjeff */
231246726Sattiliostatic __inline boolean_t
232246726Sattiliovm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
233226876Sjeff{
234226876Sjeff
235250520Salc	if (rnode->rn_clev < VM_RADIX_LIMIT) {
236250520Salc		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
237249211Salc		return (idx != rnode->rn_owner);
238246726Sattilio	}
239246726Sattilio	return (FALSE);
240226876Sjeff}
241226876Sjeff
242226876Sjeff/*
243247773Salc * Internal helper for vm_radix_reclaim_allnodes().
244247769Salc * This function is recursive.
245246726Sattilio */
246226980Sattiliostatic void
247246726Sattiliovm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
248226980Sattilio{
249226980Sattilio	int slot;
250226980Sattilio
251248684Salc	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
252248684Salc	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
253248684Salc	for (slot = 0; rnode->rn_count != 0; slot++) {
254226980Sattilio		if (rnode->rn_child[slot] == NULL)
255226980Sattilio			continue;
256248728Salc		if (!vm_radix_isleaf(rnode->rn_child[slot]))
257246726Sattilio			vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]);
258248431Salc		rnode->rn_child[slot] = NULL;
259226980Sattilio		rnode->rn_count--;
260226980Sattilio	}
261226980Sattilio	vm_radix_node_put(rnode);
262226980Sattilio}
263226980Sattilio
264246836Sattilio#ifdef INVARIANTS
265226876Sjeff/*
266246836Sattilio * Radix node zone destructor.
267246836Sattilio */
268246836Sattiliostatic void
269246836Sattiliovm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
270246836Sattilio{
271246836Sattilio	struct vm_radix_node *rnode;
272248431Salc	int slot;
273246836Sattilio
274246836Sattilio	rnode = mem;
275246836Sattilio	KASSERT(rnode->rn_count == 0,
276248431Salc	    ("vm_radix_node_put: rnode %p has %d children", rnode,
277246836Sattilio	    rnode->rn_count));
278248431Salc	for (slot = 0; slot < VM_RADIX_COUNT; slot++)
279248431Salc		KASSERT(rnode->rn_child[slot] == NULL,
280248431Salc		    ("vm_radix_node_put: rnode %p has a child", rnode));
281246836Sattilio}
282246836Sattilio#endif
283246836Sattilio
284254141Sattilio#ifndef UMA_MD_SMALL_ALLOC
285246836Sattilio/*
286254141Sattilio * Reserve the KVA necessary to satisfy the node allocation.
287254141Sattilio * This is mandatory in architectures not supporting direct
288254141Sattilio * mapping as they will need otherwise to carve into the kernel maps for
289254141Sattilio * every node allocation, resulting into deadlocks for consumers already
290254141Sattilio * working with kernel maps.
291248431Salc */
292246794Sattiliostatic void
293254141Sattiliovm_radix_reserve_kva(void *arg __unused)
294246726Sattilio{
295246726Sattilio
296249605Salc	/*
297249605Salc	 * Calculate the number of reserved nodes, discounting the pages that
298249605Salc	 * are needed to store them.
299249605Salc	 */
300254141Sattilio	if (!uma_zone_reserve_kva(vm_radix_node_zone,
301254141Sattilio	    ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
302254141Sattilio	    sizeof(struct vm_radix_node))))
303254141Sattilio		panic("%s: unable to reserve KVA", __func__);
304247742Sattilio}
305254141SattilioSYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
306254141Sattilio    vm_radix_reserve_kva, NULL);
307254141Sattilio#endif
308247742Sattilio
309247742Sattilio/*
310247742Sattilio * Initialize the UMA slab zone.
311247742Sattilio * Until vm_radix_prealloc() is called, the zone will be served by the
312247742Sattilio * UMA boot-time pre-allocated pool of pages.
313247742Sattilio */
314247742Sattiliovoid
315247742Sattiliovm_radix_init(void)
316247742Sattilio{
317247742Sattilio
318246726Sattilio	vm_radix_node_zone = uma_zcreate("RADIX NODE",
319246726Sattilio	    sizeof(struct vm_radix_node), NULL,
320246726Sattilio#ifdef INVARIANTS
321246726Sattilio	    vm_radix_node_zone_dtor,
322246726Sattilio#else
323246726Sattilio	    NULL,
324246726Sattilio#endif
325254141Sattilio	    NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM);
326246726Sattilio}
327246726Sattilio
328246726Sattilio/*
329247771Salc * Inserts the key-value pair into the trie.
330226645Sattilio * Panics if the key already exists.
331226645Sattilio */
332254141Sattilioint
333248428Salcvm_radix_insert(struct vm_radix *rtree, vm_page_t page)
334226645Sattilio{
335248428Salc	vm_pindex_t index, newind;
336249502Salc	void **parentp;
337249502Salc	struct vm_radix_node *rnode, *tmp;
338246726Sattilio	vm_page_t m;
339236763Sattilio	int slot;
340246726Sattilio	uint16_t clev;
341226645Sattilio
342248428Salc	index = page->pindex;
343248222Sattilio
344254141Sattiliorestart:
345254141Sattilio
346226645Sattilio	/*
347246726Sattilio	 * The owner of record for root is not really important because it
348246726Sattilio	 * will never be used.
349226645Sattilio	 */
350246726Sattilio	rnode = vm_radix_getroot(rtree);
351246726Sattilio	if (rnode == NULL) {
352249502Salc		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
353254141Sattilio		return (0);
354246726Sattilio	}
355249502Salc	parentp = (void **)&rtree->rt_root;
356249502Salc	for (;;) {
357249502Salc		if (vm_radix_isleaf(rnode)) {
358249502Salc			m = vm_radix_topage(rnode);
359246726Sattilio			if (m->pindex == index)
360246726Sattilio				panic("%s: key %jx is already present",
361246726Sattilio				    __func__, (uintmax_t)index);
362246726Sattilio			clev = vm_radix_keydiff(m->pindex, index);
363254141Sattilio
364254141Sattilio			/*
365254141Sattilio			 * During node allocation the trie that is being
366254141Sattilio			 * walked can be modified because of recursing radix
367254141Sattilio			 * trie operations.
368254141Sattilio			 * If this is the case, the recursing functions signal
369254141Sattilio			 * such situation and the insert operation must
370254141Sattilio			 * start from scratch again.
371254141Sattilio			 * The freed radix node will then be in the UMA
372254141Sattilio			 * caches very likely to avoid the same situation
373254141Sattilio			 * to happen.
374254141Sattilio			 */
375254141Sattilio			rtree->rt_flags |= RT_INSERT_INPROG;
376246726Sattilio			tmp = vm_radix_node_get(vm_radix_trimkey(index,
377250520Salc			    clev + 1), 2, clev);
378254141Sattilio			rtree->rt_flags &= ~RT_INSERT_INPROG;
379254141Sattilio			if (tmp == NULL) {
380254141Sattilio				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
381254141Sattilio				return (ENOMEM);
382254141Sattilio			}
383254141Sattilio			if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
384254141Sattilio				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
385254141Sattilio				tmp->rn_count = 0;
386254141Sattilio				vm_radix_node_put(tmp);
387254141Sattilio				goto restart;
388254141Sattilio			}
389249502Salc			*parentp = tmp;
390246726Sattilio			vm_radix_addpage(tmp, index, clev, page);
391246726Sattilio			vm_radix_addpage(tmp, m->pindex, clev, m);
392254141Sattilio			return (0);
393249502Salc		} else if (vm_radix_keybarr(rnode, index))
394249502Salc			break;
395249502Salc		slot = vm_radix_slot(index, rnode->rn_clev);
396226645Sattilio		if (rnode->rn_child[slot] == NULL) {
397236760Sattilio			rnode->rn_count++;
398246726Sattilio			vm_radix_addpage(rnode, index, rnode->rn_clev, page);
399254141Sattilio			return (0);
400246726Sattilio		}
401249502Salc		parentp = &rnode->rn_child[slot];
402226645Sattilio		rnode = rnode->rn_child[slot];
403249502Salc	}
404226645Sattilio
405246726Sattilio	/*
406246726Sattilio	 * A new node is needed because the right insertion level is reached.
407246726Sattilio	 * Setup the new intermediate node and add the 2 children: the
408246726Sattilio	 * new object and the older edge.
409246726Sattilio	 */
410249182Salc	newind = rnode->rn_owner;
411249182Salc	clev = vm_radix_keydiff(newind, index);
412254141Sattilio
413254141Sattilio	/* See the comments above. */
414254141Sattilio	rtree->rt_flags |= RT_INSERT_INPROG;
415254141Sattilio	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
416254141Sattilio	rtree->rt_flags &= ~RT_INSERT_INPROG;
417254141Sattilio	if (tmp == NULL) {
418254141Sattilio		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
419254141Sattilio		return (ENOMEM);
420254141Sattilio	}
421254141Sattilio	if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
422254141Sattilio		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
423254141Sattilio		tmp->rn_count = 0;
424254141Sattilio		vm_radix_node_put(tmp);
425254141Sattilio		goto restart;
426254141Sattilio	}
427249502Salc	*parentp = tmp;
428249182Salc	vm_radix_addpage(tmp, index, clev, page);
429246726Sattilio	slot = vm_radix_slot(newind, clev);
430249182Salc	tmp->rn_child[slot] = rnode;
431254141Sattilio	return (0);
432226645Sattilio}
433226645Sattilio
434226645Sattilio/*
435247771Salc * Returns the value stored at the index.  If the index is not present,
436226645Sattilio * NULL is returned.
437226645Sattilio */
438246430Sattiliovm_page_t
439238245Sattiliovm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
440226645Sattilio{
441226645Sattilio	struct vm_radix_node *rnode;
442246726Sattilio	vm_page_t m;
443226645Sattilio	int slot;
444226645Sattilio
445246726Sattilio	rnode = vm_radix_getroot(rtree);
446246795Sattilio	while (rnode != NULL) {
447249038Salc		if (vm_radix_isleaf(rnode)) {
448249038Salc			m = vm_radix_topage(rnode);
449246726Sattilio			if (m->pindex == index)
450246726Sattilio				return (m);
451246726Sattilio			else
452249427Salc				break;
453249427Salc		} else if (vm_radix_keybarr(rnode, index))
454249427Salc			break;
455249427Salc		slot = vm_radix_slot(index, rnode->rn_clev);
456249427Salc		rnode = rnode->rn_child[slot];
457226645Sattilio	}
458246726Sattilio	return (NULL);
459226645Sattilio}
460226645Sattilio
461226645Sattilio/*
462247771Salc * Look up the nearest entry at a position bigger than or equal to index.
463226645Sattilio */
464246726Sattiliovm_page_t
465246726Sattiliovm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
466226645Sattilio{
467250259Salc	struct vm_radix_node *stack[VM_RADIX_LIMIT];
468246726Sattilio	vm_pindex_t inc;
469246726Sattilio	vm_page_t m;
470249038Salc	struct vm_radix_node *child, *rnode;
471246726Sattilio#ifdef INVARIANTS
472246726Sattilio	int loops = 0;
473246726Sattilio#endif
474250259Salc	int slot, tos;
475226645Sattilio
476249427Salc	rnode = vm_radix_getroot(rtree);
477249427Salc	if (rnode == NULL)
478249427Salc		return (NULL);
479249427Salc	else if (vm_radix_isleaf(rnode)) {
480249427Salc		m = vm_radix_topage(rnode);
481249427Salc		if (m->pindex >= index)
482249427Salc			return (m);
483249427Salc		else
484249427Salc			return (NULL);
485249427Salc	}
486250259Salc	tos = 0;
487249427Salc	for (;;) {
488246726Sattilio		/*
489249986Salc		 * If the keys differ before the current bisection node,
490249986Salc		 * then the search key might rollback to the earliest
491249986Salc		 * available bisection node or to the smallest key
492249986Salc		 * in the current node (if the owner is bigger than the
493246726Sattilio		 * search key).
494246726Sattilio		 */
495247770Salc		if (vm_radix_keybarr(rnode, index)) {
496246726Sattilio			if (index > rnode->rn_owner) {
497250259Salcascend:
498250259Salc				KASSERT(++loops < 1000,
499250259Salc				    ("vm_radix_lookup_ge: too many loops"));
500250259Salc
501250259Salc				/*
502250259Salc				 * Pop nodes from the stack until either the
503250259Salc				 * stack is empty or a node that could have a
504250259Salc				 * matching descendant is found.
505250259Salc				 */
506250259Salc				do {
507250259Salc					if (tos == 0)
508250259Salc						return (NULL);
509250259Salc					rnode = stack[--tos];
510250259Salc				} while (vm_radix_slot(index,
511250259Salc				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));
512250259Salc
513250259Salc				/*
514250259Salc				 * The following computation cannot overflow
515250259Salc				 * because index's slot at the current level
516250259Salc				 * is less than VM_RADIX_COUNT - 1.
517250259Salc				 */
518250259Salc				index = vm_radix_trimkey(index,
519250259Salc				    rnode->rn_clev);
520250259Salc				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
521246726Sattilio			} else
522249986Salc				index = rnode->rn_owner;
523250259Salc			KASSERT(!vm_radix_keybarr(rnode, index),
524250259Salc			    ("vm_radix_lookup_ge: keybarr failed"));
525246726Sattilio		}
526246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
527249038Salc		child = rnode->rn_child[slot];
528249038Salc		if (vm_radix_isleaf(child)) {
529249038Salc			m = vm_radix_topage(child);
530249038Salc			if (m->pindex >= index)
531249038Salc				return (m);
532249038Salc		} else if (child != NULL)
533249038Salc			goto descend;
534246726Sattilio
535226645Sattilio		/*
536246726Sattilio		 * Look for an available edge or page within the current
537246726Sattilio		 * bisection node.
538246726Sattilio		 */
539246726Sattilio                if (slot < (VM_RADIX_COUNT - 1)) {
540246726Sattilio			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
541246726Sattilio			index = vm_radix_trimkey(index, rnode->rn_clev);
542249038Salc			do {
543249038Salc				index += inc;
544249038Salc				slot++;
545249038Salc				child = rnode->rn_child[slot];
546249038Salc				if (vm_radix_isleaf(child)) {
547249038Salc					m = vm_radix_topage(child);
548249038Salc					if (m->pindex >= index)
549249038Salc						return (m);
550249038Salc				} else if (child != NULL)
551249038Salc					goto descend;
552249038Salc			} while (slot < (VM_RADIX_COUNT - 1));
553246726Sattilio		}
554249038Salc		KASSERT(child == NULL || vm_radix_isleaf(child),
555249038Salc		    ("vm_radix_lookup_ge: child is radix node"));
556230750Sattilio
557246726Sattilio		/*
558250259Salc		 * If a page or edge bigger than the search slot is not found
559250259Salc		 * in the current node, ascend to the next higher-level node.
560246726Sattilio		 */
561250259Salc		goto ascend;
562249038Salcdescend:
563250520Salc		KASSERT(rnode->rn_clev > 0,
564250259Salc		    ("vm_radix_lookup_ge: pushing leaf's parent"));
565250259Salc		KASSERT(tos < VM_RADIX_LIMIT,
566250259Salc		    ("vm_radix_lookup_ge: stack overflow"));
567250259Salc		stack[tos++] = rnode;
568249038Salc		rnode = child;
569226645Sattilio	}
570226645Sattilio}
571226645Sattilio
572226645Sattilio/*
573247771Salc * Look up the nearest entry at a position less than or equal to index.
574226645Sattilio */
575246430Sattiliovm_page_t
576238245Sattiliovm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
577226645Sattilio{
578250259Salc	struct vm_radix_node *stack[VM_RADIX_LIMIT];
579246726Sattilio	vm_pindex_t inc;
580246726Sattilio	vm_page_t m;
581249038Salc	struct vm_radix_node *child, *rnode;
582246726Sattilio#ifdef INVARIANTS
583246726Sattilio	int loops = 0;
584246726Sattilio#endif
585250259Salc	int slot, tos;
586226645Sattilio
587249427Salc	rnode = vm_radix_getroot(rtree);
588249427Salc	if (rnode == NULL)
589249427Salc		return (NULL);
590249427Salc	else if (vm_radix_isleaf(rnode)) {
591249427Salc		m = vm_radix_topage(rnode);
592249427Salc		if (m->pindex <= index)
593249427Salc			return (m);
594249427Salc		else
595249427Salc			return (NULL);
596249427Salc	}
597250259Salc	tos = 0;
598249427Salc	for (;;) {
599226646Sjeff		/*
600249986Salc		 * If the keys differ before the current bisection node,
601249986Salc		 * then the search key might rollback to the earliest
602249986Salc		 * available bisection node or to the largest key
603249986Salc		 * in the current node (if the owner is smaller than the
604246726Sattilio		 * search key).
605226646Sjeff		 */
606247770Salc		if (vm_radix_keybarr(rnode, index)) {
607246726Sattilio			if (index > rnode->rn_owner) {
608249986Salc				index = rnode->rn_owner + VM_RADIX_COUNT *
609250259Salc				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
610249986Salc			} else {
611250259Salcascend:
612250259Salc				KASSERT(++loops < 1000,
613250259Salc				    ("vm_radix_lookup_le: too many loops"));
614250259Salc
615250259Salc				/*
616250259Salc				 * Pop nodes from the stack until either the
617250259Salc				 * stack is empty or a node that could have a
618250259Salc				 * matching descendant is found.
619250259Salc				 */
620250259Salc				do {
621250259Salc					if (tos == 0)
622250259Salc						return (NULL);
623250259Salc					rnode = stack[--tos];
624250259Salc				} while (vm_radix_slot(index,
625250259Salc				    rnode->rn_clev) == 0);
626250259Salc
627250259Salc				/*
628250259Salc				 * The following computation cannot overflow
629250259Salc				 * because index's slot at the current level
630250259Salc				 * is greater than 0.
631250259Salc				 */
632250259Salc				index = vm_radix_trimkey(index,
633250259Salc				    rnode->rn_clev);
634249986Salc			}
635250259Salc			index--;
636250259Salc			KASSERT(!vm_radix_keybarr(rnode, index),
637250259Salc			    ("vm_radix_lookup_le: keybarr failed"));
638246726Sattilio		}
639246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
640249038Salc		child = rnode->rn_child[slot];
641249038Salc		if (vm_radix_isleaf(child)) {
642249038Salc			m = vm_radix_topage(child);
643249038Salc			if (m->pindex <= index)
644249038Salc				return (m);
645249038Salc		} else if (child != NULL)
646249038Salc			goto descend;
647246726Sattilio
648246726Sattilio		/*
649246726Sattilio		 * Look for an available edge or page within the current
650246726Sattilio		 * bisection node.
651246726Sattilio		 */
652246726Sattilio		if (slot > 0) {
653246726Sattilio			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
654246726Sattilio			index |= inc - 1;
655249038Salc			do {
656249038Salc				index -= inc;
657249038Salc				slot--;
658249038Salc				child = rnode->rn_child[slot];
659249038Salc				if (vm_radix_isleaf(child)) {
660249038Salc					m = vm_radix_topage(child);
661249038Salc					if (m->pindex <= index)
662249038Salc						return (m);
663249038Salc				} else if (child != NULL)
664249038Salc					goto descend;
665249038Salc			} while (slot > 0);
666226646Sjeff		}
667249038Salc		KASSERT(child == NULL || vm_radix_isleaf(child),
668249038Salc		    ("vm_radix_lookup_le: child is radix node"));
669246726Sattilio
670246726Sattilio		/*
671250259Salc		 * If a page or edge smaller than the search slot is not found
672250259Salc		 * in the current node, ascend to the next higher-level node.
673246726Sattilio		 */
674250259Salc		goto ascend;
675249038Salcdescend:
676250520Salc		KASSERT(rnode->rn_clev > 0,
677250259Salc		    ("vm_radix_lookup_le: pushing leaf's parent"));
678250259Salc		KASSERT(tos < VM_RADIX_LIMIT,
679250259Salc		    ("vm_radix_lookup_le: stack overflow"));
680250259Salc		stack[tos++] = rnode;
681249038Salc		rnode = child;
682226646Sjeff	}
683226645Sattilio}
684226645Sattilio
685226645Sattilio/*
686246726Sattilio * Remove the specified index from the tree.
687246726Sattilio * Panics if the key is not present.
688226645Sattilio */
689236763Sattiliovoid
690238245Sattiliovm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
691226645Sattilio{
692246726Sattilio	struct vm_radix_node *rnode, *parent;
693246726Sattilio	vm_page_t m;
694246726Sattilio	int i, slot;
695226645Sattilio
696254141Sattilio	/*
697254141Sattilio	 * Detect if a page is going to be removed from a trie which is
698254141Sattilio	 * already undergoing another trie operation.
699254141Sattilio	 * Right now this is only possible for vm_radix_remove() recursing
700254141Sattilio	 * into vm_radix_insert().
701254141Sattilio	 * If this is the case, the caller must be notified about this
702254141Sattilio	 * situation.  It will also takecare to update the RT_TRIE_MODIFIED
703254141Sattilio	 * accordingly.
704254141Sattilio	 * The RT_TRIE_MODIFIED bit is set here because the remove operation
705254141Sattilio	 * will always succeed.
706254141Sattilio	 */
707254141Sattilio	if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
708254141Sattilio		rtree->rt_flags |= RT_TRIE_MODIFIED;
709254141Sattilio
710249502Salc	rnode = vm_radix_getroot(rtree);
711249502Salc	if (vm_radix_isleaf(rnode)) {
712249502Salc		m = vm_radix_topage(rnode);
713249502Salc		if (m->pindex != index)
714249502Salc			panic("%s: invalid key found", __func__);
715249502Salc		vm_radix_setroot(rtree, NULL);
716249502Salc		return;
717249502Salc	}
718246726Sattilio	parent = NULL;
719236760Sattilio	for (;;) {
720246726Sattilio		if (rnode == NULL)
721246726Sattilio			panic("vm_radix_remove: impossible to locate the key");
722246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
723249038Salc		if (vm_radix_isleaf(rnode->rn_child[slot])) {
724249038Salc			m = vm_radix_topage(rnode->rn_child[slot]);
725249038Salc			if (m->pindex != index)
726249038Salc				panic("%s: invalid key found", __func__);
727246726Sattilio			rnode->rn_child[slot] = NULL;
728246726Sattilio			rnode->rn_count--;
729246726Sattilio			if (rnode->rn_count > 1)
730246726Sattilio				break;
731246726Sattilio			for (i = 0; i < VM_RADIX_COUNT; i++)
732246726Sattilio				if (rnode->rn_child[i] != NULL)
733246726Sattilio					break;
734246726Sattilio			KASSERT(i != VM_RADIX_COUNT,
735246726Sattilio			    ("%s: invalid node configuration", __func__));
736249502Salc			if (parent == NULL)
737249502Salc				vm_radix_setroot(rtree, rnode->rn_child[i]);
738249502Salc			else {
739249502Salc				slot = vm_radix_slot(index, parent->rn_clev);
740249502Salc				KASSERT(parent->rn_child[slot] == rnode,
741249502Salc				    ("%s: invalid child value", __func__));
742249502Salc				parent->rn_child[slot] = rnode->rn_child[i];
743249502Salc			}
744246726Sattilio			rnode->rn_count--;
745246726Sattilio			rnode->rn_child[i] = NULL;
746246726Sattilio			vm_radix_node_put(rnode);
747236760Sattilio			break;
748236760Sattilio		}
749246726Sattilio		parent = rnode;
750246726Sattilio		rnode = rnode->rn_child[slot];
751236760Sattilio	}
752226645Sattilio}
753226645Sattilio
754226645Sattilio/*
755226980Sattilio * Remove and free all the nodes from the radix tree.
756247771Salc * This function is recursive but there is a tight control on it as the
757226980Sattilio * maximum depth of the tree is fixed.
758226980Sattilio */
759226980Sattiliovoid
760226980Sattiliovm_radix_reclaim_allnodes(struct vm_radix *rtree)
761226980Sattilio{
762226980Sattilio	struct vm_radix_node *root;
763226980Sattilio
764254141Sattilio	KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
765254141Sattilio	    ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
766254141Sattilio
767246726Sattilio	root = vm_radix_getroot(rtree);
768246726Sattilio	if (root == NULL)
769226980Sattilio		return;
770248684Salc	vm_radix_setroot(rtree, NULL);
771249502Salc	if (!vm_radix_isleaf(root))
772249502Salc		vm_radix_reclaim_allnodes_int(root);
773226980Sattilio}
774246726Sattilio
775254141Sattilio/*
776254141Sattilio * Replace an existing page into the trie with another one.
777254141Sattilio * Panics if the replacing page is not present or if the new page has an
778254141Sattilio * invalid key.
779254141Sattilio */
780254141Sattiliovm_page_t
781254141Sattiliovm_radix_replace(struct vm_radix *rtree, vm_page_t newpage, vm_pindex_t index)
782254141Sattilio{
783254141Sattilio	struct vm_radix_node *rnode;
784254141Sattilio	vm_page_t m;
785254141Sattilio	int slot;
786254141Sattilio
787254141Sattilio	KASSERT(newpage->pindex == index, ("%s: newpage index invalid",
788254141Sattilio	    __func__));
789254141Sattilio
790254141Sattilio	rnode = vm_radix_getroot(rtree);
791254141Sattilio	if (rnode == NULL)
792254141Sattilio		panic("%s: replacing page on an empty trie", __func__);
793254141Sattilio	if (vm_radix_isleaf(rnode)) {
794254141Sattilio		m = vm_radix_topage(rnode);
795254141Sattilio		if (m->pindex != index)
796254141Sattilio			panic("%s: original replacing root key not found",
797254141Sattilio			    __func__);
798254141Sattilio		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
799254141Sattilio		return (m);
800254141Sattilio	}
801254141Sattilio	for (;;) {
802254141Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
803254141Sattilio		if (vm_radix_isleaf(rnode->rn_child[slot])) {
804254141Sattilio			m = vm_radix_topage(rnode->rn_child[slot]);
805254141Sattilio			if (m->pindex == index) {
806254141Sattilio				rnode->rn_child[slot] =
807254141Sattilio				    (void *)((uintptr_t)newpage |
808254141Sattilio				    VM_RADIX_ISLEAF);
809254141Sattilio				return (m);
810254141Sattilio			} else
811254141Sattilio				break;
812254141Sattilio		} else if (rnode->rn_child[slot] == NULL ||
813254141Sattilio		    vm_radix_keybarr(rnode->rn_child[slot], index))
814254141Sattilio			break;
815254141Sattilio		rnode = rnode->rn_child[slot];
816254141Sattilio	}
817254141Sattilio	panic("%s: original replacing page not found", __func__);
818254141Sattilio}
819254141Sattilio
820246726Sattilio#ifdef DDB
821246726Sattilio/*
822246837Sattilio * Show details about the given radix node.
823246726Sattilio */
824246726SattilioDB_SHOW_COMMAND(radixnode, db_show_radixnode)
825246726Sattilio{
826246726Sattilio	struct vm_radix_node *rnode;
827246726Sattilio	int i;
828246726Sattilio
829246726Sattilio        if (!have_addr)
830246726Sattilio                return;
831246726Sattilio	rnode = (struct vm_radix_node *)addr;
832246726Sattilio	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
833246726Sattilio	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
834246726Sattilio	    rnode->rn_clev);
835246726Sattilio	for (i = 0; i < VM_RADIX_COUNT; i++)
836246726Sattilio		if (rnode->rn_child[i] != NULL)
837246726Sattilio			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
838246726Sattilio			    i, (void *)rnode->rn_child[i],
839249038Salc			    vm_radix_isleaf(rnode->rn_child[i]) ?
840249038Salc			    vm_radix_topage(rnode->rn_child[i]) : NULL,
841246726Sattilio			    rnode->rn_clev);
842246726Sattilio}
843246726Sattilio#endif /* DDB */
844