1226645Sattilio/*
2246726Sattilio * Copyright (c) 2013 EMC Corp.
3226930Sjeff * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4226645Sattilio * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5226645Sattilio * All rights reserved.
6226645Sattilio *
7226645Sattilio * Redistribution and use in source and binary forms, with or without
8226645Sattilio * modification, are permitted provided that the following conditions
9226645Sattilio * are met:
10226645Sattilio * 1. Redistributions of source code must retain the above copyright
11226645Sattilio *    notice, this list of conditions and the following disclaimer.
12226645Sattilio * 2. Redistributions in binary form must reproduce the above copyright
13226645Sattilio *    notice, this list of conditions and the following disclaimer in the
14226645Sattilio *    documentation and/or other materials provided with the distribution.
15226645Sattilio *
16226645Sattilio * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17226645Sattilio * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18226645Sattilio * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19226645Sattilio * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20226645Sattilio * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21226645Sattilio * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22226645Sattilio * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23226645Sattilio * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24226645Sattilio * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25226645Sattilio * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26226645Sattilio * SUCH DAMAGE.
27226645Sattilio *
28226645Sattilio */
29226645Sattilio
30226645Sattilio/*
31246726Sattilio * Path-compressed radix trie implementation.
32246726Sattilio * The following code is not generalized into a general purpose library
33246726Sattilio * because there are way too many parameters embedded that should really
34246726Sattilio * be decided by the library consumers.  At the same time, consumers
35246726Sattilio * of this code must achieve highest possible performance.
36246726Sattilio *
37246726Sattilio * The implementation takes into account the following rationale:
38248424Sattilio * - Size of the nodes should be as small as possible but still big enough
39248424Sattilio *   to avoid a large maximum depth for the trie.  This is a balance
40248424Sattilio *   between the necessity to not wire too much physical memory for the nodes
41248424Sattilio *   and the necessity to avoid too much cache pollution during the trie
42248424Sattilio *   operations.
43248424Sattilio * - There is not a huge bias toward the number of lookup operations over
44248424Sattilio *   the number of insert and remove operations.  This basically implies
45248424Sattilio *   that optimizations supposedly helping one operation but hurting the
46248424Sattilio *   other might be carefully evaluated.
47248424Sattilio * - On average not many nodes are expected to be fully populated, hence
48248424Sattilio *   level compression may just complicate things.
49226645Sattilio */
50226645Sattilio
51226645Sattilio#include <sys/cdefs.h>
52248448Sattilio__FBSDID("$FreeBSD: stable/11/sys/vm/vm_radix.c 327785 2018-01-10 20:39:26Z markj $");
53226645Sattilio
54246726Sattilio#include "opt_ddb.h"
55246726Sattilio
56226645Sattilio#include <sys/param.h>
57226645Sattilio#include <sys/systm.h>
58226645Sattilio#include <sys/kernel.h>
59246840Sattilio#include <sys/vmmeter.h>
60246726Sattilio
61226873Sattilio#include <vm/uma.h>
62226645Sattilio#include <vm/vm.h>
63226876Sjeff#include <vm/vm_param.h>
64226873Sattilio#include <vm/vm_page.h>
65226645Sattilio#include <vm/vm_radix.h>
66226645Sattilio
67246726Sattilio#ifdef DDB
68246726Sattilio#include <ddb/ddb.h>
69246726Sattilio#endif
70226645Sattilio
71233034Sattilio/*
72247771Salc * These widths should allow the pointers to a node's children to fit within
73247771Salc * a single cache line.  The extra levels from a narrow width should not be
74247771Salc * a problem thanks to path compression.
75233034Sattilio */
76246726Sattilio#ifdef __LP64__
77246726Sattilio#define	VM_RADIX_WIDTH	4
78233034Sattilio#else
79246726Sattilio#define	VM_RADIX_WIDTH	3
80233034Sattilio#endif
81233034Sattilio
82232326Sattilio#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
83232326Sattilio#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
84246726Sattilio#define	VM_RADIX_LIMIT							\
85298482Spfg	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
86232326Sattilio
87232326Sattilio/* Flag bits stored in node pointers. */
88246726Sattilio#define	VM_RADIX_ISLEAF	0x1
89246726Sattilio#define	VM_RADIX_FLAGS	0x1
90246726Sattilio#define	VM_RADIX_PAD	VM_RADIX_FLAGS
91232326Sattilio
92246726Sattilio/* Returns one unit associated with specified level. */
93246726Sattilio#define	VM_RADIX_UNITLEVEL(lev)						\
94250520Salc	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
95232326Sattilio
96232326Sattiliostruct vm_radix_node {
97246726Sattilio	vm_pindex_t	 rn_owner;			/* Owner of record. */
98246726Sattilio	uint16_t	 rn_count;			/* Valid children. */
99246726Sattilio	uint16_t	 rn_clev;			/* Current level. */
100249221Salc	void		*rn_child[VM_RADIX_COUNT];	/* Child nodes. */
101232326Sattilio};
102232326Sattilio
103226873Sattiliostatic uma_zone_t vm_radix_node_zone;
104226645Sattilio
105246794Sattilio/*
106254141Sattilio * Allocate a radix node.
107246726Sattilio */
108246726Sattiliostatic __inline struct vm_radix_node *
109246726Sattiliovm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
110226873Sattilio{
111246726Sattilio	struct vm_radix_node *rnode;
112226645Sattilio
113254141Sattilio	rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
114247742Sattilio	if (rnode == NULL)
115254141Sattilio		return (NULL);
116246726Sattilio	rnode->rn_owner = owner;
117246726Sattilio	rnode->rn_count = count;
118246726Sattilio	rnode->rn_clev = clevel;
119246726Sattilio	return (rnode);
120246726Sattilio}
121226873Sattilio
122246726Sattilio/*
123246726Sattilio * Free radix node.
124246726Sattilio */
125246726Sattiliostatic __inline void
126246726Sattiliovm_radix_node_put(struct vm_radix_node *rnode)
127246726Sattilio{
128246726Sattilio
129246726Sattilio	uma_zfree(vm_radix_node_zone, rnode);
130226873Sattilio}
131226873Sattilio
132246726Sattilio/*
133246726Sattilio * Return the position in the array for a given level.
134246726Sattilio */
135246726Sattiliostatic __inline int
136246726Sattiliovm_radix_slot(vm_pindex_t index, uint16_t level)
137226873Sattilio{
138226873Sattilio
139250520Salc	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
140226873Sattilio}
141226873Sattilio
142246726Sattilio/* Trims the key after the specified level. */
143246726Sattiliostatic __inline vm_pindex_t
144246726Sattiliovm_radix_trimkey(vm_pindex_t index, uint16_t level)
145226873Sattilio{
146246726Sattilio	vm_pindex_t ret;
147226873Sattilio
148246726Sattilio	ret = index;
149250520Salc	if (level > 0) {
150250520Salc		ret >>= level * VM_RADIX_WIDTH;
151250520Salc		ret <<= level * VM_RADIX_WIDTH;
152246726Sattilio	}
153246726Sattilio	return (ret);
154226873Sattilio}
155226873Sattilio
156226645Sattilio/*
157246726Sattilio * Get the root node for a radix tree.
158226873Sattilio */
159246726Sattiliostatic __inline struct vm_radix_node *
160246726Sattiliovm_radix_getroot(struct vm_radix *rtree)
161226873Sattilio{
162226873Sattilio
163249221Salc	return ((struct vm_radix_node *)rtree->rt_root);
164226873Sattilio}
165226873Sattilio
166226873Sattilio/*
167246726Sattilio * Set the root node for a radix tree.
168226645Sattilio */
169246726Sattiliostatic __inline void
170246726Sattiliovm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
171226645Sattilio{
172226645Sattilio
173246726Sattilio	rtree->rt_root = (uintptr_t)rnode;
174226645Sattilio}
175226645Sattilio
176226645Sattilio/*
177248728Salc * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
178248728Salc */
179248728Salcstatic __inline boolean_t
180248728Salcvm_radix_isleaf(struct vm_radix_node *rnode)
181248728Salc{
182248728Salc
183248728Salc	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
184248728Salc}
185248728Salc
186248728Salc/*
187249038Salc * Returns the associated page extracted from rnode.
188226645Sattilio */
189246726Sattiliostatic __inline vm_page_t
190249038Salcvm_radix_topage(struct vm_radix_node *rnode)
191226645Sattilio{
192226645Sattilio
193249038Salc	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
194226645Sattilio}
195226645Sattilio
196226645Sattilio/*
197247771Salc * Adds the page as a child of the provided node.
198226645Sattilio */
199246726Sattiliostatic __inline void
200246726Sattiliovm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
201246726Sattilio    vm_page_t page)
202226645Sattilio{
203246726Sattilio	int slot;
204226645Sattilio
205246726Sattilio	slot = vm_radix_slot(index, clev);
206246726Sattilio	rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
207226645Sattilio}
208226645Sattilio
209233034Sattilio/*
210246726Sattilio * Returns the slot where two keys differ.
211246726Sattilio * It cannot accept 2 equal keys.
212233034Sattilio */
213246726Sattiliostatic __inline uint16_t
214246726Sattiliovm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
215226873Sattilio{
216246726Sattilio	uint16_t clev;
217226873Sattilio
218246726Sattilio	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
219246726Sattilio	    __func__, (uintmax_t)index1));
220233034Sattilio
221246726Sattilio	index1 ^= index2;
222250520Salc	for (clev = VM_RADIX_LIMIT;; clev--)
223250334Salc		if (vm_radix_slot(index1, clev) != 0)
224246726Sattilio			return (clev);
225226873Sattilio}
226226873Sattilio
227226645Sattilio/*
228246726Sattilio * Returns TRUE if it can be determined that key does not belong to the
229247771Salc * specified rnode.  Otherwise, returns FALSE.
230226876Sjeff */
231246726Sattiliostatic __inline boolean_t
232246726Sattiliovm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
233226876Sjeff{
234226876Sjeff
235250520Salc	if (rnode->rn_clev < VM_RADIX_LIMIT) {
236250520Salc		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
237249211Salc		return (idx != rnode->rn_owner);
238246726Sattilio	}
239246726Sattilio	return (FALSE);
240226876Sjeff}
241226876Sjeff
242226876Sjeff/*
243247773Salc * Internal helper for vm_radix_reclaim_allnodes().
244247769Salc * This function is recursive.
245246726Sattilio */
246226980Sattiliostatic void
247246726Sattiliovm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
248226980Sattilio{
249226980Sattilio	int slot;
250226980Sattilio
251248684Salc	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
252248684Salc	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
253248684Salc	for (slot = 0; rnode->rn_count != 0; slot++) {
254226980Sattilio		if (rnode->rn_child[slot] == NULL)
255226980Sattilio			continue;
256248728Salc		if (!vm_radix_isleaf(rnode->rn_child[slot]))
257246726Sattilio			vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]);
258248431Salc		rnode->rn_child[slot] = NULL;
259226980Sattilio		rnode->rn_count--;
260226980Sattilio	}
261226980Sattilio	vm_radix_node_put(rnode);
262226980Sattilio}
263226980Sattilio
264246836Sattilio#ifdef INVARIANTS
265226876Sjeff/*
266246836Sattilio * Radix node zone destructor.
267246836Sattilio */
268246836Sattiliostatic void
269246836Sattiliovm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
270246836Sattilio{
271246836Sattilio	struct vm_radix_node *rnode;
272248431Salc	int slot;
273246836Sattilio
274246836Sattilio	rnode = mem;
275246836Sattilio	KASSERT(rnode->rn_count == 0,
276248431Salc	    ("vm_radix_node_put: rnode %p has %d children", rnode,
277246836Sattilio	    rnode->rn_count));
278248431Salc	for (slot = 0; slot < VM_RADIX_COUNT; slot++)
279248431Salc		KASSERT(rnode->rn_child[slot] == NULL,
280248431Salc		    ("vm_radix_node_put: rnode %p has a child", rnode));
281246836Sattilio}
282246836Sattilio#endif
283246836Sattilio
284254141Sattilio#ifndef UMA_MD_SMALL_ALLOC
285246836Sattilio/*
286254141Sattilio * Reserve the KVA necessary to satisfy the node allocation.
287254141Sattilio * This is mandatory in architectures not supporting direct
288254141Sattilio * mapping as they will need otherwise to carve into the kernel maps for
289254141Sattilio * every node allocation, resulting into deadlocks for consumers already
290254141Sattilio * working with kernel maps.
291248431Salc */
292246794Sattiliostatic void
293254141Sattiliovm_radix_reserve_kva(void *arg __unused)
294246726Sattilio{
295246726Sattilio
296249605Salc	/*
297249605Salc	 * Calculate the number of reserved nodes, discounting the pages that
298249605Salc	 * are needed to store them.
299249605Salc	 */
300254141Sattilio	if (!uma_zone_reserve_kva(vm_radix_node_zone,
301263620Sbdrewery	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
302254141Sattilio	    sizeof(struct vm_radix_node))))
303254141Sattilio		panic("%s: unable to reserve KVA", __func__);
304247742Sattilio}
305267992ShselaskySYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_THIRD,
306254141Sattilio    vm_radix_reserve_kva, NULL);
307254141Sattilio#endif
308247742Sattilio
309247742Sattilio/*
310247742Sattilio * Initialize the UMA slab zone.
311247742Sattilio */
312247742Sattiliovoid
313321513Skibvm_radix_zinit(void)
314247742Sattilio{
315247742Sattilio
316246726Sattilio	vm_radix_node_zone = uma_zcreate("RADIX NODE",
317246726Sattilio	    sizeof(struct vm_radix_node), NULL,
318246726Sattilio#ifdef INVARIANTS
319246726Sattilio	    vm_radix_node_zone_dtor,
320246726Sattilio#else
321246726Sattilio	    NULL,
322246726Sattilio#endif
323254141Sattilio	    NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM);
324246726Sattilio}
325246726Sattilio
326246726Sattilio/*
327247771Salc * Inserts the key-value pair into the trie.
328226645Sattilio * Panics if the key already exists.
329226645Sattilio */
330254141Sattilioint
331248428Salcvm_radix_insert(struct vm_radix *rtree, vm_page_t page)
332226645Sattilio{
333248428Salc	vm_pindex_t index, newind;
334249502Salc	void **parentp;
335249502Salc	struct vm_radix_node *rnode, *tmp;
336246726Sattilio	vm_page_t m;
337236763Sattilio	int slot;
338246726Sattilio	uint16_t clev;
339226645Sattilio
340248428Salc	index = page->pindex;
341248222Sattilio
342226645Sattilio	/*
343246726Sattilio	 * The owner of record for root is not really important because it
344246726Sattilio	 * will never be used.
345226645Sattilio	 */
346246726Sattilio	rnode = vm_radix_getroot(rtree);
347246726Sattilio	if (rnode == NULL) {
348249502Salc		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
349254141Sattilio		return (0);
350246726Sattilio	}
351249502Salc	parentp = (void **)&rtree->rt_root;
352249502Salc	for (;;) {
353249502Salc		if (vm_radix_isleaf(rnode)) {
354249502Salc			m = vm_radix_topage(rnode);
355246726Sattilio			if (m->pindex == index)
356246726Sattilio				panic("%s: key %jx is already present",
357246726Sattilio				    __func__, (uintmax_t)index);
358246726Sattilio			clev = vm_radix_keydiff(m->pindex, index);
359246726Sattilio			tmp = vm_radix_node_get(vm_radix_trimkey(index,
360250520Salc			    clev + 1), 2, clev);
361318716Smarkj			if (tmp == NULL)
362254141Sattilio				return (ENOMEM);
363249502Salc			*parentp = tmp;
364246726Sattilio			vm_radix_addpage(tmp, index, clev, page);
365246726Sattilio			vm_radix_addpage(tmp, m->pindex, clev, m);
366254141Sattilio			return (0);
367249502Salc		} else if (vm_radix_keybarr(rnode, index))
368249502Salc			break;
369249502Salc		slot = vm_radix_slot(index, rnode->rn_clev);
370226645Sattilio		if (rnode->rn_child[slot] == NULL) {
371236760Sattilio			rnode->rn_count++;
372246726Sattilio			vm_radix_addpage(rnode, index, rnode->rn_clev, page);
373254141Sattilio			return (0);
374246726Sattilio		}
375249502Salc		parentp = &rnode->rn_child[slot];
376226645Sattilio		rnode = rnode->rn_child[slot];
377249502Salc	}
378226645Sattilio
379246726Sattilio	/*
380246726Sattilio	 * A new node is needed because the right insertion level is reached.
381246726Sattilio	 * Setup the new intermediate node and add the 2 children: the
382246726Sattilio	 * new object and the older edge.
383246726Sattilio	 */
384249182Salc	newind = rnode->rn_owner;
385249182Salc	clev = vm_radix_keydiff(newind, index);
386254141Sattilio	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
387318716Smarkj	if (tmp == NULL)
388254141Sattilio		return (ENOMEM);
389249502Salc	*parentp = tmp;
390249182Salc	vm_radix_addpage(tmp, index, clev, page);
391246726Sattilio	slot = vm_radix_slot(newind, clev);
392249182Salc	tmp->rn_child[slot] = rnode;
393254141Sattilio	return (0);
394226645Sattilio}
395226645Sattilio
396226645Sattilio/*
397254719Salc * Returns TRUE if the specified radix tree contains a single leaf and FALSE
398254719Salc * otherwise.
399254719Salc */
400254719Salcboolean_t
401254719Salcvm_radix_is_singleton(struct vm_radix *rtree)
402254719Salc{
403254719Salc	struct vm_radix_node *rnode;
404254719Salc
405254719Salc	rnode = vm_radix_getroot(rtree);
406254719Salc	if (rnode == NULL)
407254719Salc		return (FALSE);
408254719Salc	return (vm_radix_isleaf(rnode));
409254719Salc}
410254719Salc
411254719Salc/*
412247771Salc * Returns the value stored at the index.  If the index is not present,
413226645Sattilio * NULL is returned.
414226645Sattilio */
415246430Sattiliovm_page_t
416238245Sattiliovm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
417226645Sattilio{
418226645Sattilio	struct vm_radix_node *rnode;
419246726Sattilio	vm_page_t m;
420226645Sattilio	int slot;
421226645Sattilio
422246726Sattilio	rnode = vm_radix_getroot(rtree);
423246795Sattilio	while (rnode != NULL) {
424249038Salc		if (vm_radix_isleaf(rnode)) {
425249038Salc			m = vm_radix_topage(rnode);
426246726Sattilio			if (m->pindex == index)
427246726Sattilio				return (m);
428246726Sattilio			else
429249427Salc				break;
430249427Salc		} else if (vm_radix_keybarr(rnode, index))
431249427Salc			break;
432249427Salc		slot = vm_radix_slot(index, rnode->rn_clev);
433249427Salc		rnode = rnode->rn_child[slot];
434226645Sattilio	}
435246726Sattilio	return (NULL);
436226645Sattilio}
437226645Sattilio
438226645Sattilio/*
439247771Salc * Look up the nearest entry at a position bigger than or equal to index.
440226645Sattilio */
441246726Sattiliovm_page_t
442246726Sattiliovm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
443226645Sattilio{
444250259Salc	struct vm_radix_node *stack[VM_RADIX_LIMIT];
445246726Sattilio	vm_pindex_t inc;
446246726Sattilio	vm_page_t m;
447249038Salc	struct vm_radix_node *child, *rnode;
448246726Sattilio#ifdef INVARIANTS
449246726Sattilio	int loops = 0;
450246726Sattilio#endif
451250259Salc	int slot, tos;
452226645Sattilio
453249427Salc	rnode = vm_radix_getroot(rtree);
454249427Salc	if (rnode == NULL)
455249427Salc		return (NULL);
456249427Salc	else if (vm_radix_isleaf(rnode)) {
457249427Salc		m = vm_radix_topage(rnode);
458249427Salc		if (m->pindex >= index)
459249427Salc			return (m);
460249427Salc		else
461249427Salc			return (NULL);
462249427Salc	}
463250259Salc	tos = 0;
464249427Salc	for (;;) {
465246726Sattilio		/*
466249986Salc		 * If the keys differ before the current bisection node,
467249986Salc		 * then the search key might rollback to the earliest
468249986Salc		 * available bisection node or to the smallest key
469249986Salc		 * in the current node (if the owner is bigger than the
470246726Sattilio		 * search key).
471246726Sattilio		 */
472247770Salc		if (vm_radix_keybarr(rnode, index)) {
473246726Sattilio			if (index > rnode->rn_owner) {
474250259Salcascend:
475250259Salc				KASSERT(++loops < 1000,
476250259Salc				    ("vm_radix_lookup_ge: too many loops"));
477250259Salc
478250259Salc				/*
479250259Salc				 * Pop nodes from the stack until either the
480250259Salc				 * stack is empty or a node that could have a
481250259Salc				 * matching descendant is found.
482250259Salc				 */
483250259Salc				do {
484250259Salc					if (tos == 0)
485250259Salc						return (NULL);
486250259Salc					rnode = stack[--tos];
487250259Salc				} while (vm_radix_slot(index,
488250259Salc				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));
489250259Salc
490250259Salc				/*
491250259Salc				 * The following computation cannot overflow
492250259Salc				 * because index's slot at the current level
493250259Salc				 * is less than VM_RADIX_COUNT - 1.
494250259Salc				 */
495250259Salc				index = vm_radix_trimkey(index,
496250259Salc				    rnode->rn_clev);
497250259Salc				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
498246726Sattilio			} else
499249986Salc				index = rnode->rn_owner;
500250259Salc			KASSERT(!vm_radix_keybarr(rnode, index),
501250259Salc			    ("vm_radix_lookup_ge: keybarr failed"));
502246726Sattilio		}
503246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
504249038Salc		child = rnode->rn_child[slot];
505249038Salc		if (vm_radix_isleaf(child)) {
506249038Salc			m = vm_radix_topage(child);
507249038Salc			if (m->pindex >= index)
508249038Salc				return (m);
509249038Salc		} else if (child != NULL)
510249038Salc			goto descend;
511246726Sattilio
512226645Sattilio		/*
513246726Sattilio		 * Look for an available edge or page within the current
514246726Sattilio		 * bisection node.
515246726Sattilio		 */
516246726Sattilio                if (slot < (VM_RADIX_COUNT - 1)) {
517246726Sattilio			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
518246726Sattilio			index = vm_radix_trimkey(index, rnode->rn_clev);
519249038Salc			do {
520249038Salc				index += inc;
521249038Salc				slot++;
522249038Salc				child = rnode->rn_child[slot];
523249038Salc				if (vm_radix_isleaf(child)) {
524249038Salc					m = vm_radix_topage(child);
525249038Salc					if (m->pindex >= index)
526249038Salc						return (m);
527249038Salc				} else if (child != NULL)
528249038Salc					goto descend;
529249038Salc			} while (slot < (VM_RADIX_COUNT - 1));
530246726Sattilio		}
531249038Salc		KASSERT(child == NULL || vm_radix_isleaf(child),
532249038Salc		    ("vm_radix_lookup_ge: child is radix node"));
533230750Sattilio
534246726Sattilio		/*
535250259Salc		 * If a page or edge bigger than the search slot is not found
536250259Salc		 * in the current node, ascend to the next higher-level node.
537246726Sattilio		 */
538250259Salc		goto ascend;
539249038Salcdescend:
540250520Salc		KASSERT(rnode->rn_clev > 0,
541250259Salc		    ("vm_radix_lookup_ge: pushing leaf's parent"));
542250259Salc		KASSERT(tos < VM_RADIX_LIMIT,
543250259Salc		    ("vm_radix_lookup_ge: stack overflow"));
544250259Salc		stack[tos++] = rnode;
545249038Salc		rnode = child;
546226645Sattilio	}
547226645Sattilio}
548226645Sattilio
549226645Sattilio/*
550247771Salc * Look up the nearest entry at a position less than or equal to index.
551226645Sattilio */
552246430Sattiliovm_page_t
553238245Sattiliovm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
554226645Sattilio{
555250259Salc	struct vm_radix_node *stack[VM_RADIX_LIMIT];
556246726Sattilio	vm_pindex_t inc;
557246726Sattilio	vm_page_t m;
558249038Salc	struct vm_radix_node *child, *rnode;
559246726Sattilio#ifdef INVARIANTS
560246726Sattilio	int loops = 0;
561246726Sattilio#endif
562250259Salc	int slot, tos;
563226645Sattilio
564249427Salc	rnode = vm_radix_getroot(rtree);
565249427Salc	if (rnode == NULL)
566249427Salc		return (NULL);
567249427Salc	else if (vm_radix_isleaf(rnode)) {
568249427Salc		m = vm_radix_topage(rnode);
569249427Salc		if (m->pindex <= index)
570249427Salc			return (m);
571249427Salc		else
572249427Salc			return (NULL);
573249427Salc	}
574250259Salc	tos = 0;
575249427Salc	for (;;) {
576226646Sjeff		/*
577249986Salc		 * If the keys differ before the current bisection node,
578249986Salc		 * then the search key might rollback to the earliest
579249986Salc		 * available bisection node or to the largest key
580249986Salc		 * in the current node (if the owner is smaller than the
581246726Sattilio		 * search key).
582226646Sjeff		 */
583247770Salc		if (vm_radix_keybarr(rnode, index)) {
584246726Sattilio			if (index > rnode->rn_owner) {
585249986Salc				index = rnode->rn_owner + VM_RADIX_COUNT *
586250259Salc				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
587249986Salc			} else {
588250259Salcascend:
589250259Salc				KASSERT(++loops < 1000,
590250259Salc				    ("vm_radix_lookup_le: too many loops"));
591250259Salc
592250259Salc				/*
593250259Salc				 * Pop nodes from the stack until either the
594250259Salc				 * stack is empty or a node that could have a
595250259Salc				 * matching descendant is found.
596250259Salc				 */
597250259Salc				do {
598250259Salc					if (tos == 0)
599250259Salc						return (NULL);
600250259Salc					rnode = stack[--tos];
601250259Salc				} while (vm_radix_slot(index,
602250259Salc				    rnode->rn_clev) == 0);
603250259Salc
604250259Salc				/*
605250259Salc				 * The following computation cannot overflow
606250259Salc				 * because index's slot at the current level
607250259Salc				 * is greater than 0.
608250259Salc				 */
609250259Salc				index = vm_radix_trimkey(index,
610250259Salc				    rnode->rn_clev);
611249986Salc			}
612250259Salc			index--;
613250259Salc			KASSERT(!vm_radix_keybarr(rnode, index),
614250259Salc			    ("vm_radix_lookup_le: keybarr failed"));
615246726Sattilio		}
616246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
617249038Salc		child = rnode->rn_child[slot];
618249038Salc		if (vm_radix_isleaf(child)) {
619249038Salc			m = vm_radix_topage(child);
620249038Salc			if (m->pindex <= index)
621249038Salc				return (m);
622249038Salc		} else if (child != NULL)
623249038Salc			goto descend;
624246726Sattilio
625246726Sattilio		/*
626246726Sattilio		 * Look for an available edge or page within the current
627246726Sattilio		 * bisection node.
628246726Sattilio		 */
629246726Sattilio		if (slot > 0) {
630246726Sattilio			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
631246726Sattilio			index |= inc - 1;
632249038Salc			do {
633249038Salc				index -= inc;
634249038Salc				slot--;
635249038Salc				child = rnode->rn_child[slot];
636249038Salc				if (vm_radix_isleaf(child)) {
637249038Salc					m = vm_radix_topage(child);
638249038Salc					if (m->pindex <= index)
639249038Salc						return (m);
640249038Salc				} else if (child != NULL)
641249038Salc					goto descend;
642249038Salc			} while (slot > 0);
643226646Sjeff		}
644249038Salc		KASSERT(child == NULL || vm_radix_isleaf(child),
645249038Salc		    ("vm_radix_lookup_le: child is radix node"));
646246726Sattilio
647246726Sattilio		/*
648250259Salc		 * If a page or edge smaller than the search slot is not found
649250259Salc		 * in the current node, ascend to the next higher-level node.
650246726Sattilio		 */
651250259Salc		goto ascend;
652249038Salcdescend:
653250520Salc		KASSERT(rnode->rn_clev > 0,
654250259Salc		    ("vm_radix_lookup_le: pushing leaf's parent"));
655250259Salc		KASSERT(tos < VM_RADIX_LIMIT,
656250259Salc		    ("vm_radix_lookup_le: stack overflow"));
657250259Salc		stack[tos++] = rnode;
658249038Salc		rnode = child;
659226646Sjeff	}
660226645Sattilio}
661226645Sattilio
662226645Sattilio/*
663318716Smarkj * Remove the specified index from the trie, and return the value stored at
664318716Smarkj * that index.  If the index is not present, return NULL.
665226645Sattilio */
666318716Smarkjvm_page_t
667238245Sattiliovm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
668226645Sattilio{
669246726Sattilio	struct vm_radix_node *rnode, *parent;
670246726Sattilio	vm_page_t m;
671246726Sattilio	int i, slot;
672226645Sattilio
673249502Salc	rnode = vm_radix_getroot(rtree);
674249502Salc	if (vm_radix_isleaf(rnode)) {
675249502Salc		m = vm_radix_topage(rnode);
676249502Salc		if (m->pindex != index)
677318716Smarkj			return (NULL);
678249502Salc		vm_radix_setroot(rtree, NULL);
679318716Smarkj		return (m);
680249502Salc	}
681246726Sattilio	parent = NULL;
682236760Sattilio	for (;;) {
683246726Sattilio		if (rnode == NULL)
684318716Smarkj			return (NULL);
685246726Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
686249038Salc		if (vm_radix_isleaf(rnode->rn_child[slot])) {
687249038Salc			m = vm_radix_topage(rnode->rn_child[slot]);
688249038Salc			if (m->pindex != index)
689318716Smarkj				return (NULL);
690246726Sattilio			rnode->rn_child[slot] = NULL;
691246726Sattilio			rnode->rn_count--;
692246726Sattilio			if (rnode->rn_count > 1)
693318716Smarkj				return (m);
694246726Sattilio			for (i = 0; i < VM_RADIX_COUNT; i++)
695246726Sattilio				if (rnode->rn_child[i] != NULL)
696246726Sattilio					break;
697246726Sattilio			KASSERT(i != VM_RADIX_COUNT,
698246726Sattilio			    ("%s: invalid node configuration", __func__));
699249502Salc			if (parent == NULL)
700249502Salc				vm_radix_setroot(rtree, rnode->rn_child[i]);
701249502Salc			else {
702249502Salc				slot = vm_radix_slot(index, parent->rn_clev);
703249502Salc				KASSERT(parent->rn_child[slot] == rnode,
704249502Salc				    ("%s: invalid child value", __func__));
705249502Salc				parent->rn_child[slot] = rnode->rn_child[i];
706249502Salc			}
707246726Sattilio			rnode->rn_count--;
708246726Sattilio			rnode->rn_child[i] = NULL;
709246726Sattilio			vm_radix_node_put(rnode);
710318716Smarkj			return (m);
711236760Sattilio		}
712246726Sattilio		parent = rnode;
713246726Sattilio		rnode = rnode->rn_child[slot];
714236760Sattilio	}
715226645Sattilio}
716226645Sattilio
717226645Sattilio/*
718226980Sattilio * Remove and free all the nodes from the radix tree.
719247771Salc * This function is recursive but there is a tight control on it as the
720226980Sattilio * maximum depth of the tree is fixed.
721226980Sattilio */
722226980Sattiliovoid
723226980Sattiliovm_radix_reclaim_allnodes(struct vm_radix *rtree)
724226980Sattilio{
725226980Sattilio	struct vm_radix_node *root;
726226980Sattilio
727246726Sattilio	root = vm_radix_getroot(rtree);
728246726Sattilio	if (root == NULL)
729226980Sattilio		return;
730248684Salc	vm_radix_setroot(rtree, NULL);
731249502Salc	if (!vm_radix_isleaf(root))
732249502Salc		vm_radix_reclaim_allnodes_int(root);
733226980Sattilio}
734246726Sattilio
735254141Sattilio/*
736259107Salc * Replace an existing page in the trie with another one.
737259107Salc * Panics if there is not an old page in the trie at the new page's index.
738254141Sattilio */
739254141Sattiliovm_page_t
740259107Salcvm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
741254141Sattilio{
742254141Sattilio	struct vm_radix_node *rnode;
743254141Sattilio	vm_page_t m;
744259107Salc	vm_pindex_t index;
745254141Sattilio	int slot;
746254141Sattilio
747259107Salc	index = newpage->pindex;
748254141Sattilio	rnode = vm_radix_getroot(rtree);
749254141Sattilio	if (rnode == NULL)
750254141Sattilio		panic("%s: replacing page on an empty trie", __func__);
751254141Sattilio	if (vm_radix_isleaf(rnode)) {
752254141Sattilio		m = vm_radix_topage(rnode);
753254141Sattilio		if (m->pindex != index)
754254141Sattilio			panic("%s: original replacing root key not found",
755254141Sattilio			    __func__);
756254141Sattilio		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
757254141Sattilio		return (m);
758254141Sattilio	}
759254141Sattilio	for (;;) {
760254141Sattilio		slot = vm_radix_slot(index, rnode->rn_clev);
761254141Sattilio		if (vm_radix_isleaf(rnode->rn_child[slot])) {
762254141Sattilio			m = vm_radix_topage(rnode->rn_child[slot]);
763254141Sattilio			if (m->pindex == index) {
764254141Sattilio				rnode->rn_child[slot] =
765254141Sattilio				    (void *)((uintptr_t)newpage |
766254141Sattilio				    VM_RADIX_ISLEAF);
767254141Sattilio				return (m);
768254141Sattilio			} else
769254141Sattilio				break;
770254141Sattilio		} else if (rnode->rn_child[slot] == NULL ||
771254141Sattilio		    vm_radix_keybarr(rnode->rn_child[slot], index))
772254141Sattilio			break;
773254141Sattilio		rnode = rnode->rn_child[slot];
774254141Sattilio	}
775254141Sattilio	panic("%s: original replacing page not found", __func__);
776254141Sattilio}
777254141Sattilio
778327785Smarkjvoid
779327785Smarkjvm_radix_wait(void)
780327785Smarkj{
781327785Smarkj	uma_zwait(vm_radix_node_zone);
782327785Smarkj}
783327785Smarkj
784246726Sattilio#ifdef DDB
785246726Sattilio/*
786246837Sattilio * Show details about the given radix node.
787246726Sattilio */
788246726SattilioDB_SHOW_COMMAND(radixnode, db_show_radixnode)
789246726Sattilio{
790246726Sattilio	struct vm_radix_node *rnode;
791246726Sattilio	int i;
792246726Sattilio
793246726Sattilio        if (!have_addr)
794246726Sattilio                return;
795246726Sattilio	rnode = (struct vm_radix_node *)addr;
796246726Sattilio	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
797246726Sattilio	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
798246726Sattilio	    rnode->rn_clev);
799246726Sattilio	for (i = 0; i < VM_RADIX_COUNT; i++)
800246726Sattilio		if (rnode->rn_child[i] != NULL)
801246726Sattilio			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
802246726Sattilio			    i, (void *)rnode->rn_child[i],
803249038Salc			    vm_radix_isleaf(rnode->rn_child[i]) ?
804249038Salc			    vm_radix_topage(rnode->rn_child[i]) : NULL,
805246726Sattilio			    rnode->rn_clev);
806246726Sattilio}
807246726Sattilio#endif /* DDB */
808