vm_radix.c (246835) → vm_radix.c (246836)
1/*
2 * Copyright (c) 2013 EMC Corp.
3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 */
29
30/*
31 * Path-compressed radix trie implementation.
 32 * The following code is not generalized into a general-purpose library
 33 * because there are far too many embedded parameters that should really
 34 * be decided by the library consumers. At the same time, consumers
 35 * of this code must achieve the highest possible performance.
 36 *
 37 * The implementation takes into account the following rationale:
 38 * - The size of the nodes should be as small as possible.
 39 * - There is no bias toward lookup operations over inserts or removes,
 40 * and vice-versa.
 41 * - On average there are not many complete levels, so level
 42 * compression may just complicate things.
43 */
44
45#include <sys/cdefs.h>
46
47#include "opt_ddb.h"
48#include "opt_vm.h"
49
50#include <sys/param.h>
51#include <sys/conf.h>
52#include <sys/systm.h>
53#include <sys/kernel.h>
54#include <sys/malloc.h>
55#include <sys/queue.h>
56#include <sys/lock.h>
57#include <sys/mutex.h>
58
59#include <vm/uma.h>
60#include <vm/vm.h>
61#include <vm/vm_param.h>
62#include <vm/vm_extern.h>
63#include <vm/vm_kern.h>
64#include <vm/vm_page.h>
65#include <vm/vm_radix.h>
66
67#ifdef DDB
68#include <ddb/ddb.h>
69#endif
70
71#ifndef VM_RADIX_BOOT_CACHE
72#define VM_RADIX_BOOT_CACHE 1500
73#endif
74
75/*
 76 * These widths should permit the node children to be kept within a single
 77 * cache line, or at least not to span many of them.
 78 * In particular, sparse tries should be compressed properly, which
 79 * makes a few extra levels not a big deal.
80 */
81#ifdef __LP64__
82#define VM_RADIX_WIDTH 4
83#else
84#define VM_RADIX_WIDTH 3
85#endif
86
87#define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH)
88#define VM_RADIX_MASK (VM_RADIX_COUNT - 1)
89#define VM_RADIX_LIMIT \
90 (howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH) - 1)
91
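/*
 * Worked example of the constants above (assuming a 64-bit vm_pindex_t):
 * on LP64 the width is 4, so VM_RADIX_COUNT == 16, VM_RADIX_MASK == 0xf and
 * VM_RADIX_LIMIT == 64 / 4 - 1 == 15; a complete path from the root down to
 * the deepest level therefore visits levels 0 through 15 and every node has
 * 16 child slots.  On 32-bit platforms the width is 3, giving 8 child slots
 * per node and VM_RADIX_LIMIT == howmany(64, 3) - 1 == 21.
 */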
92/* Flag bits stored in node pointers. */
93#define VM_RADIX_ISLEAF 0x1
94#define VM_RADIX_FLAGS 0x1
95#define VM_RADIX_PAD VM_RADIX_FLAGS
96
97/* Returns one unit associated with specified level. */
98#define VM_RADIX_UNITLEVEL(lev) \
99 ((vm_pindex_t)1 << ((VM_RADIX_LIMIT - (lev)) * VM_RADIX_WIDTH))
100
101struct vm_radix_node {
102 void *rn_child[VM_RADIX_COUNT]; /* Child nodes. */
103 vm_pindex_t rn_owner; /* Owner of record. */
104 uint16_t rn_count; /* Valid children. */
105 uint16_t rn_clev; /* Current level. */
106};
107
108static uma_zone_t vm_radix_node_zone;
109
110/*
111 * Boot-time cache of struct vm_radix_node objects.
 112 * This cache is used to satisfy node allocations for early page insertions,
 113 * before the UMA zone is actually set up and pre-allocated (i.e., in pmap_init()).
114 */
115static u_int boot_cache_cnt;
116static struct vm_radix_node boot_cache[VM_RADIX_BOOT_CACHE];
117
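/*
 * A note on the boot cache lifecycle, as implemented below: nodes are carved
 * linearly out of boot_cache[] by vm_radix_carve_bootcache() while
 * boot_cache_cnt is still within bounds; vm_radix_init() later pushes
 * boot_cache_cnt past VM_RADIX_BOOT_CACHE so that vm_radix_node_get() starts
 * using the UMA zone instead.  Boot-cache nodes are never handed to
 * uma_zfree(); vm_radix_node_put() simply ignores them.
 */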
118static struct vm_radix_node *
119vm_radix_carve_bootcache(void)
120{
121 struct vm_radix_node *rnode;
122
123 if (boot_cache_cnt == VM_RADIX_BOOT_CACHE)
124 panic("%s: Increase VM_RADIX_BOOT_CACHE (%u)", __func__,
125 VM_RADIX_BOOT_CACHE);
126 rnode = &boot_cache[boot_cache_cnt];
127 boot_cache_cnt++;
128 return (rnode);
129}
130
131/*
 132 * Allocate a radix node.  Pre-allocation ensures that the request will
 133 * always be successfully satisfied.
134 */
135static __inline struct vm_radix_node *
136vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
137{
138 struct vm_radix_node *rnode;
139
140 if (__predict_false(boot_cache_cnt <= VM_RADIX_BOOT_CACHE))
141 rnode = vm_radix_carve_bootcache();
142 else {
143 rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
144
145 /*
 146 * The required number of nodes should already be correctly
 147 * pre-allocated in vm_radix_init().  However, UMA can reserve
 148 * a few nodes in per-CPU buckets, which will not be
 149 * accessible from the current CPU.  The allocation could then
 150 * return NULL when the pre-allocation pool is close to being
 151 * exhausted.  In practice this should never be a
 152 * problem because a new node is not always required for an
 153 * insert, so the pre-allocation pool should already have
 154 * some slack that indirectly deals with this situation.
155 */
156 if (rnode == NULL)
157 panic("%s: uma_zalloc() returned NULL for a new node",
158 __func__);
159 }
160 rnode->rn_owner = owner;
161 rnode->rn_count = count;
162 rnode->rn_clev = clevel;
163 return (rnode);
164}
165
166/*
167 * Free radix node.
168 */
169static __inline void
170vm_radix_node_put(struct vm_radix_node *rnode)
171{
172
 173 	if (__predict_false(rnode >= boot_cache &&
 174 	    rnode < &boot_cache[VM_RADIX_BOOT_CACHE]))
175 return;
176 uma_zfree(vm_radix_node_zone, rnode);
177}
178
179/*
180 * Return the position in the array for a given level.
181 */
182static __inline int
183vm_radix_slot(vm_pindex_t index, uint16_t level)
184{
185
186 return ((index >> ((VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH)) &
187 VM_RADIX_MASK);
188}
189
190/* Trims the key after the specified level. */
191static __inline vm_pindex_t
192vm_radix_trimkey(vm_pindex_t index, uint16_t level)
193{
194 vm_pindex_t ret;
195
196 ret = index;
197 if (level < VM_RADIX_LIMIT) {
198 ret >>= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH;
199 ret <<= (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH;
200 }
201 return (ret);
202}
203
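/*
 * Example, assuming the LP64 constants above (VM_RADIX_WIDTH == 4,
 * VM_RADIX_LIMIT == 15): for index 0x1234, vm_radix_slot(0x1234, 14)
 * shifts right by (15 - 14) * 4 == 4 bits and masks, yielding slot 3,
 * while vm_radix_trimkey(0x1234, 14) clears the bits that select slots
 * at deeper levels (here the last nibble) and returns 0x1230.
 */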
204/*
205 * Get the root node for a radix tree.
206 */
207static __inline struct vm_radix_node *
208vm_radix_getroot(struct vm_radix *rtree)
209{
210
211 return ((struct vm_radix_node *)(rtree->rt_root & ~VM_RADIX_FLAGS));
212}
213
214/*
215 * Set the root node for a radix tree.
216 */
217static __inline void
218vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
219{
220
221 rtree->rt_root = (uintptr_t)rnode;
222}
223
224/*
225 * Returns the associated page extracted from rnode if available,
226 * NULL otherwise.
227 */
228static __inline vm_page_t
229vm_radix_node_page(struct vm_radix_node *rnode)
230{
231
232 return ((((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0) ?
233 (vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS) : NULL);
234}
235
236/*
 237 * Adds the page as a child of the provided node.
238 */
239static __inline void
240vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
241 vm_page_t page)
242{
243 int slot;
244
245 slot = vm_radix_slot(index, clev);
246 rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
247}
248
249/*
 250 * Returns the level at which two keys differ.
 251 * It cannot accept two equal keys.
252 */
253static __inline uint16_t
254vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
255{
256 uint16_t clev;
257
258 KASSERT(index1 != index2, ("%s: passing the same key value %jx",
259 __func__, (uintmax_t)index1));
260
261 index1 ^= index2;
262 for (clev = 0; clev <= VM_RADIX_LIMIT ; clev++)
263 if (vm_radix_slot(index1, clev))
264 return (clev);
265 panic("%s: it might have not reached this point", __func__);
266 return (0);
267}
268
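/*
 * For instance, with the LP64 constants above, vm_radix_keydiff(0x1, 0x2)
 * returns 15 because the keys only differ in the last nibble, whereas
 * vm_radix_keydiff(0x10, 0x20) returns 14: the XOR of the keys is 0x30 and
 * the first non-zero slot, scanning from root level 0 downwards, is at
 * level 14.
 */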
269/*
270 * Returns TRUE if it can be determined that key does not belong to the
271 * specified rnode. FALSE otherwise.
272 */
273static __inline boolean_t
274vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
275{
276
277 if (rnode->rn_clev > 0) {
278 idx = vm_radix_trimkey(idx, rnode->rn_clev - 1);
279 idx -= rnode->rn_owner;
280 if (idx != 0)
281 return (TRUE);
282 }
283 return (FALSE);
284}
285
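/*
 * In other words, rn_owner holds the key prefix (the key trimmed at level
 * rn_clev - 1) shared by every key stored underneath this node, and
 * vm_radix_keybarr() reports TRUE as soon as idx falls outside of that
 * subtree, so callers know the descent must stop or be redirected.
 */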
286/*
 287 * Adjusts the idx key to the next available upper level, based on a valid
 288 * initial level and a map of available levels.
 289 * Returns a value greater than 0 to signal that there are no valid levels
 290 * available.
291 */
292static __inline int
293vm_radix_addlev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
294{
295 vm_pindex_t wrapidx;
296
297 for (; levels[ilev] == FALSE ||
298 vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1); ilev--)
299 if (ilev == 0)
300 break;
301 KASSERT(ilev > 0 || levels[0] == TRUE,
302 ("%s: levels back-scanning problem", __func__));
303 if (ilev == 0 && vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1))
304 return (1);
305 wrapidx = *idx;
306 *idx = vm_radix_trimkey(*idx, ilev);
307 *idx += VM_RADIX_UNITLEVEL(ilev);
308 if (*idx < wrapidx)
309 return (1);
310 return (0);
311}
312
313/*
 314 * Adjusts the idx key to the next available lower level, based on a valid
 315 * initial level and a map of available levels.
 316 * Returns a value greater than 0 to signal that there are no valid levels
 317 * available.
318 */
319static __inline int
320vm_radix_declev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
321{
322 vm_pindex_t wrapidx;
323
324 for (; levels[ilev] == FALSE ||
325 vm_radix_slot(*idx, ilev) == 0; ilev--)
326 if (ilev == 0)
327 break;
328 KASSERT(ilev > 0 || levels[0] == TRUE,
329 ("%s: levels back-scanning problem", __func__));
330 if (ilev == 0 && vm_radix_slot(*idx, ilev) == 0)
331 return (1);
332 wrapidx = *idx;
333 *idx = vm_radix_trimkey(*idx, ilev);
334 *idx |= VM_RADIX_UNITLEVEL(ilev) - 1;
335 *idx -= VM_RADIX_UNITLEVEL(ilev);
336 if (*idx < wrapidx)
337 return (1);
338 return (0);
339}
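/*
 * Taken together, vm_radix_addlev() and vm_radix_declev() are the stepping
 * helpers used by the ranged lookups below: they advance (respectively
 * retreat) idx into the next (previous) subtree at the nearest usable level
 * recorded in the levels map, and a non-zero return signals that no such
 * level exists or that the key space wrapped around, so the walk must stop.
 */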
340
341/*
 342 * Internal helper for the vm_radix_reclaim_allnodes() primitive.
 343 * This function is recursive.
344 */
345static void
346vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
347{
348 int slot;
349
350 for (slot = 0; slot < VM_RADIX_COUNT && rnode->rn_count != 0; slot++) {
351 if (rnode->rn_child[slot] == NULL)
352 continue;
353 if (vm_radix_node_page(rnode->rn_child[slot]) == NULL)
354 vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]);
355 rnode->rn_count--;
356 }
357 vm_radix_node_put(rnode);
358}
359
360#ifdef INVARIANTS
361/*
362 * Radix node zone destructor.
363 */
364static void
365vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
366{
367 struct vm_radix_node *rnode;
368
369 rnode = mem;
370 KASSERT(rnode->rn_count == 0,
371 ("vm_radix_node_put: Freeing node %p with %d children\n", mem,
372 rnode->rn_count));
373}
374#endif
375
376/*
377 * Pre-allocate intermediate nodes from the UMA slab zone.
378 */
379static void
380vm_radix_init(void *arg __unused)
381{
382 int nitems;
383
384 vm_radix_node_zone = uma_zcreate("RADIX NODE",
385 sizeof(struct vm_radix_node), NULL,
386#ifdef INVARIANTS
387 vm_radix_node_zone_dtor,
388#else
389 NULL,
390#endif
391 NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_NOFREE);
392 nitems = uma_zone_set_max(vm_radix_node_zone, vm_page_array_size);
393 uma_prealloc(vm_radix_node_zone, nitems);
394 boot_cache_cnt = VM_RADIX_BOOT_CACHE + 1;
395}
396SYSINIT(vm_radix_init, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_init, NULL);
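/*
 * Note that the zone is capped at roughly one node per physical page
 * (vm_page_array_size) and pre-allocated up front, which is what allows
 * vm_radix_node_get() to treat a NULL return from uma_zalloc() as a
 * panic-worthy condition.  Bumping boot_cache_cnt past VM_RADIX_BOOT_CACHE
 * here is what retires the boot-time cache.
 */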
397
398/*
 399 * Inserts the key-value pair into the trie.
400 * Panics if the key already exists.
401 */
402void
403vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, vm_page_t page)
404{
405 vm_pindex_t newind;
406 struct vm_radix_node *rnode, *tmp, *tmp2;
407 vm_page_t m;
408 int slot;
409 uint16_t clev;
410
411 /*
412 * The owner of record for root is not really important because it
413 * will never be used.
414 */
415 rnode = vm_radix_getroot(rtree);
416 if (rnode == NULL) {
417 rnode = vm_radix_node_get(0, 1, 0);
418 vm_radix_setroot(rtree, rnode);
419 vm_radix_addpage(rnode, index, 0, page);
420 return;
421 }
422 while (rnode != NULL) {
423 if (vm_radix_keybarr(rnode, index) == TRUE)
424 break;
425 slot = vm_radix_slot(index, rnode->rn_clev);
426 m = vm_radix_node_page(rnode->rn_child[slot]);
427 if (m != NULL) {
428 if (m->pindex == index)
429 panic("%s: key %jx is already present",
430 __func__, (uintmax_t)index);
431 clev = vm_radix_keydiff(m->pindex, index);
432 tmp = vm_radix_node_get(vm_radix_trimkey(index,
433 clev - 1), 2, clev);
434 rnode->rn_child[slot] = tmp;
435 vm_radix_addpage(tmp, index, clev, page);
436 vm_radix_addpage(tmp, m->pindex, clev, m);
437 return;
438 }
439 if (rnode->rn_child[slot] == NULL) {
440 rnode->rn_count++;
441 vm_radix_addpage(rnode, index, rnode->rn_clev, page);
442 return;
443 }
444 rnode = rnode->rn_child[slot];
445 }
446 if (rnode == NULL)
447 panic("%s: path traversal ended unexpectedly", __func__);
448
449 /*
450 * Scan the trie from the top and find the parent to insert
451 * the new object.
452 */
453 newind = rnode->rn_owner;
454 clev = vm_radix_keydiff(newind, index);
455 slot = VM_RADIX_COUNT;
456 for (rnode = vm_radix_getroot(rtree); ; rnode = tmp) {
457 KASSERT(rnode != NULL, ("%s: edge cannot be NULL in the scan",
458 __func__));
459 KASSERT(clev >= rnode->rn_clev,
460 ("%s: unexpected trie depth: clev: %d, rnode->rn_clev: %d",
461 __func__, clev, rnode->rn_clev));
462 slot = vm_radix_slot(index, rnode->rn_clev);
463 tmp = rnode->rn_child[slot];
464 KASSERT(tmp != NULL && vm_radix_node_page(tmp) == NULL,
465 ("%s: unexpected lookup interruption", __func__));
466 if (tmp->rn_clev > clev)
467 break;
468 }
469 KASSERT(rnode != NULL && tmp != NULL && slot < VM_RADIX_COUNT,
470 ("%s: invalid scan parameters rnode: %p, tmp: %p, slot: %d",
471 __func__, (void *)rnode, (void *)tmp, slot));
472
473 /*
 474 * A new node is needed because the right insertion level has been reached.
 475 * Set up the new intermediate node and add the two children: the
 476 * new object and the older edge.
477 */
478 tmp2 = vm_radix_node_get(vm_radix_trimkey(page->pindex, clev - 1), 2,
479 clev);
480 rnode->rn_child[slot] = tmp2;
481 vm_radix_addpage(tmp2, index, clev, page);
482 slot = vm_radix_slot(newind, clev);
483 tmp2->rn_child[slot] = tmp;
484}
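/*
 * To summarize the flow above: an empty trie simply gets a level-0 root
 * with the page attached.  Otherwise the walk descends until it finds an
 * empty slot (the page is attached in place), a conflicting leaf (a new
 * node is created at the first level where the two keys differ and both
 * pages become its children), or a key barrier, in which case the trie is
 * rescanned from the root so that a new node can be spliced in above the
 * existing edge whose prefix no longer matches.
 */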
485
486/*
487 * Returns the value stored at the index. If the index is not present
488 * NULL is returned.
489 */
490vm_page_t
491vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
492{
493 struct vm_radix_node *rnode;
494 vm_page_t m;
495 int slot;
496
497 rnode = vm_radix_getroot(rtree);
498 while (rnode != NULL) {
499 if (vm_radix_keybarr(rnode, index) == TRUE)
500 return (NULL);
501 slot = vm_radix_slot(index, rnode->rn_clev);
502 rnode = rnode->rn_child[slot];
503 m = vm_radix_node_page(rnode);
504 if (m != NULL) {
505 if (m->pindex == index)
506 return (m);
507 else
508 return (NULL);
509 }
510 }
511 return (NULL);
512}
513
514/*
515 * Look up any entry at a position bigger than or equal to index.
516 */
517vm_page_t
518vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
519{
520 vm_pindex_t inc;
521 vm_page_t m;
522 struct vm_radix_node *rnode;
523 int slot;
524 uint16_t difflev;
525 boolean_t maplevels[VM_RADIX_LIMIT + 1];
526#ifdef INVARIANTS
527 int loops = 0;
528#endif
529
530restart:
531 KASSERT(++loops < 1000, ("%s: too many loops", __func__));
532 for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
533 maplevels[difflev] = FALSE;
534 rnode = vm_radix_getroot(rtree);
535 while (rnode != NULL) {
536 maplevels[rnode->rn_clev] = TRUE;
537
538 /*
 539 * If the keys differ before the current bisection node,
 540 * the search key may roll back to the earliest
 541 * available bisection node, or to the smallest value
 542 * in the current domain (if the owner is bigger than the
 543 * search key).
 544 * The search for a valid bisection node is helped by the
 545 * maplevels array, which should immediately provide a
 546 * lower useful level, skipping holes.
547 */
548 if (vm_radix_keybarr(rnode, index) == TRUE) {
549 difflev = vm_radix_keydiff(index, rnode->rn_owner);
550 if (index > rnode->rn_owner) {
551 if (vm_radix_addlev(&index, maplevels,
552 difflev) > 0)
553 break;
554 } else
555 index = vm_radix_trimkey(rnode->rn_owner,
556 difflev);
557 goto restart;
558 }
559 slot = vm_radix_slot(index, rnode->rn_clev);
560 m = vm_radix_node_page(rnode->rn_child[slot]);
561 if (m != NULL && m->pindex >= index)
562 return (m);
563 if (rnode->rn_child[slot] != NULL && m == NULL) {
564 rnode = rnode->rn_child[slot];
565 continue;
566 }
567
568 /*
569 * Look for an available edge or page within the current
570 * bisection node.
571 */
572 if (slot < (VM_RADIX_COUNT - 1)) {
573 inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
574 index = vm_radix_trimkey(index, rnode->rn_clev);
575 index += inc;
576 slot++;
577 for (;; index += inc, slot++) {
578 m = vm_radix_node_page(rnode->rn_child[slot]);
579 if (m != NULL && m->pindex >= index)
580 return (m);
581 if ((rnode->rn_child[slot] != NULL &&
582 m == NULL) || slot == (VM_RADIX_COUNT - 1))
583 break;
584 }
585 }
586
587 /*
 588 * If no valid page or edge bigger than the search slot was
 589 * found in the traversal, skip to the next higher-level key.
590 */
591 if (slot == (VM_RADIX_COUNT - 1) &&
592 (rnode->rn_child[slot] == NULL || m != NULL)) {
593 if (rnode->rn_clev == 0 || vm_radix_addlev(&index,
594 maplevels, rnode->rn_clev - 1) > 0)
595 break;
596 goto restart;
597 }
598 rnode = rnode->rn_child[slot];
599 }
600 return (NULL);
601}
602
603/*
604 * Look up any entry at a position less than or equal to index.
605 */
606vm_page_t
607vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
608{
609 vm_pindex_t inc;
610 vm_page_t m;
611 struct vm_radix_node *rnode;
612 int slot;
613 uint16_t difflev;
614 boolean_t maplevels[VM_RADIX_LIMIT + 1];
615#ifdef INVARIANTS
616 int loops = 0;
617#endif
618
619restart:
620 KASSERT(++loops < 1000, ("%s: too many loops", __func__));
621 for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
622 maplevels[difflev] = FALSE;
623 rnode = vm_radix_getroot(rtree);
624 while (rnode != NULL) {
625 maplevels[rnode->rn_clev] = TRUE;
626
627 /*
 628 * If the keys differ before the current bisection node,
 629 * the search key may roll back to the earliest
 630 * available bisection node, or to the highest value
 631 * in the current domain (if the owner is smaller than the
 632 * search key).
 633 * The search for a valid bisection node is helped by the
 634 * maplevels array, which should immediately provide a
 635 * lower useful level, skipping holes.
636 */
637 if (vm_radix_keybarr(rnode, index) == TRUE) {
638 difflev = vm_radix_keydiff(index, rnode->rn_owner);
639 if (index > rnode->rn_owner) {
640 index = vm_radix_trimkey(rnode->rn_owner,
641 difflev);
642 index |= VM_RADIX_UNITLEVEL(difflev) - 1;
643 } else if (vm_radix_declev(&index, maplevels,
644 difflev) > 0)
645 break;
646 goto restart;
647 }
648 slot = vm_radix_slot(index, rnode->rn_clev);
649 m = vm_radix_node_page(rnode->rn_child[slot]);
650 if (m != NULL && m->pindex <= index)
651 return (m);
652 if (rnode->rn_child[slot] != NULL && m == NULL) {
653 rnode = rnode->rn_child[slot];
654 continue;
655 }
656
657 /*
658 * Look for an available edge or page within the current
659 * bisection node.
660 */
661 if (slot > 0) {
662 inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
663 index = vm_radix_trimkey(index, rnode->rn_clev);
664 index |= inc - 1;
665 index -= inc;
666 slot--;
667 for (;; index -= inc, slot--) {
668 m = vm_radix_node_page(rnode->rn_child[slot]);
669 if (m != NULL && m->pindex <= index)
670 return (m);
671 if ((rnode->rn_child[slot] != NULL &&
672 m == NULL) || slot == 0)
673 break;
674 }
675 }
676
677 /*
 678 * If no valid page or edge smaller than the search slot was
 679 * found in the traversal, skip to the next higher-level key.
680 */
681 if (slot == 0 && (rnode->rn_child[slot] == NULL || m != NULL)) {
682 if (rnode->rn_clev == 0 || vm_radix_declev(&index,
683 maplevels, rnode->rn_clev - 1) > 0)
684 break;
685 goto restart;
686 }
687 rnode = rnode->rn_child[slot];
688 }
689 return (NULL);
690}
691
692/*
693 * Remove the specified index from the tree.
694 * Panics if the key is not present.
695 */
696void
697vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
698{
699 struct vm_radix_node *rnode, *parent;
700 vm_page_t m;
701 int i, slot;
702
703 parent = NULL;
704 rnode = vm_radix_getroot(rtree);
705 for (;;) {
706 if (rnode == NULL)
707 panic("vm_radix_remove: impossible to locate the key");
708 slot = vm_radix_slot(index, rnode->rn_clev);
709 m = vm_radix_node_page(rnode->rn_child[slot]);
710 if (m != NULL && m->pindex == index) {
711 rnode->rn_child[slot] = NULL;
712 rnode->rn_count--;
713 if (rnode->rn_count > 1)
714 break;
715 if (parent == NULL) {
716 if (rnode->rn_count == 0) {
717 vm_radix_node_put(rnode);
718 vm_radix_setroot(rtree, NULL);
719 }
720 break;
721 }
722 for (i = 0; i < VM_RADIX_COUNT; i++)
723 if (rnode->rn_child[i] != NULL)
724 break;
725 KASSERT(i != VM_RADIX_COUNT,
726 ("%s: invalid node configuration", __func__));
727 slot = vm_radix_slot(index, parent->rn_clev);
728 KASSERT(parent->rn_child[slot] == rnode,
729 ("%s: invalid child value", __func__));
730 parent->rn_child[slot] = rnode->rn_child[i];
731 rnode->rn_count--;
732 rnode->rn_child[i] = NULL;
733 vm_radix_node_put(rnode);
734 break;
735 }
736 if (m != NULL && m->pindex != index)
737 panic("%s: invalid key found", __func__);
738 parent = rnode;
739 rnode = rnode->rn_child[slot];
740 }
741}
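/*
 * Note that, besides detaching the page, the loop above also collapses a
 * node that is left with a single child: the surviving child is hooked
 * directly into the parent's slot and the now-redundant intermediate node
 * is freed, which preserves path compression after removals.
 */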
742
743/*
744 * Remove and free all the nodes from the radix tree.
 745 * This function is recursive, but there is tight control on it since the
 746 * maximum depth of the tree is fixed.
747 */
748void
749vm_radix_reclaim_allnodes(struct vm_radix *rtree)
750{
751 struct vm_radix_node *root;
752
753 root = vm_radix_getroot(rtree);
754 if (root == NULL)
755 return;
756 vm_radix_reclaim_allnodes_int(root);
757 vm_radix_setroot(rtree, NULL);
758}
759
760#ifdef DDB
761/*
 762 * Show details about the given radix node.
763 */
764DB_SHOW_COMMAND(radixnode, db_show_radixnode)
765{
766 struct vm_radix_node *rnode;
767 int i;
768
769 if (!have_addr)
770 return;
771 rnode = (struct vm_radix_node *)addr;
772 db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
773 (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
774 rnode->rn_clev);
775 for (i = 0; i < VM_RADIX_COUNT; i++)
776 if (rnode->rn_child[i] != NULL)
777 db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
778 i, (void *)rnode->rn_child[i],
779 (void *)vm_radix_node_page(rnode->rn_child[i]),
780 rnode->rn_clev);
781}
782#endif /* DDB */