vm_radix.c, r250520 -> r254141 (deleted lines prefixed "-", added lines prefixed "+")
/*
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions

--- 35 unchanged lines hidden ---

 * the number of insert and remove operations. This basically implies
 * that optimizations supposedly helping one operation but hurting the
 * other must be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 * level compression may just complicate things.
 */

#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/vm/vm_radix.c 250520 2013-05-11 18:01:41Z alc $");
+__FBSDID("$FreeBSD: head/sys/vm/vm_radix.c 254141 2013-08-09 11:28:55Z attilio $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vmmeter.h>

--- 37 unchanged lines hidden ---

	uint16_t	rn_count;			/* Valid children. */
	uint16_t	rn_clev;			/* Current level. */
	void		*rn_child[VM_RADIX_COUNT];	/* Child nodes. */
};

static uma_zone_t vm_radix_node_zone;

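For orientation before the changed hunks: each trie level discriminates on a fixed number of index bits, and rn_clev records which level a node sits at. A minimal standalone sketch of the slot computation, assuming a 4-bit per-level width; the real VM_RADIX_WIDTH and VM_RADIX_COUNT are defined elsewhere in this file and may differ:

#include <stdint.h>

#define WIDTH	4		/* assumed VM_RADIX_WIDTH, illustration only */
#define COUNT	(1 << WIDTH)	/* cf. VM_RADIX_COUNT */
#define MASK	(COUNT - 1)

/* Select the child slot for 'index' at level 'clev', mirroring the role
 * that vm_radix_slot() plays in the walks below. */
static inline int
slot_of(uint64_t index, uint16_t clev)
{

	return ((index >> (clev * WIDTH)) & MASK);
}
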
/*
- * Allocate a radix node. Pre-allocation should ensure that the request
- * will always be satisfied.
+ * Allocate a radix node.
 */
static __inline struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
{
	struct vm_radix_node *rnode;

-	rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT);
-
-	/*
-	 * The required number of nodes should already be pre-allocated
-	 * by vm_radix_prealloc(). However, UMA can hold a few nodes
-	 * in per-CPU buckets, which will not be accessible by the
-	 * current CPU. Thus, the allocation could return NULL when
-	 * the pre-allocated pool is close to exhaustion. Anyway,
-	 * in practice this should never occur because a new node
-	 * is not always required for insert. Thus, the pre-allocated
-	 * pool should have some extra pages that prevent this from
-	 * becoming a problem.
-	 */
+	rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
	if (rnode == NULL)
-		panic("%s: uma_zalloc() returned NULL for a new node",
-		    __func__);
+		return (NULL);
	rnode->rn_owner = owner;
	rnode->rn_count = count;
	rnode->rn_clev = clevel;
	return (rnode);
}

/*
 * Free radix node.

--- 152 unchanged lines hidden ---

	    ("vm_radix_node_put: rnode %p has %d children", rnode,
	    rnode->rn_count));
	for (slot = 0; slot < VM_RADIX_COUNT; slot++)
		KASSERT(rnode->rn_child[slot] == NULL,
		    ("vm_radix_node_put: rnode %p has a child", rnode));
}
#endif

+#ifndef UMA_MD_SMALL_ALLOC
/*
- * Radix node zone initializer.
+ * Reserve the KVA necessary to satisfy the node allocation.
+ * This is mandatory on architectures not supporting direct
+ * mapping, as they would otherwise need to carve into the kernel maps for
+ * every node allocation, resulting in deadlocks for consumers already
+ * working with kernel maps.
 */
-static int
-vm_radix_node_zone_init(void *mem, int size __unused, int flags __unused)
-{
-	struct vm_radix_node *rnode;
-
-	rnode = mem;
-	memset(rnode->rn_child, 0, sizeof(rnode->rn_child));
-	return (0);
-}
-
-/*
- * Pre-allocate intermediate nodes from the UMA slab zone.
- */
static void
-vm_radix_prealloc(void *arg __unused)
+vm_radix_reserve_kva(void *arg __unused)
{
-	int nodes;

	/*
	 * Calculate the number of reserved nodes, discounting the pages that
	 * are needed to store them.
	 */
-	nodes = ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
-	    sizeof(struct vm_radix_node));
-	if (!uma_zone_reserve_kva(vm_radix_node_zone, nodes))
-		panic("%s: unable to create new zone", __func__);
-	uma_prealloc(vm_radix_node_zone, nodes);
+	if (!uma_zone_reserve_kva(vm_radix_node_zone,
+	    ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+	    sizeof(struct vm_radix_node))))
+		panic("%s: unable to reserve KVA", __func__);
}
-SYSINIT(vm_radix_prealloc, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_prealloc,
-    NULL);
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+    vm_radix_reserve_kva, NULL);
+#endif
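The new inline expression keeps the old arithmetic: one trie node is reserved per page of managed memory, discounting the pages the nodes themselves occupy. Requiring n = P - n*s/S (with P pages of size S and node size s) gives n = P*S / (S + s), the formula above. A standalone sketch replaying it with assumed sizes; PAGE_SIZE and the node size here are illustrative, not taken from the build:

#include <stdio.h>

int
main(void)
{
	const unsigned long long page_size = 4096;		/* assumed PAGE_SIZE */
	const unsigned long long node_size = 144;		/* hypothetical sizeof(struct vm_radix_node) */
	const unsigned long long v_page_count = 1ULL << 20;	/* 4 GB of 4 KB pages */
	unsigned long long nodes;

	/* n = P*S / (S + s): one node per page, net of node storage. */
	nodes = (v_page_count * page_size) / (page_size + node_size);
	printf("reserved nodes: %llu of %llu pages\n", nodes, v_page_count);
	return (0);
}
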

/*
 * Initialize the UMA slab zone.
 * Until vm_radix_prealloc() is called, the zone will be served by the
 * UMA boot-time pre-allocated pool of pages.
 */
void
vm_radix_init(void)
{

	vm_radix_node_zone = uma_zcreate("RADIX NODE",
	    sizeof(struct vm_radix_node), NULL,
#ifdef INVARIANTS
	    vm_radix_node_zone_dtor,
#else
	    NULL,
#endif
-	    vm_radix_node_zone_init, NULL, VM_RADIX_PAD, UMA_ZONE_VM |
-	    UMA_ZONE_NOFREE);
+	    NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
-void
+int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
	vm_pindex_t index, newind;
	void **parentp;
	struct vm_radix_node *rnode, *tmp;
	vm_page_t m;
	int slot;
	uint16_t clev;

	index = page->pindex;

+restart:
+
	/*
	 * The owner of record for root is not really important because it
	 * will never be used.
	 */
	rnode = vm_radix_getroot(rtree);
	if (rnode == NULL) {
		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
-		return;
+		return (0);
	}
	parentp = (void **)&rtree->rt_root;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			m = vm_radix_topage(rnode);
			if (m->pindex == index)
				panic("%s: key %jx is already present",
				    __func__, (uintmax_t)index);
			clev = vm_radix_keydiff(m->pindex, index);
+
+			/*
+			 * During node allocation the trie that is being
+			 * walked can be modified because of recursing radix
+			 * trie operations.
+			 * If this is the case, the recursing functions signal
+			 * such a situation and the insert operation must
+			 * start from scratch again.
+			 * The freed radix node will then very likely end up
+			 * in the UMA caches, preventing the same situation
+			 * from recurring.
+			 */
+			rtree->rt_flags |= RT_INSERT_INPROG;
			tmp = vm_radix_node_get(vm_radix_trimkey(index,
			    clev + 1), 2, clev);
+			rtree->rt_flags &= ~RT_INSERT_INPROG;
+			if (tmp == NULL) {
+				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+				return (ENOMEM);
+			}
+			if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+				tmp->rn_count = 0;
+				vm_radix_node_put(tmp);
+				goto restart;
+			}
			*parentp = tmp;
			vm_radix_addpage(tmp, index, clev, page);
			vm_radix_addpage(tmp, m->pindex, clev, m);
-			return;
+			return (0);
		} else if (vm_radix_keybarr(rnode, index))
			break;
		slot = vm_radix_slot(index, rnode->rn_clev);
		if (rnode->rn_child[slot] == NULL) {
			rnode->rn_count++;
			vm_radix_addpage(rnode, index, rnode->rn_clev, page);
-			return;
+			return (0);
		}
		parentp = &rnode->rn_child[slot];
		rnode = rnode->rn_child[slot];
	}

	/*
	 * A new node is needed because the right insertion level is reached.
	 * Setup the new intermediate node and add the 2 children: the
	 * new object and the older edge.
	 */
	newind = rnode->rn_owner;
	clev = vm_radix_keydiff(newind, index);
-	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2,
-	    clev);
+
+	/* See the comments above. */
+	rtree->rt_flags |= RT_INSERT_INPROG;
+	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
+	rtree->rt_flags &= ~RT_INSERT_INPROG;
+	if (tmp == NULL) {
+		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+		return (ENOMEM);
+	}
+	if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+		tmp->rn_count = 0;
+		vm_radix_node_put(tmp);
+		goto restart;
+	}
	*parentp = tmp;
	vm_radix_addpage(tmp, index, clev, page);
	slot = vm_radix_slot(newind, clev);
	tmp->rn_child[slot] = rnode;
+	return (0);
}
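The return-type change turns node-allocation failure into a recoverable error instead of a panic. A hedged sketch of the resulting caller-side pattern; object, m, and the rollback step are placeholders, not taken from this diff:

	/* Hypothetical caller: propagate the new allocation failure. */
	error = vm_radix_insert(&object->rtree, m);
	if (error != 0) {
		/* Undo whatever setup preceded the insert, then fail. */
		return (error);
	}
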

/*
 * Returns the value stored at the index. If the index is not present,
 * NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)

--- 248 unchanged lines hidden ---

 */
void
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *rnode, *parent;
	vm_page_t m;
	int i, slot;


672 */
673void
674vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
675{
676 struct vm_radix_node *rnode, *parent;
677 vm_page_t m;
678 int i, slot;
679
+	/*
+	 * Detect if a page is going to be removed from a trie which is
+	 * already undergoing another trie operation.
+	 * Right now this is only possible for vm_radix_remove() recursing
+	 * into vm_radix_insert().
+	 * If this is the case, the caller must be notified about this
+	 * situation. It will also take care to update the RT_TRIE_MODIFIED
+	 * flag accordingly.
+	 * The RT_TRIE_MODIFIED bit is set here because the remove operation
+	 * will always succeed.
+	 */
+	if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
+		rtree->rt_flags |= RT_TRIE_MODIFIED;
+
	rnode = vm_radix_getroot(rtree);
	if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex != index)
			panic("%s: invalid key found", __func__);
		vm_radix_setroot(rtree, NULL);
		return;
	}

--- 38 unchanged lines hidden ---
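The flag just set in vm_radix_remove() pairs with the RT_INSERT_INPROG window in vm_radix_insert() above. A self-contained toy model of that handshake under simplified, stand-in types; only the two flag names come from the diff:

#include <errno.h>
#include <stdlib.h>

#define RT_INSERT_INPROG 0x01	/* flag names as in the diff */
#define RT_TRIE_MODIFIED 0x02

struct toy_trie {
	unsigned rt_flags;
};

/* Stand-in for vm_radix_remove(): removal always succeeds, so flagging
 * an in-progress insert is all it needs to do. */
static void
toy_remove(struct toy_trie *t)
{

	if (t->rt_flags & RT_INSERT_INPROG)
		t->rt_flags |= RT_TRIE_MODIFIED;
	/* ...unlink the leaf here... */
}

/* Stand-in allocator that reenters toy_remove() once, simulating the
 * recursion described in the diff's comments. */
static void *
toy_alloc(struct toy_trie *t)
{
	static int reentered;

	if (!reentered) {
		reentered = 1;
		toy_remove(t);
	}
	return (malloc(1));
}

static int
toy_insert(struct toy_trie *t)
{
	void *node;

restart:
	t->rt_flags |= RT_INSERT_INPROG;
	node = toy_alloc(t);	/* the walked trie may be stale after this */
	t->rt_flags &= ~RT_INSERT_INPROG;
	if (node == NULL) {
		t->rt_flags &= ~RT_TRIE_MODIFIED;
		return (ENOMEM);
	}
	if (t->rt_flags & RT_TRIE_MODIFIED) {
		t->rt_flags &= ~RT_TRIE_MODIFIED;
		free(node);	/* cf. vm_radix_node_put() in the real code */
		goto restart;
	}
	/* ...link node into the trie and return success... */
	return (0);
}

int
main(void)
{
	struct toy_trie t = { 0 };

	return (toy_insert(&t) != 0);
}
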

 * This function is recursive but there is a tight control on it as the
 * maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
	struct vm_radix_node *root;

+	KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
+	    ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
+
	root = vm_radix_getroot(rtree);
	if (root == NULL)
		return;
	vm_radix_setroot(rtree, NULL);
	if (!vm_radix_isleaf(root))
		vm_radix_reclaim_allnodes_int(root);
}

+/*
+ * Replace an existing page in the trie with another one.
+ * Panics if the replacing page is not present or if the new page has an
+ * invalid key.
+ */
+vm_page_t
+vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage, vm_pindex_t index)
+{
+	struct vm_radix_node *rnode;
+	vm_page_t m;
+	int slot;
+
+	KASSERT(newpage->pindex == index, ("%s: newpage index invalid",
+	    __func__));
+
+	rnode = vm_radix_getroot(rtree);
+	if (rnode == NULL)
+		panic("%s: replacing page on an empty trie", __func__);
+	if (vm_radix_isleaf(rnode)) {
+		m = vm_radix_topage(rnode);
+		if (m->pindex != index)
+			panic("%s: original replacing root key not found",
+			    __func__);
+		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
+		return (m);
+	}
+	for (;;) {
+		slot = vm_radix_slot(index, rnode->rn_clev);
+		if (vm_radix_isleaf(rnode->rn_child[slot])) {
+			m = vm_radix_topage(rnode->rn_child[slot]);
+			if (m->pindex == index) {
+				rnode->rn_child[slot] =
+				    (void *)((uintptr_t)newpage |
+				    VM_RADIX_ISLEAF);
+				return (m);
+			} else
+				break;
+		} else if (rnode->rn_child[slot] == NULL ||
+		    vm_radix_keybarr(rnode->rn_child[slot], index))
+			break;
+		rnode = rnode->rn_child[slot];
+	}
+	panic("%s: original replacing page not found", __func__);
+}
+
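A short, hypothetical usage sketch for the new function; the object and page names are placeholders, not taken from this diff:

	/* Swap mnew in at its own index; the displaced page is returned. */
	mold = vm_radix_replace(&object->rtree, mnew, mnew->pindex);
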
#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
	struct vm_radix_node *rnode;
	int i;

--- 16 unchanged lines hidden ---