1/*
2 * Copyright (C) 2001 Momchil Velikov
3 * Portions Copyright (C) 2001 Christoph Hellwig
4 * Copyright (C) 2005 SGI, Christoph Lameter
5 * Copyright (C) 2006 Nick Piggin
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2, or (at
10 * your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22#include <linux/errno.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/radix-tree.h>
27#include <linux/percpu.h>
28#include <linux/slab.h>
29#include <linux/notifier.h>
30#include <linux/cpu.h>
31#include <linux/string.h>
32#include <linux/bitops.h>
33#include <linux/rcupdate.h>
34
35
36#ifdef __KERNEL__
37#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
38#else
39#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
40#endif
41
42#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
43#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
44
45#define RADIX_TREE_TAG_LONGS	\
46	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
47
48struct radix_tree_node {
49	unsigned int	height;		/* Height from the bottom */
50	unsigned int	count;
51	struct rcu_head	rcu_head;
52	void		*slots[RADIX_TREE_MAP_SIZE];
53	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
54};
55
56struct radix_tree_path {
57	struct radix_tree_node *node;
58	int offset;
59};
60
61#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
62#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
63					  RADIX_TREE_MAP_SHIFT))
64
65/*
66 * The height_to_maxindex array needs to be one deeper than the maximum
67 * path as height 0 holds only 1 entry.
68 */
69static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
70
71/*
72 * Radix tree node cache.
73 */
74static struct kmem_cache *radix_tree_node_cachep;
75
76/*
77 * Per-cpu pool of preloaded nodes
78 */
79struct radix_tree_preload {
80	int nr;
81	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
82};
83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
84
85static inline void *ptr_to_indirect(void *ptr)
86{
87	return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
88}
89
90static inline void *indirect_to_ptr(void *ptr)
91{
92	return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
93}
94
95static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
96{
97	return root->gfp_mask & __GFP_BITS_MASK;
98}
99
100static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
101		int offset)
102{
103	__set_bit(offset, node->tags[tag]);
104}
105
106static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
107		int offset)
108{
109	__clear_bit(offset, node->tags[tag]);
110}
111
112static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
113		int offset)
114{
115	return test_bit(offset, node->tags[tag]);
116}
117
118static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
119{
120	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
121}
122
123static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
124{
125	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
126}
127
128static inline void root_tag_clear_all(struct radix_tree_root *root)
129{
130	root->gfp_mask &= __GFP_BITS_MASK;
131}
132
133static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
134{
135	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
136}
137
138/*
139 * Returns 1 if any slot in the node has this tag set.
140 * Otherwise returns 0.
141 */
142static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
143{
144	int idx;
145	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
146		if (node->tags[tag][idx])
147			return 1;
148	}
149	return 0;
150}
151/*
152 * This assumes that the caller has performed appropriate preallocation, and
153 * that the caller has pinned this thread of control to the current CPU.
154 */
155static struct radix_tree_node *
156radix_tree_node_alloc(struct radix_tree_root *root)
157{
158	struct radix_tree_node *ret = NULL;
159	gfp_t gfp_mask = root_gfp_mask(root);
160
161	if (!(gfp_mask & __GFP_WAIT)) {
162		struct radix_tree_preload *rtp;
163
164		/*
165		 * Provided the caller has preloaded here, we will always
166		 * succeed in getting a node here (and never reach
167		 * kmem_cache_alloc)
168		 */
169		rtp = &__get_cpu_var(radix_tree_preloads);
170		if (rtp->nr) {
171			ret = rtp->nodes[rtp->nr - 1];
172			rtp->nodes[rtp->nr - 1] = NULL;
173			rtp->nr--;
174		}
175	}
176	if (ret == NULL)
177		ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
178
179	BUG_ON(radix_tree_is_indirect_ptr(ret));
180	return ret;
181}
182
183static void radix_tree_node_rcu_free(struct rcu_head *head)
184{
185	struct radix_tree_node *node =
186			container_of(head, struct radix_tree_node, rcu_head);
187	int i;
188
189	/*
190	 * must only free zeroed nodes into the slab. radix_tree_shrink
191	 * can leave us with a non-NULL entry in the first slot, so clear
192	 * that here to make sure.
193	 */
194	for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
195		tag_clear(node, i, 0);
196
197	node->slots[0] = NULL;
198	node->count = 0;
199
200	kmem_cache_free(radix_tree_node_cachep, node);
201}
202
203static inline void
204radix_tree_node_free(struct radix_tree_node *node)
205{
206	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
207}
208
209/*
210 * Load up this CPU's radix_tree_node buffer with sufficient objects to
211 * ensure that the addition of a single element in the tree cannot fail.  On
212 * success, return zero, with preemption disabled.  On error, return -ENOMEM
213 * with preemption not disabled.
214 *
215 * To make use of this facility, the radix tree must be initialised without
216 * __GFP_WAIT being passed to INIT_RADIX_TREE().
217 */
218int radix_tree_preload(gfp_t gfp_mask)
219{
220	struct radix_tree_preload *rtp;
221	struct radix_tree_node *node;
222	int ret = -ENOMEM;
223
224	preempt_disable();
225	rtp = &__get_cpu_var(radix_tree_preloads);
226	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
227		preempt_enable();
228		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
229		if (node == NULL)
230			goto out;
231		preempt_disable();
232		rtp = &__get_cpu_var(radix_tree_preloads);
233		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
234			rtp->nodes[rtp->nr++] = node;
235		else
236			kmem_cache_free(radix_tree_node_cachep, node);
237	}
238	ret = 0;
239out:
240	return ret;
241}
242EXPORT_SYMBOL(radix_tree_preload);
243
244/*
245 *	Return the maximum key which can be store into a
246 *	radix tree with height HEIGHT.
247 */
248static inline unsigned long radix_tree_maxindex(unsigned int height)
249{
250	return height_to_maxindex[height];
251}
252
253/*
254 *	Extend a radix tree so it can store key @index.
255 */
256static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
257{
258	struct radix_tree_node *node;
259	unsigned int height;
260	int tag;
261
262	/* Figure out what the height should be.  */
263	height = root->height + 1;
264	while (index > radix_tree_maxindex(height))
265		height++;
266
267	if (root->rnode == NULL) {
268		root->height = height;
269		goto out;
270	}
271
272	do {
273		unsigned int newheight;
274		if (!(node = radix_tree_node_alloc(root)))
275			return -ENOMEM;
276
277		/* Increase the height.  */
278		node->slots[0] = indirect_to_ptr(root->rnode);
279
280		/* Propagate the aggregated tag info into the new root */
281		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
282			if (root_tag_get(root, tag))
283				tag_set(node, tag, 0);
284		}
285
286		newheight = root->height+1;
287		node->height = newheight;
288		node->count = 1;
289		node = ptr_to_indirect(node);
290		rcu_assign_pointer(root->rnode, node);
291		root->height = newheight;
292	} while (height > root->height);
293out:
294	return 0;
295}
296
297/**
298 *	radix_tree_insert    -    insert into a radix tree
299 *	@root:		radix tree root
300 *	@index:		index key
301 *	@item:		item to insert
302 *
303 *	Insert an item into the radix tree at position @index.
304 */
305int radix_tree_insert(struct radix_tree_root *root,
306			unsigned long index, void *item)
307{
308	struct radix_tree_node *node = NULL, *slot;
309	unsigned int height, shift;
310	int offset;
311	int error;
312
313	BUG_ON(radix_tree_is_indirect_ptr(item));
314
315	/* Make sure the tree is high enough.  */
316	if (index > radix_tree_maxindex(root->height)) {
317		error = radix_tree_extend(root, index);
318		if (error)
319			return error;
320	}
321
322	slot = indirect_to_ptr(root->rnode);
323
324	height = root->height;
325	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
326
327	offset = 0;			/* uninitialised var warning */
328	while (height > 0) {
329		if (slot == NULL) {
330			/* Have to add a child node.  */
331			if (!(slot = radix_tree_node_alloc(root)))
332				return -ENOMEM;
333			slot->height = height;
334			if (node) {
335				rcu_assign_pointer(node->slots[offset], slot);
336				node->count++;
337			} else
338				rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
339		}
340
341		/* Go a level down */
342		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
343		node = slot;
344		slot = node->slots[offset];
345		shift -= RADIX_TREE_MAP_SHIFT;
346		height--;
347	}
348
349	if (slot != NULL)
350		return -EEXIST;
351
352	if (node) {
353		node->count++;
354		rcu_assign_pointer(node->slots[offset], item);
355		BUG_ON(tag_get(node, 0, offset));
356		BUG_ON(tag_get(node, 1, offset));
357	} else {
358		rcu_assign_pointer(root->rnode, item);
359		BUG_ON(root_tag_get(root, 0));
360		BUG_ON(root_tag_get(root, 1));
361	}
362
363	return 0;
364}
365EXPORT_SYMBOL(radix_tree_insert);
366
367/*
368 * is_slot == 1 : search for the slot.
369 * is_slot == 0 : search for the node.
370 */
371static void *radix_tree_lookup_element(struct radix_tree_root *root,
372				unsigned long index, int is_slot)
373{
374	unsigned int height, shift;
375	struct radix_tree_node *node, **slot;
376
377	node = rcu_dereference_raw(root->rnode);
378	if (node == NULL)
379		return NULL;
380
381	if (!radix_tree_is_indirect_ptr(node)) {
382		if (index > 0)
383			return NULL;
384		return is_slot ? (void *)&root->rnode : node;
385	}
386	node = indirect_to_ptr(node);
387
388	height = node->height;
389	if (index > radix_tree_maxindex(height))
390		return NULL;
391
392	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
393
394	do {
395		slot = (struct radix_tree_node **)
396			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
397		node = rcu_dereference_raw(*slot);
398		if (node == NULL)
399			return NULL;
400
401		shift -= RADIX_TREE_MAP_SHIFT;
402		height--;
403	} while (height > 0);
404
405	return is_slot ? (void *)slot : indirect_to_ptr(node);
406}
407
408/**
409 *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
410 *	@root:		radix tree root
411 *	@index:		index key
412 *
413 *	Returns:  the slot corresponding to the position @index in the
414 *	radix tree @root. This is useful for update-if-exists operations.
415 *
416 *	This function can be called under rcu_read_lock iff the slot is not
417 *	modified by radix_tree_replace_slot, otherwise it must be called
418 *	exclusive from other writers. Any dereference of the slot must be done
419 *	using radix_tree_deref_slot.
420 */
421void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
422{
423	return (void **)radix_tree_lookup_element(root, index, 1);
424}
425EXPORT_SYMBOL(radix_tree_lookup_slot);
426
427/**
428 *	radix_tree_lookup    -    perform lookup operation on a radix tree
429 *	@root:		radix tree root
430 *	@index:		index key
431 *
432 *	Lookup the item at the position @index in the radix tree @root.
433 *
434 *	This function can be called under rcu_read_lock, however the caller
435 *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
436 *	them safely). No RCU barriers are required to access or modify the
437 *	returned item, however.
438 */
439void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
440{
441	return radix_tree_lookup_element(root, index, 0);
442}
443EXPORT_SYMBOL(radix_tree_lookup);
444
445/**
446 *	radix_tree_tag_set - set a tag on a radix tree node
447 *	@root:		radix tree root
448 *	@index:		index key
449 *	@tag: 		tag index
450 *
451 *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
452 *	corresponding to @index in the radix tree.  From
453 *	the root all the way down to the leaf node.
454 *
455 *	Returns the address of the tagged item.   Setting a tag on a not-present
456 *	item is a bug.
457 */
458void *radix_tree_tag_set(struct radix_tree_root *root,
459			unsigned long index, unsigned int tag)
460{
461	unsigned int height, shift;
462	struct radix_tree_node *slot;
463
464	height = root->height;
465	BUG_ON(index > radix_tree_maxindex(height));
466
467	slot = indirect_to_ptr(root->rnode);
468	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
469
470	while (height > 0) {
471		int offset;
472
473		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
474		if (!tag_get(slot, tag, offset))
475			tag_set(slot, tag, offset);
476		slot = slot->slots[offset];
477		BUG_ON(slot == NULL);
478		shift -= RADIX_TREE_MAP_SHIFT;
479		height--;
480	}
481
482	/* set the root's tag bit */
483	if (slot && !root_tag_get(root, tag))
484		root_tag_set(root, tag);
485
486	return slot;
487}
488EXPORT_SYMBOL(radix_tree_tag_set);
489
490/**
491 *	radix_tree_tag_clear - clear a tag on a radix tree node
492 *	@root:		radix tree root
493 *	@index:		index key
494 *	@tag: 		tag index
495 *
496 *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
497 *	corresponding to @index in the radix tree.  If
498 *	this causes the leaf node to have no tags set then clear the tag in the
499 *	next-to-leaf node, etc.
500 *
501 *	Returns the address of the tagged item on success, else NULL.  ie:
502 *	has the same return value and semantics as radix_tree_lookup().
503 */
504void *radix_tree_tag_clear(struct radix_tree_root *root,
505			unsigned long index, unsigned int tag)
506{
507	/*
508	 * The radix tree path needs to be one longer than the maximum path
509	 * since the "list" is null terminated.
510	 */
511	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
512	struct radix_tree_node *slot = NULL;
513	unsigned int height, shift;
514
515	height = root->height;
516	if (index > radix_tree_maxindex(height))
517		goto out;
518
519	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
520	pathp->node = NULL;
521	slot = indirect_to_ptr(root->rnode);
522
523	while (height > 0) {
524		int offset;
525
526		if (slot == NULL)
527			goto out;
528
529		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
530		pathp[1].offset = offset;
531		pathp[1].node = slot;
532		slot = slot->slots[offset];
533		pathp++;
534		shift -= RADIX_TREE_MAP_SHIFT;
535		height--;
536	}
537
538	if (slot == NULL)
539		goto out;
540
541	while (pathp->node) {
542		if (!tag_get(pathp->node, tag, pathp->offset))
543			goto out;
544		tag_clear(pathp->node, tag, pathp->offset);
545		if (any_tag_set(pathp->node, tag))
546			goto out;
547		pathp--;
548	}
549
550	/* clear the root's tag bit */
551	if (root_tag_get(root, tag))
552		root_tag_clear(root, tag);
553
554out:
555	return slot;
556}
557EXPORT_SYMBOL(radix_tree_tag_clear);
558
559/**
560 * radix_tree_tag_get - get a tag on a radix tree node
561 * @root:		radix tree root
562 * @index:		index key
563 * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
564 *
565 * Return values:
566 *
567 *  0: tag not present or not set
568 *  1: tag set
569 *
570 * Note that the return value of this function may not be relied on, even if
571 * the RCU lock is held, unless tag modification and node deletion are excluded
572 * from concurrency.
573 */
574int radix_tree_tag_get(struct radix_tree_root *root,
575			unsigned long index, unsigned int tag)
576{
577	unsigned int height, shift;
578	struct radix_tree_node *node;
579	int saw_unset_tag = 0;
580
581	/* check the root's tag bit */
582	if (!root_tag_get(root, tag))
583		return 0;
584
585	node = rcu_dereference_raw(root->rnode);
586	if (node == NULL)
587		return 0;
588
589	if (!radix_tree_is_indirect_ptr(node))
590		return (index == 0);
591	node = indirect_to_ptr(node);
592
593	height = node->height;
594	if (index > radix_tree_maxindex(height))
595		return 0;
596
597	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
598
599	for ( ; ; ) {
600		int offset;
601
602		if (node == NULL)
603			return 0;
604
605		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
606
607		/*
608		 * This is just a debug check.  Later, we can bale as soon as
609		 * we see an unset tag.
610		 */
611		if (!tag_get(node, tag, offset))
612			saw_unset_tag = 1;
613		if (height == 1)
614			return !!tag_get(node, tag, offset);
615		node = rcu_dereference_raw(node->slots[offset]);
616		shift -= RADIX_TREE_MAP_SHIFT;
617		height--;
618	}
619}
620EXPORT_SYMBOL(radix_tree_tag_get);
621
622/**
623 * radix_tree_range_tag_if_tagged - for each item in given range set given
624 *				   tag if item has another tag set
625 * @root:		radix tree root
626 * @first_indexp:	pointer to a starting index of a range to scan
627 * @last_index:		last index of a range to scan
628 * @nr_to_tag:		maximum number items to tag
629 * @iftag:		tag index to test
630 * @settag:		tag index to set if tested tag is set
631 *
632 * This function scans range of radix tree from first_index to last_index
633 * (inclusive).  For each item in the range if iftag is set, the function sets
634 * also settag. The function stops either after tagging nr_to_tag items or
635 * after reaching last_index.
636 *
637 * The tags must be set from the leaf level only and propagated back up the
638 * path to the root. We must do this so that we resolve the full path before
639 * setting any tags on intermediate nodes. If we set tags as we descend, then
640 * we can get to the leaf node and find that the index that has the iftag
641 * set is outside the range we are scanning. This reults in dangling tags and
642 * can lead to problems with later tag operations (e.g. livelocks on lookups).
643 *
644 * The function returns number of leaves where the tag was set and sets
645 * *first_indexp to the first unscanned index.
646 * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
647 * be prepared to handle that.
648 */
649unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
650		unsigned long *first_indexp, unsigned long last_index,
651		unsigned long nr_to_tag,
652		unsigned int iftag, unsigned int settag)
653{
654	unsigned int height = root->height;
655	struct radix_tree_path path[height];
656	struct radix_tree_path *pathp = path;
657	struct radix_tree_node *slot;
658	unsigned int shift;
659	unsigned long tagged = 0;
660	unsigned long index = *first_indexp;
661
662	last_index = min(last_index, radix_tree_maxindex(height));
663	if (index > last_index)
664		return 0;
665	if (!nr_to_tag)
666		return 0;
667	if (!root_tag_get(root, iftag)) {
668		*first_indexp = last_index + 1;
669		return 0;
670	}
671	if (height == 0) {
672		*first_indexp = last_index + 1;
673		root_tag_set(root, settag);
674		return 1;
675	}
676
677	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
678	slot = indirect_to_ptr(root->rnode);
679
680	/*
681	 * we fill the path from (root->height - 2) to 0, leaving the index at
682	 * (root->height - 1) as a terminator. Zero the node in the terminator
683	 * so that we can use this to end walk loops back up the path.
684	 */
685	path[height - 1].node = NULL;
686
687	for (;;) {
688		int offset;
689
690		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
691		if (!slot->slots[offset])
692			goto next;
693		if (!tag_get(slot, iftag, offset))
694			goto next;
695		if (height > 1) {
696			/* Go down one level */
697			height--;
698			shift -= RADIX_TREE_MAP_SHIFT;
699			path[height - 1].node = slot;
700			path[height - 1].offset = offset;
701			slot = slot->slots[offset];
702			continue;
703		}
704
705		/* tag the leaf */
706		tagged++;
707		tag_set(slot, settag, offset);
708
709		/* walk back up the path tagging interior nodes */
710		pathp = &path[0];
711		while (pathp->node) {
712			/* stop if we find a node with the tag already set */
713			if (tag_get(pathp->node, settag, pathp->offset))
714				break;
715			tag_set(pathp->node, settag, pathp->offset);
716			pathp++;
717		}
718
719next:
720		/* Go to next item at level determined by 'shift' */
721		index = ((index >> shift) + 1) << shift;
722		/* Overflow can happen when last_index is ~0UL... */
723		if (index > last_index || !index)
724			break;
725		if (tagged >= nr_to_tag)
726			break;
727		while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
728			/*
729			 * We've fully scanned this node. Go up. Because
730			 * last_index is guaranteed to be in the tree, what
731			 * we do below cannot wander astray.
732			 */
733			slot = path[height - 1].node;
734			height++;
735			shift += RADIX_TREE_MAP_SHIFT;
736		}
737	}
738	/*
739	 * The iftag must have been set somewhere because otherwise
740	 * we would return immediated at the beginning of the function
741	 */
742	root_tag_set(root, settag);
743	*first_indexp = index;
744
745	return tagged;
746}
747EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
748
749
750/**
751 *	radix_tree_next_hole    -    find the next hole (not-present entry)
752 *	@root:		tree root
753 *	@index:		index key
754 *	@max_scan:	maximum range to search
755 *
756 *	Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
757 *	indexed hole.
758 *
759 *	Returns: the index of the hole if found, otherwise returns an index
760 *	outside of the set specified (in which case 'return - index >= max_scan'
761 *	will be true). In rare cases of index wrap-around, 0 will be returned.
762 *
763 *	radix_tree_next_hole may be called under rcu_read_lock. However, like
764 *	radix_tree_gang_lookup, this will not atomically search a snapshot of
765 *	the tree at a single point in time. For example, if a hole is created
766 *	at index 5, then subsequently a hole is created at index 10,
767 *	radix_tree_next_hole covering both indexes may return 10 if called
768 *	under rcu_read_lock.
769 */
770unsigned long radix_tree_next_hole(struct radix_tree_root *root,
771				unsigned long index, unsigned long max_scan)
772{
773	unsigned long i;
774
775	for (i = 0; i < max_scan; i++) {
776		if (!radix_tree_lookup(root, index))
777			break;
778		index++;
779		if (index == 0)
780			break;
781	}
782
783	return index;
784}
785EXPORT_SYMBOL(radix_tree_next_hole);
786
787/**
788 *	radix_tree_prev_hole    -    find the prev hole (not-present entry)
789 *	@root:		tree root
790 *	@index:		index key
791 *	@max_scan:	maximum range to search
792 *
793 *	Search backwards in the range [max(index-max_scan+1, 0), index]
794 *	for the first hole.
795 *
796 *	Returns: the index of the hole if found, otherwise returns an index
797 *	outside of the set specified (in which case 'index - return >= max_scan'
798 *	will be true). In rare cases of wrap-around, ULONG_MAX will be returned.
799 *
800 *	radix_tree_next_hole may be called under rcu_read_lock. However, like
801 *	radix_tree_gang_lookup, this will not atomically search a snapshot of
802 *	the tree at a single point in time. For example, if a hole is created
803 *	at index 10, then subsequently a hole is created at index 5,
804 *	radix_tree_prev_hole covering both indexes may return 5 if called under
805 *	rcu_read_lock.
806 */
807unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
808				   unsigned long index, unsigned long max_scan)
809{
810	unsigned long i;
811
812	for (i = 0; i < max_scan; i++) {
813		if (!radix_tree_lookup(root, index))
814			break;
815		index--;
816		if (index == ULONG_MAX)
817			break;
818	}
819
820	return index;
821}
822EXPORT_SYMBOL(radix_tree_prev_hole);
823
824static unsigned int
825__lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
826	unsigned int max_items, unsigned long *next_index)
827{
828	unsigned int nr_found = 0;
829	unsigned int shift, height;
830	unsigned long i;
831
832	height = slot->height;
833	if (height == 0)
834		goto out;
835	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
836
837	for ( ; height > 1; height--) {
838		i = (index >> shift) & RADIX_TREE_MAP_MASK;
839		for (;;) {
840			if (slot->slots[i] != NULL)
841				break;
842			index &= ~((1UL << shift) - 1);
843			index += 1UL << shift;
844			if (index == 0)
845				goto out;	/* 32-bit wraparound */
846			i++;
847			if (i == RADIX_TREE_MAP_SIZE)
848				goto out;
849		}
850
851		shift -= RADIX_TREE_MAP_SHIFT;
852		slot = rcu_dereference_raw(slot->slots[i]);
853		if (slot == NULL)
854			goto out;
855	}
856
857	/* Bottom level: grab some items */
858	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
859		index++;
860		if (slot->slots[i]) {
861			results[nr_found++] = &(slot->slots[i]);
862			if (nr_found == max_items)
863				goto out;
864		}
865	}
866out:
867	*next_index = index;
868	return nr_found;
869}
870
871/**
872 *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
873 *	@root:		radix tree root
874 *	@results:	where the results of the lookup are placed
875 *	@first_index:	start the lookup from this key
876 *	@max_items:	place up to this many items at *results
877 *
878 *	Performs an index-ascending scan of the tree for present items.  Places
879 *	them at *@results and returns the number of items which were placed at
880 *	*@results.
881 *
882 *	The implementation is naive.
883 *
884 *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
885 *	rcu_read_lock. In this case, rather than the returned results being
886 *	an atomic snapshot of the tree at a single point in time, the semantics
887 *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
888 *	have been issued in individual locks, and results stored in 'results'.
889 */
890unsigned int
891radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
892			unsigned long first_index, unsigned int max_items)
893{
894	unsigned long max_index;
895	struct radix_tree_node *node;
896	unsigned long cur_index = first_index;
897	unsigned int ret;
898
899	node = rcu_dereference_raw(root->rnode);
900	if (!node)
901		return 0;
902
903	if (!radix_tree_is_indirect_ptr(node)) {
904		if (first_index > 0)
905			return 0;
906		results[0] = node;
907		return 1;
908	}
909	node = indirect_to_ptr(node);
910
911	max_index = radix_tree_maxindex(node->height);
912
913	ret = 0;
914	while (ret < max_items) {
915		unsigned int nr_found, slots_found, i;
916		unsigned long next_index;	/* Index of next search */
917
918		if (cur_index > max_index)
919			break;
920		slots_found = __lookup(node, (void ***)results + ret, cur_index,
921					max_items - ret, &next_index);
922		nr_found = 0;
923		for (i = 0; i < slots_found; i++) {
924			struct radix_tree_node *slot;
925			slot = *(((void ***)results)[ret + i]);
926			if (!slot)
927				continue;
928			results[ret + nr_found] =
929				indirect_to_ptr(rcu_dereference_raw(slot));
930			nr_found++;
931		}
932		ret += nr_found;
933		if (next_index == 0)
934			break;
935		cur_index = next_index;
936	}
937
938	return ret;
939}
940EXPORT_SYMBOL(radix_tree_gang_lookup);
941
942/**
943 *	radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
944 *	@root:		radix tree root
945 *	@results:	where the results of the lookup are placed
946 *	@first_index:	start the lookup from this key
947 *	@max_items:	place up to this many items at *results
948 *
949 *	Performs an index-ascending scan of the tree for present items.  Places
950 *	their slots at *@results and returns the number of items which were
951 *	placed at *@results.
952 *
953 *	The implementation is naive.
954 *
955 *	Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
956 *	be dereferenced with radix_tree_deref_slot, and if using only RCU
957 *	protection, radix_tree_deref_slot may fail requiring a retry.
958 */
959unsigned int
960radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
961			unsigned long first_index, unsigned int max_items)
962{
963	unsigned long max_index;
964	struct radix_tree_node *node;
965	unsigned long cur_index = first_index;
966	unsigned int ret;
967
968	node = rcu_dereference_raw(root->rnode);
969	if (!node)
970		return 0;
971
972	if (!radix_tree_is_indirect_ptr(node)) {
973		if (first_index > 0)
974			return 0;
975		results[0] = (void **)&root->rnode;
976		return 1;
977	}
978	node = indirect_to_ptr(node);
979
980	max_index = radix_tree_maxindex(node->height);
981
982	ret = 0;
983	while (ret < max_items) {
984		unsigned int slots_found;
985		unsigned long next_index;	/* Index of next search */
986
987		if (cur_index > max_index)
988			break;
989		slots_found = __lookup(node, results + ret, cur_index,
990					max_items - ret, &next_index);
991		ret += slots_found;
992		if (next_index == 0)
993			break;
994		cur_index = next_index;
995	}
996
997	return ret;
998}
999EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
1000
1001static unsigned int
1002__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index,
1003	unsigned int max_items, unsigned long *next_index, unsigned int tag)
1004{
1005	unsigned int nr_found = 0;
1006	unsigned int shift, height;
1007
1008	height = slot->height;
1009	if (height == 0)
1010		goto out;
1011	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
1012
1013	while (height > 0) {
1014		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
1015
1016		for (;;) {
1017			if (tag_get(slot, tag, i))
1018				break;
1019			index &= ~((1UL << shift) - 1);
1020			index += 1UL << shift;
1021			if (index == 0)
1022				goto out;	/* 32-bit wraparound */
1023			i++;
1024			if (i == RADIX_TREE_MAP_SIZE)
1025				goto out;
1026		}
1027		height--;
1028		if (height == 0) {	/* Bottom level: grab some items */
1029			unsigned long j = index & RADIX_TREE_MAP_MASK;
1030
1031			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
1032				index++;
1033				if (!tag_get(slot, tag, j))
1034					continue;
1035				/*
1036				 * Even though the tag was found set, we need to
1037				 * recheck that we have a non-NULL node, because
1038				 * if this lookup is lockless, it may have been
1039				 * subsequently deleted.
1040				 *
1041				 * Similar care must be taken in any place that
1042				 * lookup ->slots[x] without a lock (ie. can't
1043				 * rely on its value remaining the same).
1044				 */
1045				if (slot->slots[j]) {
1046					results[nr_found++] = &(slot->slots[j]);
1047					if (nr_found == max_items)
1048						goto out;
1049				}
1050			}
1051		}
1052		shift -= RADIX_TREE_MAP_SHIFT;
1053		slot = rcu_dereference_raw(slot->slots[i]);
1054		if (slot == NULL)
1055			break;
1056	}
1057out:
1058	*next_index = index;
1059	return nr_found;
1060}
1061
1062/**
1063 *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
1064 *	                             based on a tag
1065 *	@root:		radix tree root
1066 *	@results:	where the results of the lookup are placed
1067 *	@first_index:	start the lookup from this key
1068 *	@max_items:	place up to this many items at *results
1069 *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
1070 *
1071 *	Performs an index-ascending scan of the tree for present items which
1072 *	have the tag indexed by @tag set.  Places the items at *@results and
1073 *	returns the number of items which were placed at *@results.
1074 */
1075unsigned int
1076radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1077		unsigned long first_index, unsigned int max_items,
1078		unsigned int tag)
1079{
1080	struct radix_tree_node *node;
1081	unsigned long max_index;
1082	unsigned long cur_index = first_index;
1083	unsigned int ret;
1084
1085	/* check the root's tag bit */
1086	if (!root_tag_get(root, tag))
1087		return 0;
1088
1089	node = rcu_dereference_raw(root->rnode);
1090	if (!node)
1091		return 0;
1092
1093	if (!radix_tree_is_indirect_ptr(node)) {
1094		if (first_index > 0)
1095			return 0;
1096		results[0] = node;
1097		return 1;
1098	}
1099	node = indirect_to_ptr(node);
1100
1101	max_index = radix_tree_maxindex(node->height);
1102
1103	ret = 0;
1104	while (ret < max_items) {
1105		unsigned int nr_found, slots_found, i;
1106		unsigned long next_index;	/* Index of next search */
1107
1108		if (cur_index > max_index)
1109			break;
1110		slots_found = __lookup_tag(node, (void ***)results + ret,
1111				cur_index, max_items - ret, &next_index, tag);
1112		nr_found = 0;
1113		for (i = 0; i < slots_found; i++) {
1114			struct radix_tree_node *slot;
1115			slot = *(((void ***)results)[ret + i]);
1116			if (!slot)
1117				continue;
1118			results[ret + nr_found] =
1119				indirect_to_ptr(rcu_dereference_raw(slot));
1120			nr_found++;
1121		}
1122		ret += nr_found;
1123		if (next_index == 0)
1124			break;
1125		cur_index = next_index;
1126	}
1127
1128	return ret;
1129}
1130EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
1131
1132/**
1133 *	radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
1134 *					  radix tree based on a tag
1135 *	@root:		radix tree root
1136 *	@results:	where the results of the lookup are placed
1137 *	@first_index:	start the lookup from this key
1138 *	@max_items:	place up to this many items at *results
1139 *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
1140 *
1141 *	Performs an index-ascending scan of the tree for present items which
1142 *	have the tag indexed by @tag set.  Places the slots at *@results and
1143 *	returns the number of slots which were placed at *@results.
1144 */
1145unsigned int
1146radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1147		unsigned long first_index, unsigned int max_items,
1148		unsigned int tag)
1149{
1150	struct radix_tree_node *node;
1151	unsigned long max_index;
1152	unsigned long cur_index = first_index;
1153	unsigned int ret;
1154
1155	/* check the root's tag bit */
1156	if (!root_tag_get(root, tag))
1157		return 0;
1158
1159	node = rcu_dereference_raw(root->rnode);
1160	if (!node)
1161		return 0;
1162
1163	if (!radix_tree_is_indirect_ptr(node)) {
1164		if (first_index > 0)
1165			return 0;
1166		results[0] = (void **)&root->rnode;
1167		return 1;
1168	}
1169	node = indirect_to_ptr(node);
1170
1171	max_index = radix_tree_maxindex(node->height);
1172
1173	ret = 0;
1174	while (ret < max_items) {
1175		unsigned int slots_found;
1176		unsigned long next_index;	/* Index of next search */
1177
1178		if (cur_index > max_index)
1179			break;
1180		slots_found = __lookup_tag(node, results + ret,
1181				cur_index, max_items - ret, &next_index, tag);
1182		ret += slots_found;
1183		if (next_index == 0)
1184			break;
1185		cur_index = next_index;
1186	}
1187
1188	return ret;
1189}
1190EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
1191
1192
1193/**
1194 *	radix_tree_shrink    -    shrink height of a radix tree to minimal
1195 *	@root		radix tree root
1196 */
1197static inline void radix_tree_shrink(struct radix_tree_root *root)
1198{
1199	/* try to shrink tree height */
1200	while (root->height > 0) {
1201		struct radix_tree_node *to_free = root->rnode;
1202		void *newptr;
1203
1204		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
1205		to_free = indirect_to_ptr(to_free);
1206
1207		/*
1208		 * The candidate node has more than one child, or its child
1209		 * is not at the leftmost slot, we cannot shrink.
1210		 */
1211		if (to_free->count != 1)
1212			break;
1213		if (!to_free->slots[0])
1214			break;
1215
1216		/*
1217		 * We don't need rcu_assign_pointer(), since we are simply
1218		 * moving the node from one part of the tree to another: if it
1219		 * was safe to dereference the old pointer to it
1220		 * (to_free->slots[0]), it will be safe to dereference the new
1221		 * one (root->rnode) as far as dependent read barriers go.
1222		 */
1223		newptr = to_free->slots[0];
1224		if (root->height > 1)
1225			newptr = ptr_to_indirect(newptr);
1226		root->rnode = newptr;
1227		root->height--;
1228
1229		/*
1230		 * We have a dilemma here. The node's slot[0] must not be
1231		 * NULLed in case there are concurrent lookups expecting to
1232		 * find the item. However if this was a bottom-level node,
1233		 * then it may be subject to the slot pointer being visible
1234		 * to callers dereferencing it. If item corresponding to
1235		 * slot[0] is subsequently deleted, these callers would expect
1236		 * their slot to become empty sooner or later.
1237		 *
1238		 * For example, lockless pagecache will look up a slot, deref
1239		 * the page pointer, and if the page is 0 refcount it means it
1240		 * was concurrently deleted from pagecache so try the deref
1241		 * again. Fortunately there is already a requirement for logic
1242		 * to retry the entire slot lookup -- the indirect pointer
1243		 * problem (replacing direct root node with an indirect pointer
1244		 * also results in a stale slot). So tag the slot as indirect
1245		 * to force callers to retry.
1246		 */
1247		if (root->height == 0)
1248			*((unsigned long *)&to_free->slots[0]) |=
1249						RADIX_TREE_INDIRECT_PTR;
1250
1251		radix_tree_node_free(to_free);
1252	}
1253}
1254
1255/**
1256 *	radix_tree_delete    -    delete an item from a radix tree
1257 *	@root:		radix tree root
1258 *	@index:		index key
1259 *
1260 *	Remove the item at @index from the radix tree rooted at @root.
1261 *
1262 *	Returns the address of the deleted item, or NULL if it was not present.
1263 */
1264void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1265{
1266	/*
1267	 * The radix tree path needs to be one longer than the maximum path
1268	 * since the "list" is null terminated.
1269	 */
1270	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
1271	struct radix_tree_node *slot = NULL;
1272	struct radix_tree_node *to_free;
1273	unsigned int height, shift;
1274	int tag;
1275	int offset;
1276
1277	height = root->height;
1278	if (index > radix_tree_maxindex(height))
1279		goto out;
1280
1281	slot = root->rnode;
1282	if (height == 0) {
1283		root_tag_clear_all(root);
1284		root->rnode = NULL;
1285		goto out;
1286	}
1287	slot = indirect_to_ptr(slot);
1288
1289	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
1290	pathp->node = NULL;
1291
1292	do {
1293		if (slot == NULL)
1294			goto out;
1295
1296		pathp++;
1297		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
1298		pathp->offset = offset;
1299		pathp->node = slot;
1300		slot = slot->slots[offset];
1301		shift -= RADIX_TREE_MAP_SHIFT;
1302		height--;
1303	} while (height > 0);
1304
1305	if (slot == NULL)
1306		goto out;
1307
1308	/*
1309	 * Clear all tags associated with the just-deleted item
1310	 */
1311	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
1312		if (tag_get(pathp->node, tag, pathp->offset))
1313			radix_tree_tag_clear(root, index, tag);
1314	}
1315
1316	to_free = NULL;
1317	/* Now free the nodes we do not need anymore */
1318	while (pathp->node) {
1319		pathp->node->slots[pathp->offset] = NULL;
1320		pathp->node->count--;
1321		/*
1322		 * Queue the node for deferred freeing after the
1323		 * last reference to it disappears (set NULL, above).
1324		 */
1325		if (to_free)
1326			radix_tree_node_free(to_free);
1327
1328		if (pathp->node->count) {
1329			if (pathp->node == indirect_to_ptr(root->rnode))
1330				radix_tree_shrink(root);
1331			goto out;
1332		}
1333
1334		/* Node with zero slots in use so free it */
1335		to_free = pathp->node;
1336		pathp--;
1337
1338	}
1339	root_tag_clear_all(root);
1340	root->height = 0;
1341	root->rnode = NULL;
1342	if (to_free)
1343		radix_tree_node_free(to_free);
1344
1345out:
1346	return slot;
1347}
1348EXPORT_SYMBOL(radix_tree_delete);
1349
1350/**
1351 *	radix_tree_tagged - test whether any items in the tree are tagged
1352 *	@root:		radix tree root
1353 *	@tag:		tag to test
1354 */
1355int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
1356{
1357	return root_tag_get(root, tag);
1358}
1359EXPORT_SYMBOL(radix_tree_tagged);
1360
1361static void
1362radix_tree_node_ctor(void *node)
1363{
1364	memset(node, 0, sizeof(struct radix_tree_node));
1365}
1366
1367static __init unsigned long __maxindex(unsigned int height)
1368{
1369	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
1370	int shift = RADIX_TREE_INDEX_BITS - width;
1371
1372	if (shift < 0)
1373		return ~0UL;
1374	if (shift >= BITS_PER_LONG)
1375		return 0UL;
1376	return ~0UL >> shift;
1377}
1378
1379static __init void radix_tree_init_maxindex(void)
1380{
1381	unsigned int i;
1382
1383	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
1384		height_to_maxindex[i] = __maxindex(i);
1385}
1386
1387static int radix_tree_callback(struct notifier_block *nfb,
1388                            unsigned long action,
1389                            void *hcpu)
1390{
1391       int cpu = (long)hcpu;
1392       struct radix_tree_preload *rtp;
1393
1394       /* Free per-cpu pool of perloaded nodes */
1395       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1396               rtp = &per_cpu(radix_tree_preloads, cpu);
1397               while (rtp->nr) {
1398                       kmem_cache_free(radix_tree_node_cachep,
1399                                       rtp->nodes[rtp->nr-1]);
1400                       rtp->nodes[rtp->nr-1] = NULL;
1401                       rtp->nr--;
1402               }
1403       }
1404       return NOTIFY_OK;
1405}
1406
1407void __init radix_tree_init(void)
1408{
1409	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1410			sizeof(struct radix_tree_node), 0,
1411			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1412			radix_tree_node_ctor);
1413	radix_tree_init_maxindex();
1414	hotcpu_notifier(radix_tree_callback, 0);
1415}
1416