1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET		An implementation of the TCP/IP protocol suite for the LINUX
30 *		operating system.  INET is implemented using the  BSD Socket
31 *		interface as the means of communication with the user level.
32 *
33 *		IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 *		This program is free software; you can redistribute it and/or
39 *		modify it under the terms of the GNU General Public License
40 *		as published by the Free Software Foundation; either version
41 *		2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 *		David S. Miller, <davem@davemloft.net>
46 *		Stephen Hemminger <shemminger@osdl.org>
47 *		Paul E. McKenney <paulmck@us.ibm.com>
48 *		Patrick McHardy <kaber@trash.net>
49 */
50
51#define VERSION "0.409"
52
53#include <asm/uaccess.h>
54#include <asm/system.h>
55#include <linux/bitops.h>
56#include <linux/types.h>
57#include <linux/kernel.h>
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
65#include <linux/inetdevice.h>
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
69#include <linux/rcupdate.h>
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
74#include <linux/slab.h>
75#include <net/net_namespace.h>
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
84#define MAX_STAT_DEPTH 32
85
86#define KEYLENGTH (8*sizeof(t_key))
87
88typedef unsigned int t_key;
89
90#define T_TNODE 0
91#define T_LEAF  1
92#define NODE_TYPE_MASK	0x1UL
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94
95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF)
97
/* Common header shared by struct leaf and struct tnode.  Any trie node
 * may be viewed through this type; the low bit of 'parent' encodes the
 * node type (T_LEAF/T_TNODE) and the remaining bits hold the parent
 * tnode pointer (see node_parent()/NODE_TYPE()).
 */
struct node {
	unsigned long parent;	/* parent tnode pointer | node-type bit */
	t_key key;		/* key (prefix) bits for this node */
};
102
/* External (leaf) node: all prefixes sharing one 32-bit key.  The first
 * two members must stay layout-compatible with struct node.
 */
struct leaf {
	unsigned long parent;	/* parent tnode pointer | T_LEAF bit */
	t_key key;		/* full 32-bit key */
	struct hlist_head list;	/* leaf_info entries, one per prefix length */
	struct rcu_head rcu;	/* for RCU-deferred freeing */
};
109
/* Per-prefix-length bucket hanging off a leaf; holds the fib_alias list
 * for one (key, plen) pair.
 */
struct leaf_info {
	struct hlist_node hlist;	/* link in leaf->list */
	struct rcu_head rcu;		/* for RCU-deferred freeing */
	int plen;			/* prefix length this entry covers */
	struct list_head falh;		/* fib_alias list head */
};
116
/* Internal node with 2^bits child slots, indexed by key bits
 * [pos, pos + bits).  The first two members must stay layout-compatible
 * with struct node.
 */
struct tnode {
	unsigned long parent;
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
	union {
		struct rcu_head rcu;		/* RCU-deferred kfree */
		struct work_struct work;	/* deferred vfree of large nodes */
		struct tnode *tnode_free;	/* tnode_free_safe() list link */
	};
	struct node *child[0];		/* 2^bits child slots follow inline */
};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Runtime lookup statistics, kept per trie (CONFIG_IP_FIB_TRIE_STATS). */
struct trie_use_stats {
	unsigned int gets;			/* total lookups */
	unsigned int backtrack;			/* chopped-off-bits backtracks */
	unsigned int semantic_match_passed;	/* prefix checks that matched */
	unsigned int semantic_match_miss;	/* prefix checks that missed */
	unsigned int null_node_hit;		/* empty child slots hit */
	unsigned int resize_node_skipped;	/* inflate/halve aborted (ENOMEM) */
};
141#endif
142
/* Snapshot of trie shape, gathered on demand (e.g. for /proc output). */
struct trie_stat {
	unsigned int totdepth;	/* sum of leaf depths (for averaging) */
	unsigned int maxdepth;	/* deepest leaf */
	unsigned int tnodes;	/* internal node count */
	unsigned int leaves;	/* leaf count */
	unsigned int nullpointers;	/* empty child slots */
	unsigned int prefixes;	/* leaf_info (prefix) count */
	unsigned int nodesizes[MAX_STAT_DEPTH];	/* tnode count per 'bits' */
};
152
/* One routing trie; 'trie' is the RCU-protected root node pointer. */
struct trie {
	struct node *trie;	/* root: NULL, a leaf, or a tnode */
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};
159
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162				  int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */
167static struct tnode *tnode_free_head;
168static size_t tnode_free_size;
169
170/*
171 * synchronize_rcu after call_rcu for that many pages; it should be especially
172 * useful before resizing the root node with PREEMPT_NONE configs; the value was
173 * obtained experimentally, aiming to avoid visible slowdown.
174 */
175static const int sync_pages = 128;
176
177static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly;
179
180static inline struct tnode *node_parent(struct node *node)
181{
182	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183}
184
/* Reader-side variant of node_parent(); legal under rcu_read_lock() or
 * with the RTNL held (update side).
 * NOTE(review): rcu_dereference_check() is applied to the value already
 * loaded by node_parent() rather than to the ->parent load itself —
 * confirm this still gives the intended annotation/ordering.
 */
static inline struct tnode *node_parent_rcu(struct node *node)
{
	struct tnode *ret = node_parent(node);

	return rcu_dereference_check(ret,
				     rcu_read_lock_held() ||
				     lockdep_rtnl_is_held());
}
193
194/* Same as rcu_assign_pointer
195 * but that macro() assumes that value is a pointer.
196 */
static inline void node_set_parent(struct node *node, struct tnode *ptr)
{
	/* Publish like rcu_assign_pointer(): order the node's prior
	 * initialization before the store, preserving the node-type bit
	 * kept in the low bit of ->parent.
	 */
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
202
/* Return child slot i of tn; i must be below the node's 2^bits slots. */
static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);

	return tn->child[i];
}
209
/* Reader-side variant of tnode_get_child(); legal under rcu_read_lock()
 * or with the RTNL held.
 * NOTE(review): as with node_parent_rcu(), the check is applied after
 * the load — confirm the annotation placement is intentional.
 */
static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
	struct node *ret = tnode_get_child(tn, i);

	return rcu_dereference_check(ret,
				     rcu_read_lock_held() ||
				     lockdep_rtnl_is_held());
}
218
/* Number of child slots in tn: a tnode with 'bits' index bits has 2^bits. */
static inline int tnode_child_length(const struct tnode *tn)
{
	return 1 << tn->bits;
}
223
224static inline t_key mask_pfx(t_key k, unsigned short l)
225{
226	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
227}
228
/* Extract 'bits' key bits starting at 'offset' (MSB-first numbering),
 * right-aligned in the result.  Offsets past the key yield 0.
 * NOTE(review): bits == 0 would shift by KEYLENGTH, which is undefined —
 * confirm all callers pass bits >= 1.
 */
static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
	if (offset < KEYLENGTH)
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
	else
		return 0;
}
236
237static inline int tkey_equals(t_key a, t_key b)
238{
239	return a == b;
240}
241
/* Compare bit range [offset, offset + bits) of a and b (MSB-first);
 * returns 1 on match.  A zero-length range, or an offset past the key,
 * trivially matches.  'bits' is clamped to KEYLENGTH.
 */
static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}
249
/* Return the position (MSB-first) of the first differing bit of a and b
 * at or after 'offset'; 0 means the keys are identical.
 * NOTE(review): if a and b differ only in bits before 'offset', the scan
 * has no terminating difference to find — confirm callers guarantee the
 * first 'offset' bits match (insert path does via tkey_sub_equals()).
 */
static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

	if (!diff)
		return 0;
	/* Advance until the bit at position i is a differing bit. */
	while ((diff << i) >> (KEYLENGTH-1) == 0)
		i++;
	return i;
}
261
262/*
263  To understand this stuff, an understanding of keys and all their bits is
264  necessary. Every node in the trie has a key associated with it, but not
265  all of the bits in that key are significant.
266
267  Consider a node 'n' and its parent 'tp'.
268
269  If n is a leaf, every bit in its key is significant. Its presence is
270  necessitated by path compression, since during a tree traversal (when
271  searching for a leaf - unless we are doing an insertion) we will completely
272  ignore all skipped bits we encounter. Thus we need to verify, at the end of
273  a potentially successful search, that we have indeed been walking the
274  correct key path.
275
276  Note that we can never "miss" the correct key in the tree if present by
277  following the wrong path. Path compression ensures that segments of the key
278  that are the same for all keys with a given prefix are skipped, but the
279  skipped part *is* identical for each node in the subtrie below the skipped
280  bit! trie_insert() in this implementation takes care of that - note the
281  call to tkey_sub_equals() in trie_insert().
282
283  if n is an internal node - a 'tnode' here, the various parts of its key
284  have many different meanings.
285
286  Example:
287  _________________________________________________________________
288  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
289  -----------------------------------------------------------------
290    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
291
292  _________________________________________________________________
293  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
294  -----------------------------------------------------------------
295   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
296
297  tp->pos = 7
298  tp->bits = 3
299  n->pos = 15
300  n->bits = 4
301
302  First, let's just ignore the bits that come before the parent tp, that is
303  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
304  not use them for anything.
305
306  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
307  index into the parent's child array. That is, they will be used to find
308  'n' among tp's children.
309
310  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
311  for the node n.
312
313  All the bits we have seen so far are significant to the node n. The rest
314  of the bits are really not needed or indeed known in n->key.
315
316  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
317  n's child array, and will of course be different for each child.
318
319
320  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
321  at this point.
322
323*/
324
/* Sanity check: a tnode's index bits must not extend past the 32-bit key. */
static inline void check_tnode(const struct tnode *tn)
{
	WARN_ON(tn && tn->pos+tn->bits > 32);
}
329
330static const int halve_threshold = 25;
331static const int inflate_threshold = 50;
332static const int halve_threshold_root = 15;
333static const int inflate_threshold_root = 30;
334
/* RCU callback: return a fib_alias to its slab cache. */
static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}
340
/* Free a fib_alias after a grace period, so RCU readers may still use it. */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
345
/* RCU callback: return a leaf to its slab cache. */
static void __leaf_free_rcu(struct rcu_head *head)
{
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
}
351
/* Free a leaf after a grace period.
 * NOTE(review): uses call_rcu_bh() while tnodes use call_rcu() — the
 * read side presumably runs in softirq context; confirm the flavor mix
 * is intended.
 */
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}
356
/* RCU callback: kfree a leaf_info (allocated with kmalloc). */
static void __leaf_info_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf_info, rcu));
}
361
/* Free a leaf_info after a grace period. */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
366
367static struct tnode *tnode_alloc(size_t size)
368{
369	if (size <= PAGE_SIZE)
370		return kzalloc(size, GFP_KERNEL);
371	else
372		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
373}
374
/* Workqueue handler: vfree a vmalloc'd tnode (queued by __tnode_free_rcu). */
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
}
380
/* RCU callback: free a tnode.  Mirrors tnode_alloc(): slab-sized nodes
 * are kfree'd directly, vmalloc'd ones are handed to a workqueue since
 * vfree cannot run in this callback context.
 */
static void __tnode_free_rcu(struct rcu_head *head)
{
	struct tnode *tn = container_of(head, struct tnode, rcu);
	size_t size = sizeof(struct tnode) +
		      (sizeof(struct node *) << tn->bits);

	if (size <= PAGE_SIZE)
		kfree(tn);
	else {
		INIT_WORK(&tn->work, __tnode_vfree);
		schedule_work(&tn->work);
	}
}
394
/* RCU-free any node: dispatches leaves to free_leaf() and internal
 * nodes to __tnode_free_rcu() based on the type bit in ->parent.
 */
static inline void tnode_free(struct tnode *tn)
{
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
		call_rcu(&tn->rcu, __tnode_free_rcu);
}
402
/* Queue tn on the RTNL-protected pending-free list; the node stays
 * reachable for readers until tnode_free_flush() runs.  Leaves must
 * not be queued here — the list link aliases other union members.
 */
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
	tnode_free_size += sizeof(struct tnode) +
			   (sizeof(struct node *) << tn->bits);
}
411
/* Hand every node queued by tnode_free_safe() to RCU, then throttle:
 * once roughly sync_pages worth of memory is pending, wait for a grace
 * period so callbacks cannot accumulate without bound.
 */
static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}
427
428static struct leaf *leaf_new(void)
429{
430	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
431	if (l) {
432		l->parent = T_LEAF;
433		INIT_HLIST_HEAD(&l->list);
434	}
435	return l;
436}
437
438static struct leaf_info *leaf_info_new(int plen)
439{
440	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
441	if (li) {
442		li->plen = plen;
443		INIT_LIST_HEAD(&li->falh);
444	}
445	return li;
446}
447
448static struct tnode *tnode_new(t_key key, int pos, int bits)
449{
450	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
451	struct tnode *tn = tnode_alloc(sz);
452
453	if (tn) {
454		tn->parent = T_TNODE;
455		tn->pos = pos;
456		tn->bits = bits;
457		tn->key = key;
458		tn->full_children = 0;
459		tn->empty_children = 1<<bits;
460	}
461
462	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
463		 (unsigned long) (sizeof(struct node) << bits));
464	return tn;
465}
466
467/*
468 * Check whether a tnode 'n' is "full", i.e. it is an internal node
469 * and no bits are skipped. See discussion in dyntree paper p. 6
470 */
471
472static inline int tnode_full(const struct tnode *tn, const struct node *n)
473{
474	if (n == NULL || IS_LEAF(n))
475		return 0;
476
477	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
478}
479
/* Store child n in slot i of tn; wasfull == -1 tells the reorg helper
 * to recompute the previous slot's fullness itself.
 */
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
485
486 /*
487  * Add a child at position i overwriting the old value.
488  * Update the value of full_children and empty_children.
489  */
490
/* Replace child slot i of tn with n (which may be NULL), maintaining the
 * empty_children/full_children counters used by resize().  'wasfull'
 * caches tnode_full() for the old child; pass -1 to recompute it here.
 * The new child is published last, via rcu_assign_pointer(), after its
 * parent pointer is set, so readers never see a half-linked node.
 */
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull)
{
	struct node *chi = tn->child[i];
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		node_set_parent(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}
520
521#define MAX_WORK 10
/* Rebalance one tnode: drop it if empty, splice out single children,
 * otherwise repeatedly inflate() while the doubled node would stay
 * dense enough, or halve() while it is too sparse.  Work is bounded by
 * MAX_WORK rounds; failed inflate/halve (ENOMEM) simply keeps the old
 * node.  Returns the (possibly replaced) node, or NULL if tn vanished.
 * Caller holds the RTNL; freed tnodes go through tnode_free_safe().
 */
static struct node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_work;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free_safe(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		goto one_child;
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >=
	 *      inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *    tn->empty_children) >= inflate_threshold *
	 *    tnode_child_length(tn)
	 *
	 */

	check_tnode(tn);

	/* Keep root node larger  */

	if (!node_parent((struct node*) tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	}
	else {
		inflate_threshold_use = inflate_threshold;
		halve_threshold_use = halve_threshold;
	}

	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		/* inflate() ran out of memory: keep the old node as-is */
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	check_tnode(tn);

	/* Return if at least one inflate is run */
	if( max_work != MAX_WORK)
		return (struct node *) tn;

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */

	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		/* halve() ran out of memory: keep the old node as-is */
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}


	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */

			node_set_parent(n, NULL);
			tnode_free_safe(tn);
			return n;
		}
	}
	return (struct node *) tn;
}
686
/* Double a tnode: build a replacement with bits+1 index bits and move
 * every child of oldtnode into it.  "Full" children (tnodes with no
 * skipped bits) are split across two slots; others land in the slot
 * selected by their next key bit.  Returns the new node, or
 * ERR_PTR(-ENOMEM) with oldtnode untouched on allocation failure.
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = (struct tnode *) tnode_get_child(oldtnode, i);
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			/* m: single bit set at position inode->pos */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

	/* Second pass: actually migrate children into the doubled node. */
	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */

		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);

		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
		}
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	{
		/* Undo the preallocation pass: free the partly-built node. */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
829
/* Halve a tnode: build a replacement with bits-1 index bits.  Slot pairs
 * (2k, 2k+1) collapse into slot k — directly when at most one of the
 * pair is non-empty, otherwise via a preallocated 1-bit binary tnode.
 * Returns the new node, or ERR_PTR(-ENOMEM) with oldtnode untouched on
 * allocation failure.
 */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(t, tn, i/2, (struct node *)newn);
		}

	}

	/* Second pass: actually migrate the child pairs. */
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/* Two nonempty children */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	{
		/* Undo the preallocation pass: free the partly-built node. */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
911
/* The read side must hold rcu_read_lock; currently that is the dump
 * routines, via get_fa_head() and dump. */
914
/* Find the leaf_info for prefix length 'plen' on leaf l, or NULL.
 * Safe for RCU readers (uses the _rcu list walk).
 */
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
	struct hlist_head *head = &l->list;
	struct hlist_node *node;
	struct leaf_info *li;

	hlist_for_each_entry_rcu(li, node, head, hlist)
		if (li->plen == plen)
			return li;

	return NULL;
}
927
928static inline struct list_head *get_fa_head(struct leaf *l, int plen)
929{
930	struct leaf_info *li = find_leaf_info(l, plen);
931
932	if (!li)
933		return NULL;
934
935	return &li->falh;
936}
937
/* Insert 'new' into a leaf's leaf_info list, keeping it sorted by
 * descending prefix length.  Uses the _rcu list primitives so readers
 * can walk the list concurrently; caller holds the RTNL.
 */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		/* Find the last entry with plen >= new->plen. */
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
958
/* rcu_read_lock must be held by the caller on the read side */
960
/* Walk the trie for an exact match on 'key'; return the leaf or NULL.
 * Skipped bits are verified with tkey_sub_equals() along the way, and
 * the full key is re-checked at the leaf (path compression means the
 * walk alone is not proof of a match).  Caller holds rcu_read_lock()
 * or the RTNL.
 */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
	n = rcu_dereference_check(t->trie,
				  rcu_read_lock_held() ||
				  lockdep_rtnl_is_held());

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			pos = tn->pos + tn->bits;
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}
994
/* After an insert/remove below tn, walk back up to the root, resizing
 * each tnode and re-linking the (possibly replaced) child into its
 * parent.  The root pointer is republished via rcu_assign_pointer()
 * whenever the top node changes.  Caller holds the RTNL.
 */
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	key = tn->key;

	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
				      (struct node *)tn, wasfull);

		/* resize() may have re-parented tn to NULL: new root */
		tp = node_parent((struct node *) tn);
		if (!tp)
			rcu_assign_pointer(t->trie, (struct node *)tn);

		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = (struct tnode *)resize(t, (struct tnode *)tn);

	rcu_assign_pointer(t->trie, (struct node *)tn);
	tnode_free_flush();
}
1028
1029/* only used from updater-side */
1030
/* Insert (key, plen) into the trie, creating a leaf and/or a linking
 * tnode as needed, then rebalance.  Returns the fib_alias list head of
 * the (new or existing) leaf_info, or NULL on allocation failure.
 * Caller holds the RTNL (updater side only).
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = t->trie;

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		/* Exact leaf exists; just add a leaf_info for this plen. */
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* Branch at the first bit where key and n differ. */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct node *)tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex,
				  (struct node *)tn);
		} else {
			rcu_assign_pointer(t->trie, (struct node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
1178
1179/*
1180 * Caller must hold RTNL.
1181 */
/*
 * Insert or replace a route described by @cfg in table @tb.
 * Returns 0 on success or a negative errno.
 *
 * Caller must hold RTNL; trie and alias-list updates are published with
 * RCU primitives so concurrent lockless readers always see a consistent
 * structure.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;	/* prefix length, 0..32 */
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	/* Reject destinations with host bits set beyond the prefix. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_scope == cfg->fc_scope &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				/* Replacing an entry with itself succeeds
				 * trivially; anything else here is EEXIST. */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			new_fa->fa_scope = cfg->fc_scope;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			/* Swap the alias in place; RCU readers see either
			 * the old or the new entry, never neither. */
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_scope = cfg->fc_scope;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) {
		/* No leaf for this key yet: create it (and any needed
		 * intermediate tnodes) before linking the alias. */
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
1348
1349/* should be called with rcu_read_lock */
/*
 * Try every prefix stored in leaf @l against flow @flp.
 * Returns the fib_semantic_match() result when it decides the lookup
 * (0 = match found, negative = error result such as unreachable),
 * or 1 to tell the caller to keep searching.
 */
static int check_leaf(struct trie *t, struct leaf *l,
		      t_key key,  const struct flowi *flp,
		      struct fib_result *res)
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;

	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
		int err;
		int plen = li->plen;
		__be32 mask = inet_make_mask(plen);

		/* Skip prefixes that do not cover the searched key. */
		if (l->key != (key & ntohl(mask)))
			continue;

		err = fib_semantic_match(&li->falh, flp, res, plen);

#ifdef CONFIG_IP_FIB_TRIE_STATS
		if (err <= 0)
			t->stats.semantic_match_passed++;
		else
			t->stats.semantic_match_miss++;
#endif
		if (err <= 0)
			return err;
	}

	return 1;
}
1380
/*
 * Longest-prefix-match lookup of flp->fl4_dst in table @tb.
 * Returns the check_leaf()/fib_semantic_match() result (<= 0) on a hit,
 * or 1 when no route matches.  Takes rcu_read_lock() itself, so no
 * caller-side locking is needed for the lookup.
 *
 * The search first tries an exact descent; on a dead end it backtracks
 * by chopping low-order index bits (prefix matching), walking back up
 * through parent tnodes as needed.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
		     struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int ret;
	struct node *n;
	struct tnode *pn;	/* tnode we are currently indexing into */
	int pos, bits;
	t_key key = ntohl(flp->fl4_dst);
	int chopped_off;	/* low bits of cindex zeroed while backtracking */
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;	/* shrinks once we give up on exact match */
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
		goto found;
	}

	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child_rcu(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
			if (ret > 0)
				goto backtrace;
			goto found;
		}

		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (see previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped off
		 * *are* zero.
		 */

		/* NOTA BENE: Checking only skipped bits
		   for the new node here */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
		 */

		node_prefix = mask_pfx(cn->key, cn->pos);
		key_prefix = mask_pfx(key, cn->pos);
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state directly.
		 */
		if (pref_mismatch) {
			/* Find the position of the first mismatching bit. */
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
				pref_mismatch = pref_mismatch << 1;
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}

		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero bits don't change the child key (cindex),
		 * skip over them in one go. */
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent_rcu((struct node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
1580
1581/*
1582 * Remove the leaf and return parent.
1583 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
	struct tnode *tp = node_parent((struct node *) l);

	pr_debug("entering trie_leaf_remove(%p)\n", l);

	if (tp) {
		/* Clear our slot in the parent, then let the rebalance
		 * pass collapse any now-degenerate tnodes. */
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, NULL);
		trie_rebalance(t, tp);
	} else
		/* The leaf was the root: the trie becomes empty. */
		rcu_assign_pointer(t->trie, NULL);

	free_leaf(l);
}
1599
1600/*
1601 * Caller must hold RTNL.
1602 */
/*
 * Delete the route described by @cfg from table @tb.
 * Returns 0 on success, -EINVAL for a malformed prefix, or -ESRCH when
 * no matching route exists.  Caller must hold RTNL (see comment above).
 */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	/* Host bits set beyond the prefix make the request invalid. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	/* Step back one entry so the _continue iterator starts at fa. */
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		/* Unspecified selector fields (type/scope/protocol/nexthop)
		 * act as wildcards. */
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_scope == cfg->fc_scope) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	l = fib_find_node(t, key);
	li = find_leaf_info(l, plen);

	list_del_rcu(&fa->fa_list);

	/* Drop the per-prefix-length bucket, then the leaf itself, once
	 * they become empty. */
	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	if (hlist_empty(&l->list))
		trie_leaf_remove(t, l);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
1683
/*
 * Remove all aliases on @head whose fib_info is marked RTNH_F_DEAD.
 * Returns the number of aliases removed.
 */
static int trie_flush_list(struct list_head *head)
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	/* _safe variant: entries are unlinked while iterating. */
	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}
	}
	return found;
}
1701
/*
 * Flush dead routes from every prefix bucket of leaf @l, freeing
 * buckets that become empty.  Returns the number of routes removed.
 * The caller decides whether the (possibly now empty) leaf is removed.
 */
static int trie_flush_leaf(struct leaf *l)
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *node, *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
		found += trie_flush_list(&li->falh);

		if (list_empty(&li->falh)) {
			hlist_del_rcu(&li->hlist);
			free_leaf_info(li);
		}
	}
	return found;
}
1719
1720/*
1721 * Scan for the next right leaf starting at node p->child[idx]
1722 * Since we have back pointer, no recursion necessary.
1723 */
1724static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1725{
1726	do {
1727		t_key idx;
1728
1729		if (c)
1730			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1731		else
1732			idx = 0;
1733
1734		while (idx < 1u << p->bits) {
1735			c = tnode_get_child_rcu(p, idx++);
1736			if (!c)
1737				continue;
1738
1739			if (IS_LEAF(c)) {
1740				prefetch(p->child[idx]);
1741				return (struct leaf *) c;
1742			}
1743
1744			/* Rescan start scanning in new node */
1745			p = (struct tnode *) c;
1746			idx = 0;
1747		}
1748
1749		/* Node empty, walk back up to parent */
1750		c = (struct node *) p;
1751	} while ( (p = node_parent_rcu(c)) != NULL);
1752
1753	return NULL; /* Root of trie */
1754}
1755
/*
 * Return the leftmost (first) leaf of the trie, or NULL if it is empty.
 * Safe under either rcu_read_lock() or RTNL, as expressed by the
 * rcu_dereference_check() condition.
 */
static struct leaf *trie_firstleaf(struct trie *t)
{
	struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
							rcu_read_lock_held() ||
							lockdep_rtnl_is_held());

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}
1770
1771static struct leaf *trie_nextleaf(struct leaf *l)
1772{
1773	struct node *c = (struct node *) l;
1774	struct tnode *p = node_parent_rcu(c);
1775
1776	if (!p)
1777		return NULL;	/* trie with just one leaf */
1778
1779	return leaf_walk_rcu(p, c);
1780}
1781
/*
 * Return the @index'th leaf of the trie (0-based), or NULL when the
 * trie holds fewer leaves than that.
 */
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *leaf;

	for (leaf = trie_firstleaf(t); leaf && index > 0; index--)
		leaf = trie_nextleaf(leaf);

	return leaf;
}
1791
1792
1793/*
1794 * Caller must hold RTNL.
1795 */
/*
 * Remove all routes marked dead from table @tb.
 * Returns the number of routes removed.  Caller must hold RTNL.
 */
int fib_table_flush(struct fib_table *tb)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct leaf *l, *ll = NULL;
	int found = 0;

	/* Removal of the previous leaf (ll) is deferred by one iteration
	 * so that trie_nextleaf() is never called on a freed leaf. */
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
		found += trie_flush_leaf(l);

		if (ll && hlist_empty(&ll->list))
			trie_leaf_remove(t, ll);
		ll = l;
	}

	/* Handle the last visited leaf. */
	if (ll && hlist_empty(&ll->list))
		trie_leaf_remove(t, ll);

	pr_debug("trie_flush found=%d\n", found);
	return found;
}
1816
/*
 * Choose a default route (key 0.0.0.0, plen 0) for @res among the
 * RTN_UNICAST aliases with matching scope, preferring a live nexthop
 * and caching the chosen position in tb->tb_default.
 * Takes rcu_read_lock() itself.
 */
void fib_table_select_default(struct fib_table *tb,
			      const struct flowi *flp,
			      struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

	rcu_read_lock();

	/* Default routes live in the leaf with key 0, prefix length 0. */
	l = fib_find_node(t, 0);
	if (!l)
		goto out;

	fa_head = get_fa_head(l, 0);
	if (!fa_head)
		goto out;

	if (list_empty(fa_head))
		goto out;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* Aliases are priority-ordered; stop at worse priority. */
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;

		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
		tb->tb_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
				tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		goto out;
	}
	/* Everything looked dead; fall back to the last resort, if any. */
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
out:
	rcu_read_unlock();
}
1889
/*
 * Dump all aliases of one prefix (@key/@plen) into a netlink message.
 * cb->args[5] is the resume cursor: aliases before it are skipped, and
 * it is updated so an interrupted dump restarts at the right entry.
 * Returns skb->len on completion or -1 when the skb filled up.
 */
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
	__be32 xkey = htonl(key);

	s_i = cb->args[5];
	i = 0;

	/* rcu_read_lock is held by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  fa->fa_scope,
				  xkey,
				  plen,
				  fa->fa_tos,
				  fa->fa_info, NLM_F_MULTI) < 0) {
			/* skb full: remember where to resume */
			cb->args[5] = i;
			return -1;
		}
		i++;
	}
	cb->args[5] = i;
	return skb->len;
}
1927
/*
 * Dump every prefix bucket of leaf @l into a netlink message.
 * cb->args[4] is the resume cursor over buckets; cb->args[5] (the
 * per-bucket alias cursor) is reset when we move past the resume point.
 * Returns skb->len on completion or -1 when the skb filled up.
 */
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
{
	struct leaf_info *li;
	struct hlist_node *node;
	int i, s_i;

	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is held by caller */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		if (i < s_i) {
			i++;
			continue;
		}

		/* Past the resume bucket: start its alias list from 0. */
		if (i > s_i)
			cb->args[5] = 0;

		if (list_empty(&li->falh))
			continue;

		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
			cb->args[4] = i;
			return -1;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}
1961
/*
 * Netlink dump of all routes in table @tb.
 * Resume state lives in cb->args: [2] = key of last dumped leaf,
 * [3] = number of leaves already dumped, [4]/[5] = positions inside
 * the current leaf.  Returns skb->len, or -1 when the skb filled up.
 */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct leaf *l;
	struct trie *t = (struct trie *) tb->tb_data;
	t_key key = cb->args[2];
	int count = cb->args[3];

	rcu_read_lock();
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	if (count == 0)
		l = trie_firstleaf(t);
	else {
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
		l = fib_find_node(t, key);
		if (!l)
			l = trie_leafindex(t, count);
	}

	while (l) {
		cb->args[2] = l->key;
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = count;
			rcu_read_unlock();
			return -1;
		}

		++count;
		l = trie_nextleaf(l);
		/* Leaf fully dumped: clear the intra-leaf cursors. */
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
	}
	cb->args[3] = count;
	rcu_read_unlock();

	return skb->len;
}
2003
2004void __init fib_hash_init(void)
2005{
2006	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2007					  sizeof(struct fib_alias),
2008					  0, SLAB_PANIC, NULL);
2009
2010	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2011					   max(sizeof(struct leaf),
2012					       sizeof(struct leaf_info)),
2013					   0, SLAB_PANIC, NULL);
2014}
2015
2016
/* Create a trie-backed table; keeps the legacy fib_hash_* name used by
 * the generic FIB init code until it grows a more generic name. */
2018struct fib_table *fib_hash_table(u32 id)
2019{
2020	struct fib_table *tb;
2021	struct trie *t;
2022
2023	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2024		     GFP_KERNEL);
2025	if (tb == NULL)
2026		return NULL;
2027
2028	tb->tb_id = id;
2029	tb->tb_default = -1;
2030
2031	t = (struct trie *) tb->tb_data;
2032	memset(t, 0, sizeof(*t));
2033
2034	if (id == RT_TABLE_LOCAL)
2035		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2036
2037	return tb;
2038}
2039
2040#ifdef CONFIG_PROC_FS
2041/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;	/* seq_file per-net glue; must stay first */
	struct fib_table *tb;		/* table currently being walked */
	struct tnode *tnode;		/* tnode whose children we are scanning */
	unsigned index;			/* next child index to visit in tnode */
	unsigned depth;			/* depth of the current node in the trie */
};
2049
/*
 * Advance the depth-first walk by one node (pre-order: tnodes are
 * returned before their children).  Returns the next node, or NULL
 * when the whole trie has been visited.
 */
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct node *n = tnode_get_child_rcu(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* stay at this level, resume at next slot */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent_rcu((struct node *)tn);
	if (p) {
		/* resume in the parent just after our own slot */
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
2094
2095static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2096				       struct trie *t)
2097{
2098	struct node *n;
2099
2100	if (!t)
2101		return NULL;
2102
2103	n = rcu_dereference(t->trie);
2104	if (!n)
2105		return NULL;
2106
2107	if (IS_TNODE(n)) {
2108		iter->tnode = (struct tnode *) n;
2109		iter->index = 0;
2110		iter->depth = 1;
2111	} else {
2112		iter->tnode = NULL;
2113		iter->index = 0;
2114		iter->depth = 0;
2115	}
2116
2117	return n;
2118}
2119
/*
 * Walk the whole trie once and fill @s with structural statistics:
 * leaf/tnode/prefix counts, depth figures, per-size tnode histogram
 * and the number of empty child slots.
 */
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct node *n;
	struct fib_trie_iter iter;

	memset(s, 0, sizeof(*s));

	rcu_read_lock();
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
		if (IS_LEAF(n)) {
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;
			struct hlist_node *tmp;

			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;

			/* one leaf_info per prefix length stored here */
			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
				++s->prefixes;
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
			if (tn->bits < MAX_STAT_DEPTH)
				s->nodesizes[tn->bits]++;

			/* count unused child slots */
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
		}
	}
	rcu_read_unlock();
}
2156
2157/*
2158 *	This outputs /proc/net/fib_triestats
2159 */
2160static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2161{
2162	unsigned i, max, pointers, bytes, avdepth;
2163
2164	if (stat->leaves)
2165		avdepth = stat->totdepth*100 / stat->leaves;
2166	else
2167		avdepth = 0;
2168
2169	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2170		   avdepth / 100, avdepth % 100);
2171	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2172
2173	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2174	bytes = sizeof(struct leaf) * stat->leaves;
2175
2176	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2177	bytes += sizeof(struct leaf_info) * stat->prefixes;
2178
2179	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2180	bytes += sizeof(struct tnode) * stat->tnodes;
2181
2182	max = MAX_STAT_DEPTH;
2183	while (max > 0 && stat->nodesizes[max-1] == 0)
2184		max--;
2185
2186	pointers = 0;
2187	for (i = 1; i <= max; i++)
2188		if (stat->nodesizes[i] != 0) {
2189			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2190			pointers += (1<<i) * stat->nodesizes[i];
2191		}
2192	seq_putc(seq, '\n');
2193	seq_printf(seq, "\tPointers: %u\n", pointers);
2194
2195	bytes += sizeof(struct node *) * pointers;
2196	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2197	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2198}
2199
2200#ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * Append the runtime lookup counters (gets, backtracks, match
 * outcomes) to the fib_triestats output.  Only compiled in when
 * CONFIG_IP_FIB_TRIE_STATS is enabled.
 */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
}
2215#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2216
2217static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2218{
2219	if (tb->tb_id == RT_TABLE_LOCAL)
2220		seq_puts(seq, "Local:\n");
2221	else if (tb->tb_id == RT_TABLE_MAIN)
2222		seq_puts(seq, "Main:\n");
2223	else
2224		seq_printf(seq, "Id %d:\n", tb->tb_id);
2225}
2226
2227
/*
 * seq_file show callback for /proc/net/fib_triestats: print structure
 * sizes, then per-table statistics for every FIB table in this netns.
 */
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
		   sizeof(struct leaf), sizeof(struct tnode));

	/* Walk every hash chain of the per-netns table hash. */
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}

	return 0;
}
2262
/* open() handler for /proc/net/fib_triestats (single-shot, per-net). */
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}
2267
/* File operations backing /proc/net/fib_triestats. */
static const struct file_operations fib_triestat_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = single_release_net,
};
2275
/*
 * Position-based lookup for seq_file: walk every table of the netns,
 * counting nodes in depth-first order, and return the node at offset
 * @pos (setting iter->tb accordingly), or NULL when past the end.
 */
static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct node *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}
2303
/*
 * seq_file start callback: takes rcu_read_lock() for the whole
 * iteration (released in fib_trie_seq_stop) and seeks to *pos.
 */
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}
2310
/*
 * seq_file next callback: continue inside the current table, then on
 * to the remaining tables in the same hash chain, then to the
 * following hash chains.  Returns NULL when everything is exhausted.
 */
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct node *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}
2350
2351static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2352	__releases(RCU)
2353{
2354	rcu_read_unlock();
2355}
2356
/* Emit three spaces per level of trie depth for the pretty-printer. */
static void seq_indent(struct seq_file *seq, int depth)
{
	int i;

	for (i = 0; i < depth; i++)
		seq_puts(seq, "   ");
}
2361
2362static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2363{
2364	switch (s) {
2365	case RT_SCOPE_UNIVERSE: return "universe";
2366	case RT_SCOPE_SITE:	return "site";
2367	case RT_SCOPE_LINK:	return "link";
2368	case RT_SCOPE_HOST:	return "host";
2369	case RT_SCOPE_NOWHERE:	return "nowhere";
2370	default:
2371		snprintf(buf, len, "scope=%d", s);
2372		return buf;
2373	}
2374}
2375
/* Printable names for the RTN_* route types, indexed by type value. */
static const char *const rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2390
2391static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2392{
2393	if (t < __RTN_MAX && rtn_type_names[t])
2394		return rtn_type_names[t];
2395	snprintf(buf, len, "type %u", t);
2396	return buf;
2397}
2398
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct node *n = v;

	/* A node without a parent is the root of its trie, i.e. the
	 * start of a new table: print the table banner first. */
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		/* Internal node: "+--" with the masked prefix, followed by
		 * the tnode bookkeeping (pos, bits, full/empty children). */
		struct tnode *tn = (struct tnode *) n;
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
			   tn->empty_children);

	} else {
		/* Leaf: "|--" with the key, then one line per
		 * (prefix length, alias) pair hanging off the leaf. */
		struct leaf *l = (struct leaf *) n;
		struct leaf_info *li;
		struct hlist_node *node;
		__be32 val = htonl(l->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, "  |-- %pI4\n", &val);

		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				/* TOS is only shown when non-zero */
				if (fa->fa_tos)
					seq_printf(seq, " tos=%d", fa->fa_tos);
				seq_putc(seq, '\n');
			}
		}
	}

	return 0;
}
2447
/* seq_file iterator hooks for /proc/net/fib_trie */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
2454
2455static int fib_trie_seq_open(struct inode *inode, struct file *file)
2456{
2457	return seq_open_net(inode, file, &fib_trie_seq_ops,
2458			    sizeof(struct fib_trie_iter));
2459}
2460
/* file_operations for /proc/net/fib_trie */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2468
/*
 * Iterator state for /proc/net/route, kept across sequential reads so
 * a large route dump does not restart from the first leaf each chunk.
 */
struct fib_route_iter {
	struct seq_net_private p;	/* net-namespace boilerplate */
	struct trie *main_trie;		/* trie of RT_TABLE_MAIN */
	loff_t	pos;			/* logical position of cached leaf */
	t_key	key;			/* key of last leaf returned */
};
2475
2476static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2477{
2478	struct leaf *l = NULL;
2479	struct trie *t = iter->main_trie;
2480
2481	/* use cache location of last found key */
2482	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2483		pos -= iter->pos;
2484	else {
2485		iter->pos = 0;
2486		l = trie_firstleaf(t);
2487	}
2488
2489	while (l && pos-- > 0) {
2490		iter->pos++;
2491		l = trie_nextleaf(l);
2492	}
2493
2494	if (l)
2495		iter->key = pos;	/* remember it */
2496	else
2497		iter->pos = 0;		/* forget it */
2498
2499	return l;
2500}
2501
2502static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2503	__acquires(RCU)
2504{
2505	struct fib_route_iter *iter = seq->private;
2506	struct fib_table *tb;
2507
2508	rcu_read_lock();
2509	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2510	if (!tb)
2511		return NULL;
2512
2513	iter->main_trie = (struct trie *) tb->tb_data;
2514	if (*pos == 0)
2515		return SEQ_START_TOKEN;
2516	else
2517		return fib_route_get_idx(iter, *pos - 1);
2518}
2519
2520static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2521{
2522	struct fib_route_iter *iter = seq->private;
2523	struct leaf *l = v;
2524
2525	++*pos;
2526	if (v == SEQ_START_TOKEN) {
2527		iter->pos = 0;
2528		l = trie_firstleaf(iter->main_trie);
2529	} else {
2530		iter->pos++;
2531		l = trie_nextleaf(l);
2532	}
2533
2534	if (l)
2535		iter->key = l->key;
2536	else
2537		iter->pos = 0;
2538	return l;
2539}
2540
2541static void fib_route_seq_stop(struct seq_file *seq, void *v)
2542	__releases(RCU)
2543{
2544	rcu_read_unlock();
2545}
2546
2547static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2548{
2549	static unsigned type2flags[RTN_MAX + 1] = {
2550		[7] = RTF_REJECT, [8] = RTF_REJECT,
2551	};
2552	unsigned flags = type2flags[type];
2553
2554	if (fi && fi->fib_nh->nh_gw)
2555		flags |= RTF_GATEWAY;
2556	if (mask == htonl(0xFFFFFFFF))
2557		flags |= RTF_HOST;
2558	flags |= RTF_UP;
2559	return flags;
2560}
2561
2562/*
2563 *	This outputs /proc/net/route.
2564 *	The format of the file is not supposed to be changed
2565 * 	and needs to be same as fib_hash output to avoid breaking
2566 *	legacy utilities
2567 */
/*
 * Print one leaf of the main trie as /proc/net/route lines, one line
 * per (prefix length, alias) pair.  SEQ_START_TOKEN stands for the
 * column-header line.
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct leaf *l = v;
	struct leaf_info *li;
	struct hlist_node *node;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		struct fib_alias *fa;
		__be32 mask, prefix;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
			int len;

			/* broadcast/multicast routes were never reported
			 * by fib_hash; keep output parity */
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			if (fi)
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3, &len);
			else
				/* route without fib_info: dummy columns */
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0, &len);

			/* %n captured the line length above; pad every
			 * record to the historical fixed width of 127 */
			seq_printf(seq, "%*s\n", 127 - len, "");
		}
	}

	return 0;
}
2623
/* seq_file iterator hooks for /proc/net/route */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
	.show   = fib_route_seq_show,
};
2630
2631static int fib_route_seq_open(struct inode *inode, struct file *file)
2632{
2633	return seq_open_net(inode, file, &fib_route_seq_ops,
2634			    sizeof(struct fib_route_iter));
2635}
2636
/* file_operations for /proc/net/route */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2644
2645int __net_init fib_proc_init(struct net *net)
2646{
2647	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2648		goto out1;
2649
2650	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2651				  &fib_triestat_fops))
2652		goto out2;
2653
2654	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2655		goto out3;
2656
2657	return 0;
2658
2659out3:
2660	proc_net_remove(net, "fib_triestat");
2661out2:
2662	proc_net_remove(net, "fib_trie");
2663out1:
2664	return -ENOMEM;
2665}
2666
2667void __net_exit fib_proc_exit(struct net *net)
2668{
2669	proc_net_remove(net, "fib_trie");
2670	proc_net_remove(net, "fib_triestat");
2671	proc_net_remove(net, "route");
2672}
2673
2674#endif /* CONFIG_PROC_FS */
2675