1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		ROUTE - implementation of the IP router.
7 *
8 * Version:	$Id: route.c,v 1.1.1.1 2007/08/03 18:53:51 Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 *		Alan Cox	:	Verify area fixes.
18 *		Alan Cox	:	cli() protects routing changes
19 *		Rui Oliveira	:	ICMP routing table updates
20 *		(rco@di.uminho.pt)	Routing table insertion and update
21 *		Linus Torvalds	:	Rewrote bits to be sensible
22 *		Alan Cox	:	Added BSD route gw semantics
23 *		Alan Cox	:	Super /proc >4K
24 *		Alan Cox	:	MTU in route table
25 *		Alan Cox	: 	MSS actually. Also added the window
26 *					clamper.
27 *		Sam Lantinga	:	Fixed route matching in rt_del()
28 *		Alan Cox	:	Routing cache support.
29 *		Alan Cox	:	Removed compatibility cruft.
30 *		Alan Cox	:	RTF_REJECT support.
31 *		Alan Cox	:	TCP irtt support.
32 *		Jonathan Naylor	:	Added Metric support.
33 *	Miquel van Smoorenburg	:	BSD API fixes.
34 *	Miquel van Smoorenburg	:	Metrics.
35 *		Alan Cox	:	Use __u32 properly
36 *		Alan Cox	:	Aligned routing errors more closely with BSD
37 *					our system is still very different.
38 *		Alan Cox	:	Faster /proc handling
39 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40 *					routing caches and better behaviour.
41 *
42 *		Olaf Erb	:	irtt wasn't being copied right.
43 *		Bjorn Ekwall	:	Kerneld route support.
44 *		Alan Cox	:	Multicast fixed (I hope)
45 * 		Pavel Krauz	:	Limited broadcast fixed
46 *		Mike McLagan	:	Routing by source
47 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48 *					route.c and rewritten from scratch.
49 *		Andi Kleen	:	Load-limit warning messages.
50 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54 *		Marc Boucher	:	routing by fwmark
55 *	Robert Olsson		:	Added rt_cache statistics
56 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60 *
61 *		This program is free software; you can redistribute it and/or
62 *		modify it under the terms of the GNU General Public License
63 *		as published by the Free Software Foundation; either version
64 *		2 of the License, or (at your option) any later version.
65 */
66
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
73#include <linux/mm.h>
74#include <linux/bootmem.h>
75#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
84#include <linux/skbuff.h>
85#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
94#include <net/protocol.h>
95#include <net/ip.h>
96#include <net/route.h>
97#include <net/inetpeer.h>
98#include <net/sock.h>
99#include <net/ip_fib.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/ip_mp_alg.h>
105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
111#define RT_FL_TOS(oldflp) \
112    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114#define IP_MAX_MTU	0xFFF0
115
116#define RT_GC_TIMEOUT (300*HZ)
117
118static int ip_rt_min_delay		= 2 * HZ;
119static int ip_rt_max_delay		= 10 * HZ;
120static int ip_rt_max_size;
121static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
122static int ip_rt_gc_interval		= 60 * HZ;
123static int ip_rt_gc_min_interval	= HZ / 2;
124static int ip_rt_redirect_number	= 9;
125static int ip_rt_redirect_load		= HZ / 50;
126static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost		= HZ;
128static int ip_rt_error_burst		= 5 * HZ;
129static int ip_rt_gc_elasticity		= 8;
130static int ip_rt_mtu_expires		= 10 * 60 * HZ;
131static int ip_rt_min_pmtu		= 512 + 20 + 20;
132static int ip_rt_min_advmss		= 256;
133static int ip_rt_secret_interval	= 10 * 60 * HZ;
134static unsigned long rt_deadline;
135
136#define RTprint(a...)	printk(KERN_DEBUG a)
137
138static struct timer_list rt_flush_timer;
139static struct timer_list rt_periodic_timer;
140static struct timer_list rt_secret_timer;
141
142/*
143 *	Interface to generic destination cache.
144 */
145
146static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147static void		 ipv4_dst_destroy(struct dst_entry *dst);
148static void		 ipv4_dst_ifdown(struct dst_entry *dst,
149					 struct net_device *dev, int how);
150static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151static void		 ipv4_link_failure(struct sk_buff *skb);
152static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153static int rt_garbage_collect(void);
154
155
156static struct dst_ops ipv4_dst_ops = {
157	.family =		AF_INET,
158	.protocol =		__constant_htons(ETH_P_IP),
159	.gc =			rt_garbage_collect,
160	.check =		ipv4_dst_check,
161	.destroy =		ipv4_dst_destroy,
162	.ifdown =		ipv4_dst_ifdown,
163	.negative_advice =	ipv4_negative_advice,
164	.link_failure =		ipv4_link_failure,
165	.update_pmtu =		ip_rt_update_pmtu,
166	.entry_size =		sizeof(struct rtable),
167};
168
169#define ECN_OR_COST(class)	TC_PRIO_##class
170
171__u8 ip_tos2prio[16] = {
172	TC_PRIO_BESTEFFORT,
173	ECN_OR_COST(FILLER),
174	TC_PRIO_BESTEFFORT,
175	ECN_OR_COST(BESTEFFORT),
176	TC_PRIO_BULK,
177	ECN_OR_COST(BULK),
178	TC_PRIO_BULK,
179	ECN_OR_COST(BULK),
180	TC_PRIO_INTERACTIVE,
181	ECN_OR_COST(INTERACTIVE),
182	TC_PRIO_INTERACTIVE,
183	ECN_OR_COST(INTERACTIVE),
184	TC_PRIO_INTERACTIVE_BULK,
185	ECN_OR_COST(INTERACTIVE_BULK),
186	TC_PRIO_INTERACTIVE_BULK,
187	ECN_OR_COST(INTERACTIVE_BULK)
188};
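
/*
 * Illustrative note: rt_tos2priority() in <net/route.h> indexes this table
 * with IPTOS_TOS(tos) >> 1, so with the entries above:
 *
 *	rt_tos2priority(0x00)             -> TC_PRIO_BESTEFFORT
 *	rt_tos2priority(IPTOS_THROUGHPUT) -> TC_PRIO_BULK          (0x08 >> 1 == 4)
 *	rt_tos2priority(IPTOS_LOWDELAY)   -> TC_PRIO_INTERACTIVE   (0x10 >> 1 == 8)
 */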
189
190
191/*
192 * Route cache.
193 */
194
195/* The locking scheme is rather straightforward:
196 *
197 * 1) Read-Copy Update protects the buckets of the central route hash.
198 * 2) Only writers remove entries, and they hold the lock
199 *    as they look at rtable reference counts.
200 * 3) Only readers acquire references to rtable entries;
201 *    they do so with atomic increments and with the
202 *    lock held.
203 */
204
205struct rt_hash_bucket {
206	struct rtable	*chain;
207};
208#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
209	defined(CONFIG_PROVE_LOCKING)
210/*
211 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
212 * The size of this table is a power of two and depends on the number of CPUs.
213 * (On lockdep builds spinlock_t is quite big, so keep the size down there.)
214 */
215#ifdef CONFIG_LOCKDEP
216# define RT_HASH_LOCK_SZ	256
217#else
218# if NR_CPUS >= 32
219#  define RT_HASH_LOCK_SZ	4096
220# elif NR_CPUS >= 16
221#  define RT_HASH_LOCK_SZ	2048
222# elif NR_CPUS >= 8
223#  define RT_HASH_LOCK_SZ	1024
224# elif NR_CPUS >= 4
225#  define RT_HASH_LOCK_SZ	512
226# else
227#  define RT_HASH_LOCK_SZ	256
228# endif
229#endif
230
231static spinlock_t	*rt_hash_locks;
232# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
233# define rt_hash_lock_init()	{ \
234		int i; \
235		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
236		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
237		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
238			spin_lock_init(&rt_hash_locks[i]); \
239		}
240#else
241# define rt_hash_lock_addr(slot) NULL
242# define rt_hash_lock_init()
243#endif
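
/*
 * Illustrative sketch of how the shared lock table is used on SMP: buckets
 * whose indices are congruent modulo RT_HASH_LOCK_SZ share one spinlock,
 * e.g. with RT_HASH_LOCK_SZ == 256, buckets 44 and 300 map to the same
 * lock (300 & 255 == 44):
 *
 *	spin_lock_bh(rt_hash_lock_addr(300));
 *	... modify rt_hash_table[300].chain ...
 *	spin_unlock_bh(rt_hash_lock_addr(300));
 *
 * On UP builds rt_hash_lock_addr() is NULL and the calls effectively just
 * disable bottom halves.
 */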
244
245static struct rt_hash_bucket 	*rt_hash_table;
246static unsigned			rt_hash_mask;
247static int			rt_hash_log;
248static unsigned int		rt_hash_rnd;
249
250static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
251#define RT_CACHE_STAT_INC(field) \
252	(__raw_get_cpu_var(rt_cache_stat).field++)
253
254static int rt_intern_hash(unsigned hash, struct rtable *rth,
255				struct rtable **res);
256
257static unsigned int rt_hash_code(u32 daddr, u32 saddr)
258{
259	return (jhash_2words(daddr, saddr, rt_hash_rnd)
260		& rt_hash_mask);
261}
262
263#define rt_hash(daddr, saddr, idx) \
264	rt_hash_code((__force u32)(__be32)(daddr),\
265		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
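
/*
 * Example use (illustrative): input routes hash the destination, source
 * and incoming interface index, e.g.
 *
 *	hash = rt_hash(iph->daddr, iph->saddr, skb->dev->ifindex);
 *
 * output routes pass the output interface index instead, and the PMTU
 * lookup in ip_rt_frag_needed() passes 0.  The interface index is shifted
 * left by 5 and XORed into saddr before the jhash so that routes through
 * different interfaces land in different buckets.
 */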
266
267#ifdef CONFIG_PROC_FS
268struct rt_cache_iter_state {
269	int bucket;
270};
271
272static struct rtable *rt_cache_get_first(struct seq_file *seq)
273{
274	struct rtable *r = NULL;
275	struct rt_cache_iter_state *st = seq->private;
276
277	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
278		rcu_read_lock_bh();
279		r = rt_hash_table[st->bucket].chain;
280		if (r)
281			break;
282		rcu_read_unlock_bh();
283	}
284	return r;
285}
286
287static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
288{
289	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
290
291	r = r->u.dst.rt_next;
292	while (!r) {
293		rcu_read_unlock_bh();
294		if (--st->bucket < 0)
295			break;
296		rcu_read_lock_bh();
297		r = rt_hash_table[st->bucket].chain;
298	}
299	return r;
300}
301
302static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
303{
304	struct rtable *r = rt_cache_get_first(seq);
305
306	if (r)
307		while (pos && (r = rt_cache_get_next(seq, r)))
308			--pos;
309	return pos ? NULL : r;
310}
311
312static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
313{
314	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
315}
316
317static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
318{
319	struct rtable *r = NULL;
320
321	if (v == SEQ_START_TOKEN)
322		r = rt_cache_get_first(seq);
323	else
324		r = rt_cache_get_next(seq, v);
325	++*pos;
326	return r;
327}
328
329static void rt_cache_seq_stop(struct seq_file *seq, void *v)
330{
331	if (v && v != SEQ_START_TOKEN)
332		rcu_read_unlock_bh();
333}
334
335static int rt_cache_seq_show(struct seq_file *seq, void *v)
336{
337	if (v == SEQ_START_TOKEN)
338		seq_printf(seq, "%-127s\n",
339			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
340			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
341			   "HHUptod\tSpecDst");
342	else {
343		struct rtable *r = v;
344		char temp[256];
345
346		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
347			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
348			r->u.dst.dev ? r->u.dst.dev->name : "*",
349			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
350			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
351			r->u.dst.__use, 0, (unsigned long)r->rt_src,
352			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
353			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
354			dst_metric(&r->u.dst, RTAX_WINDOW),
355			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
356			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
357			r->fl.fl4_tos,
358			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
359			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
360				       dev_queue_xmit) : 0,
361			r->rt_spec_dst);
362		seq_printf(seq, "%-127s\n", temp);
363	}
364	return 0;
365}
366
367static const struct seq_operations rt_cache_seq_ops = {
368	.start  = rt_cache_seq_start,
369	.next   = rt_cache_seq_next,
370	.stop   = rt_cache_seq_stop,
371	.show   = rt_cache_seq_show,
372};
373
374static int rt_cache_seq_open(struct inode *inode, struct file *file)
375{
376	struct seq_file *seq;
377	int rc = -ENOMEM;
378	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
379
380	if (!s)
381		goto out;
382	rc = seq_open(file, &rt_cache_seq_ops);
383	if (rc)
384		goto out_kfree;
385	seq          = file->private_data;
386	seq->private = s;
387	memset(s, 0, sizeof(*s));
388out:
389	return rc;
390out_kfree:
391	kfree(s);
392	goto out;
393}
394
395static const struct file_operations rt_cache_seq_fops = {
396	.owner	 = THIS_MODULE,
397	.open	 = rt_cache_seq_open,
398	.read	 = seq_read,
399	.llseek	 = seq_lseek,
400	.release = seq_release_private,
401};
402
403
404static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
405{
406	int cpu;
407
408	if (*pos == 0)
409		return SEQ_START_TOKEN;
410
411	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
412		if (!cpu_possible(cpu))
413			continue;
414		*pos = cpu+1;
415		return &per_cpu(rt_cache_stat, cpu);
416	}
417	return NULL;
418}
419
420static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
421{
422	int cpu;
423
424	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
425		if (!cpu_possible(cpu))
426			continue;
427		*pos = cpu+1;
428		return &per_cpu(rt_cache_stat, cpu);
429	}
430	return NULL;
431
432}
433
434static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
435{
436
437}
438
439static int rt_cpu_seq_show(struct seq_file *seq, void *v)
440{
441	struct rt_cache_stat *st = v;
442
443	if (v == SEQ_START_TOKEN) {
444		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
445		return 0;
446	}
447
448	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
449		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
450		   atomic_read(&ipv4_dst_ops.entries),
451		   st->in_hit,
452		   st->in_slow_tot,
453		   st->in_slow_mc,
454		   st->in_no_route,
455		   st->in_brd,
456		   st->in_martian_dst,
457		   st->in_martian_src,
458
459		   st->out_hit,
460		   st->out_slow_tot,
461		   st->out_slow_mc,
462
463		   st->gc_total,
464		   st->gc_ignored,
465		   st->gc_goal_miss,
466		   st->gc_dst_overflow,
467		   st->in_hlist_search,
468		   st->out_hlist_search
469		);
470	return 0;
471}
472
473static const struct seq_operations rt_cpu_seq_ops = {
474	.start  = rt_cpu_seq_start,
475	.next   = rt_cpu_seq_next,
476	.stop   = rt_cpu_seq_stop,
477	.show   = rt_cpu_seq_show,
478};
479
480
481static int rt_cpu_seq_open(struct inode *inode, struct file *file)
482{
483	return seq_open(file, &rt_cpu_seq_ops);
484}
485
486static const struct file_operations rt_cpu_seq_fops = {
487	.owner	 = THIS_MODULE,
488	.open	 = rt_cpu_seq_open,
489	.read	 = seq_read,
490	.llseek	 = seq_lseek,
491	.release = seq_release,
492};
493
494#endif /* CONFIG_PROC_FS */
495
496static __inline__ void rt_free(struct rtable *rt)
497{
498	multipath_remove(rt);
499	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
500}
501
502static __inline__ void rt_drop(struct rtable *rt)
503{
504	multipath_remove(rt);
505	ip_rt_put(rt);
506	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
507}
508
509static __inline__ int rt_fast_clean(struct rtable *rth)
510{
511	/* Kill broadcast/multicast entries very aggressively, if they
512	   collide in the hash table with more useful entries */
513	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
514		rth->fl.iif && rth->u.dst.rt_next;
515}
516
517static __inline__ int rt_valuable(struct rtable *rth)
518{
519	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
520		rth->u.dst.expires;
521}
522
523static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
524{
525	unsigned long age;
526	int ret = 0;
527
528	if (atomic_read(&rth->u.dst.__refcnt))
529		goto out;
530
531	ret = 1;
532	if (rth->u.dst.expires &&
533	    time_after_eq(jiffies, rth->u.dst.expires))
534		goto out;
535
536	age = jiffies - rth->u.dst.lastuse;
537	ret = 0;
538	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
539	    (age <= tmo2 && rt_valuable(rth)))
540		goto out;
541	ret = 1;
542out:	return ret;
543}
544
545/* Bits of score are:
546 * 31: very valuable
547 * 30: not quite useless
548 * 29..0: usage counter
549 */
550static inline u32 rt_score(struct rtable *rt)
551{
552	u32 score = jiffies - rt->u.dst.lastuse;
553
554	score = ~score & ~(3<<30);
555
556	if (rt_valuable(rt))
557		score |= (1<<31);
558
559	if (!rt->fl.iif ||
560	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
561		score |= (1<<30);
562
563	return score;
564}
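
/*
 * Worked example (illustrative): the low 30 bits hold the bitwise
 * complement of the entry's age, so a recently used entry scores higher
 * than a stale one.  An unreferenced input broadcast entry that has sat
 * idle for a while therefore scores below a fresh output unicast entry
 * (which also earns bit 30), and rt_intern_hash() below treats the
 * lowest-scoring entry in a chain as its eviction candidate.
 */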
565
566static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
567{
568	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
569		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
570		(fl1->mark ^ fl2->mark) |
571		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
572		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
573		(fl1->oif ^ fl2->oif) |
574		(fl1->iif ^ fl2->iif)) == 0;
575}
576
577#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
578static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
579						struct rtable *expentry,
580						int *removed_count)
581{
582	int passedexpired = 0;
583	struct rtable **nextstep = NULL;
584	struct rtable **rthp = chain_head;
585	struct rtable *rth;
586
587	if (removed_count)
588		*removed_count = 0;
589
590	while ((rth = *rthp) != NULL) {
591		if (rth == expentry)
592			passedexpired = 1;
593
594		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
595		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
596			if (*rthp == expentry) {
597				*rthp = rth->u.dst.rt_next;
598				continue;
599			} else {
600				*rthp = rth->u.dst.rt_next;
601				rt_free(rth);
602				if (removed_count)
603					++(*removed_count);
604			}
605		} else {
606			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
607			    passedexpired && !nextstep)
608				nextstep = &rth->u.dst.rt_next;
609
610			rthp = &rth->u.dst.rt_next;
611		}
612	}
613
614	rt_free(expentry);
615	if (removed_count)
616		++(*removed_count);
617
618	return nextstep;
619}
620#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
621
622
623/* This runs via a timer and thus is always in BH context. */
624static void rt_check_expire(unsigned long dummy)
625{
626	static unsigned int rover;
627	unsigned int i = rover, goal;
628	struct rtable *rth, **rthp;
629	unsigned long now = jiffies;
630	u64 mult;
631
632	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
633	if (ip_rt_gc_timeout > 1)
634		do_div(mult, ip_rt_gc_timeout);
635	goal = (unsigned int)mult;
636	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
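	/*
	 * Worked example with the defaults above: ip_rt_gc_interval is
	 * 60*HZ and ip_rt_gc_timeout is 300*HZ, so goal comes out to
	 * (hash table size) / 5 buckets per run.  Since this timer
	 * re-arms every ip_rt_gc_interval, the whole table is walked
	 * roughly once per ip_rt_gc_timeout, i.e. about every five
	 * minutes.
	 */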
637	for (; goal > 0; goal--) {
638		unsigned long tmo = ip_rt_gc_timeout;
639
640		i = (i + 1) & rt_hash_mask;
641		rthp = &rt_hash_table[i].chain;
642
643		if (*rthp == 0)
644			continue;
645		spin_lock(rt_hash_lock_addr(i));
646		while ((rth = *rthp) != NULL) {
647			if (rth->u.dst.expires) {
648				/* Entry is expired even if it is in use */
649				if (time_before_eq(now, rth->u.dst.expires)) {
650					tmo >>= 1;
651					rthp = &rth->u.dst.rt_next;
652					continue;
653				}
654			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
655				tmo >>= 1;
656				rthp = &rth->u.dst.rt_next;
657				continue;
658			}
659
660			/* Cleanup aged off entries. */
661#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
662			/* remove all related balanced entries if necessary */
663			if (rth->u.dst.flags & DST_BALANCED) {
664				rthp = rt_remove_balanced_route(
665					&rt_hash_table[i].chain,
666					rth, NULL);
667				if (!rthp)
668					break;
669			} else {
670				*rthp = rth->u.dst.rt_next;
671				rt_free(rth);
672			}
673#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
674			*rthp = rth->u.dst.rt_next;
675			rt_free(rth);
676#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
677		}
678		spin_unlock(rt_hash_lock_addr(i));
679
680		/* Fallback loop breaker. */
681		if (time_after(jiffies, now))
682			break;
683	}
684	rover = i;
685	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
686}
687
688/* This can run from both BH and non-BH contexts, the latter
689 * in the case of a forced flush event.
690 */
691static void rt_run_flush(unsigned long dummy)
692{
693	int i;
694	struct rtable *rth, *next;
695
696	rt_deadline = 0;
697
698	get_random_bytes(&rt_hash_rnd, 4);
699
700	for (i = rt_hash_mask; i >= 0; i--) {
701		spin_lock_bh(rt_hash_lock_addr(i));
702		rth = rt_hash_table[i].chain;
703		if (rth)
704			rt_hash_table[i].chain = NULL;
705		spin_unlock_bh(rt_hash_lock_addr(i));
706
707		for (; rth; rth = next) {
708			next = rth->u.dst.rt_next;
709			rt_free(rth);
710		}
711	}
712}
713
714static DEFINE_SPINLOCK(rt_flush_lock);
715
716void rt_cache_flush(int delay)
717{
718	unsigned long now = jiffies;
719	int user_mode = !in_softirq();
720
721	if (delay < 0)
722		delay = ip_rt_min_delay;
723
724	/* flush existing multipath state */
725	multipath_flush();
726
727	spin_lock_bh(&rt_flush_lock);
728
729	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
730		long tmo = (long)(rt_deadline - now);
731
732		/* If the flush timer is already running
733		   and the flush request is not immediate (delay > 0):
734
735		   if the deadline has not been reached, prolong the timer to "delay",
736		   otherwise fire it at the deadline time.
737		 */
738
739		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
740			tmo = 0;
741
742		if (delay > tmo)
743			delay = tmo;
744	}
745
746	if (delay <= 0) {
747		spin_unlock_bh(&rt_flush_lock);
748		rt_run_flush(0);
749		return;
750	}
751
752	if (rt_deadline == 0)
753		rt_deadline = now + ip_rt_max_delay;
754
755	mod_timer(&rt_flush_timer, now+delay);
756	spin_unlock_bh(&rt_flush_lock);
757}
758
759static void rt_secret_rebuild(unsigned long dummy)
760{
761	unsigned long now = jiffies;
762
763	rt_cache_flush(0);
764	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
765}
766
767/*
768   Short description of GC goals.
769
770   We want to build an algorithm which keeps the routing cache
771   at some equilibrium point, where the number of aged-off entries
772   stays approximately equal to the number of newly generated ones.
773
774   The current expiration strength is the variable "expire".
775   We try to adjust it dynamically, so that when the network is idle
776   "expire" is large enough to keep plenty of warm entries around,
777   and when load increases it shrinks to limit the cache size.
778 */
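
/*
 * Worked example (illustrative sizes): in rt_garbage_collect() below, the
 * aggressive branch is entered only once the cache holds more than
 * ip_rt_gc_elasticity << rt_hash_log entries.  With the default elasticity
 * of 8 and, say, a 1024-bucket hash (rt_hash_log == 10), that threshold is
 * 8 * 1024 = 8192 cached routes, i.e. about eight entries per bucket.
 */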
779
780static int rt_garbage_collect(void)
781{
782	static unsigned long expire = RT_GC_TIMEOUT;
783	static unsigned long last_gc;
784	static int rover;
785	static int equilibrium;
786	struct rtable *rth, **rthp;
787	unsigned long now = jiffies;
788	int goal;
789
790	/*
791	 * Garbage collection is pretty expensive,
792	 * do not run it too frequently.
793	 */
794
795	RT_CACHE_STAT_INC(gc_total);
796
797	if (now - last_gc < ip_rt_gc_min_interval &&
798	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
799		RT_CACHE_STAT_INC(gc_ignored);
800		goto out;
801	}
802
803	/* Calculate the number of entries which we want to expire now. */
804	goal = atomic_read(&ipv4_dst_ops.entries) -
805		(ip_rt_gc_elasticity << rt_hash_log);
806	if (goal <= 0) {
807		if (equilibrium < ipv4_dst_ops.gc_thresh)
808			equilibrium = ipv4_dst_ops.gc_thresh;
809		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
810		if (goal > 0) {
811			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
812			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
813		}
814	} else {
815		/* We are in a dangerous area. Try to reduce the cache really
816		 * aggressively.
817		 */
818		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
819		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
820	}
821
822	if (now - last_gc >= ip_rt_gc_min_interval)
823		last_gc = now;
824
825	if (goal <= 0) {
826		equilibrium += goal;
827		goto work_done;
828	}
829
830	do {
831		int i, k;
832
833		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
834			unsigned long tmo = expire;
835
836			k = (k + 1) & rt_hash_mask;
837			rthp = &rt_hash_table[k].chain;
838			spin_lock_bh(rt_hash_lock_addr(k));
839			while ((rth = *rthp) != NULL) {
840				if (!rt_may_expire(rth, tmo, expire)) {
841					tmo >>= 1;
842					rthp = &rth->u.dst.rt_next;
843					continue;
844				}
845#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
846				/* remove all related balanced entries
847				 * if necessary
848				 */
849				if (rth->u.dst.flags & DST_BALANCED) {
850					int r;
851
852					rthp = rt_remove_balanced_route(
853						&rt_hash_table[k].chain,
854						rth,
855						&r);
856					goal -= r;
857					if (!rthp)
858						break;
859				} else {
860					*rthp = rth->u.dst.rt_next;
861					rt_free(rth);
862					goal--;
863				}
864#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
865				*rthp = rth->u.dst.rt_next;
866				rt_free(rth);
867				goal--;
868#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
869			}
870			spin_unlock_bh(rt_hash_lock_addr(k));
871			if (goal <= 0)
872				break;
873		}
874		rover = k;
875
876		if (goal <= 0)
877			goto work_done;
878
879		/* The goal was not achieved. We stop the process if:
880
881		   - expire was reduced to zero; otherwise expire is halved.
882		   - the table is not full.
883		   - we are called from interrupt context.
884		   - the jiffies check is just a fallback/debug loop breaker.
885		     We will not spin here for a long time in any case.
886		 */
887
888		RT_CACHE_STAT_INC(gc_goal_miss);
889
890		if (expire == 0)
891			break;
892
893		expire >>= 1;
894#if RT_CACHE_DEBUG >= 2
895		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
896				atomic_read(&ipv4_dst_ops.entries), goal, i);
897#endif
898
899		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
900			goto out;
901	} while (!in_softirq() && time_before_eq(jiffies, now));
902
903	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
904		goto out;
905	if (net_ratelimit())
906		printk(KERN_WARNING "dst cache overflow\n");
907	RT_CACHE_STAT_INC(gc_dst_overflow);
908	return 1;
909
910work_done:
911	expire += ip_rt_gc_min_interval;
912	if (expire > ip_rt_gc_timeout ||
913	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
914		expire = ip_rt_gc_timeout;
915#if RT_CACHE_DEBUG >= 2
916	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
917			atomic_read(&ipv4_dst_ops.entries), goal, rover);
918#endif
919out:	return 0;
920}
921
922static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
923{
924	struct rtable	*rth, **rthp;
925	unsigned long	now;
926	struct rtable *cand, **candp;
927	u32 		min_score;
928	int		chain_length;
929	int attempts = !in_softirq();
930
931restart:
932	chain_length = 0;
933	min_score = ~(u32)0;
934	cand = NULL;
935	candp = NULL;
936	now = jiffies;
937
938	rthp = &rt_hash_table[hash].chain;
939
940	spin_lock_bh(rt_hash_lock_addr(hash));
941	while ((rth = *rthp) != NULL) {
942#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
943		if (!(rth->u.dst.flags & DST_BALANCED) &&
944		    compare_keys(&rth->fl, &rt->fl)) {
945#else
946		if (compare_keys(&rth->fl, &rt->fl)) {
947#endif
948			/* Put it first */
949			*rthp = rth->u.dst.rt_next;
950			/*
951			 * Since lookup is lockfree, the deletion
952			 * must be visible to another weakly ordered CPU before
953			 * the insertion at the start of the hash chain.
954			 */
955			rcu_assign_pointer(rth->u.dst.rt_next,
956					   rt_hash_table[hash].chain);
957			/*
958			 * Since lookup is lockfree, the update writes
959			 * must be ordered for consistency on SMP.
960			 */
961			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
962
963			rth->u.dst.__use++;
964			dst_hold(&rth->u.dst);
965			rth->u.dst.lastuse = now;
966			spin_unlock_bh(rt_hash_lock_addr(hash));
967
968			rt_drop(rt);
969			*rp = rth;
970			return 0;
971		}
972
973		if (!atomic_read(&rth->u.dst.__refcnt)) {
974			u32 score = rt_score(rth);
975
976			if (score <= min_score) {
977				cand = rth;
978				candp = rthp;
979				min_score = score;
980			}
981		}
982
983		chain_length++;
984
985		rthp = &rth->u.dst.rt_next;
986	}
987
988	if (cand) {
989		/* ip_rt_gc_elasticity used to be the average chain length;
990		 * when it is exceeded, gc becomes really aggressive.
991		 *
992		 * The second limit is less certain. At the moment it allows
993		 * only 2 entries per bucket. We will see.
994		 */
995		if (chain_length > ip_rt_gc_elasticity) {
996			*candp = cand->u.dst.rt_next;
997			rt_free(cand);
998		}
999	}
1000
1001	/* Try to bind the route to an ARP entry only if it is an output
1002	   route or a unicast forwarding path.
1003	 */
1004	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1005		int err = arp_bind_neighbour(&rt->u.dst);
1006		if (err) {
1007			spin_unlock_bh(rt_hash_lock_addr(hash));
1008
1009			if (err != -ENOBUFS) {
1010				rt_drop(rt);
1011				return err;
1012			}
1013
1014			/* Neighbour tables are full and nothing
1015			   can be released. Try to shrink the route cache;
1016			   it most likely holds some neighbour records.
1017			 */
1018			if (attempts-- > 0) {
1019				int saved_elasticity = ip_rt_gc_elasticity;
1020				int saved_int = ip_rt_gc_min_interval;
1021				ip_rt_gc_elasticity	= 1;
1022				ip_rt_gc_min_interval	= 0;
1023				rt_garbage_collect();
1024				ip_rt_gc_min_interval	= saved_int;
1025				ip_rt_gc_elasticity	= saved_elasticity;
1026				goto restart;
1027			}
1028
1029			if (net_ratelimit())
1030				printk(KERN_WARNING "Neighbour table overflow.\n");
1031			rt_drop(rt);
1032			return -ENOBUFS;
1033		}
1034	}
1035
1036	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1037#if RT_CACHE_DEBUG >= 2
1038	if (rt->u.dst.rt_next) {
1039		struct rtable *trt;
1040		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1041		       NIPQUAD(rt->rt_dst));
1042		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1043			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1044		printk("\n");
1045	}
1046#endif
1047	rt_hash_table[hash].chain = rt;
1048	spin_unlock_bh(rt_hash_lock_addr(hash));
1049	*rp = rt;
1050	return 0;
1051}
1052
1053void rt_bind_peer(struct rtable *rt, int create)
1054{
1055	static DEFINE_SPINLOCK(rt_peer_lock);
1056	struct inet_peer *peer;
1057
1058	peer = inet_getpeer(rt->rt_dst, create);
1059
1060	spin_lock_bh(&rt_peer_lock);
1061	if (rt->peer == NULL) {
1062		rt->peer = peer;
1063		peer = NULL;
1064	}
1065	spin_unlock_bh(&rt_peer_lock);
1066	if (peer)
1067		inet_putpeer(peer);
1068}
1069
1070/*
1071 * Peer allocation may fail only in serious out-of-memory conditions.  However,
1072 * we can still generate some output.
1073 * Random ID selection looks a bit dangerous because we have no way to
1074 * select an ID that is unique within a reasonable period of time.
1075 * But a broken packet identifier may be better than no packet at all.
1076 */
1077static void ip_select_fb_ident(struct iphdr *iph)
1078{
1079	static DEFINE_SPINLOCK(ip_fb_id_lock);
1080	static u32 ip_fallback_id;
1081	u32 salt;
1082
1083	spin_lock_bh(&ip_fb_id_lock);
1084	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1085	iph->id = htons(salt & 0xFFFF);
1086	ip_fallback_id = salt;
1087	spin_unlock_bh(&ip_fb_id_lock);
1088}
1089
1090void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1091{
1092	struct rtable *rt = (struct rtable *) dst;
1093
1094	if (rt) {
1095		if (rt->peer == NULL)
1096			rt_bind_peer(rt, 1);
1097
1098		/* If a peer is attached to the destination, it is never detached,
1099		   so we do not need to grab a lock to dereference it.
1100		 */
1101		if (rt->peer) {
1102			iph->id = htons(inet_getid(rt->peer, more));
1103			return;
1104		}
1105	} else
1106		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1107		       __builtin_return_address(0));
1108
1109	ip_select_fb_ident(iph);
1110}
1111
1112static void rt_del(unsigned hash, struct rtable *rt)
1113{
1114	struct rtable **rthp;
1115
1116	spin_lock_bh(rt_hash_lock_addr(hash));
1117	ip_rt_put(rt);
1118	for (rthp = &rt_hash_table[hash].chain; *rthp;
1119	     rthp = &(*rthp)->u.dst.rt_next)
1120		if (*rthp == rt) {
1121			*rthp = rt->u.dst.rt_next;
1122			rt_free(rt);
1123			break;
1124		}
1125	spin_unlock_bh(rt_hash_lock_addr(hash));
1126}
1127
1128void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1129		    __be32 saddr, struct net_device *dev)
1130{
1131	int i, k;
1132	struct in_device *in_dev = in_dev_get(dev);
1133	struct rtable *rth, **rthp;
1134	__be32  skeys[2] = { saddr, 0 };
1135	int  ikeys[2] = { dev->ifindex, 0 };
1136	struct netevent_redirect netevent;
1137
1138	if (!in_dev)
1139		return;
1140
1141	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1142	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1143		goto reject_redirect;
1144
1145	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1146		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1147			goto reject_redirect;
1148		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1149			goto reject_redirect;
1150	} else {
1151		if (inet_addr_type(new_gw) != RTN_UNICAST)
1152			goto reject_redirect;
1153	}
1154
1155	for (i = 0; i < 2; i++) {
1156		for (k = 0; k < 2; k++) {
1157			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1158
1159			rthp=&rt_hash_table[hash].chain;
1160
1161			rcu_read_lock();
1162			while ((rth = rcu_dereference(*rthp)) != NULL) {
1163				struct rtable *rt;
1164
1165				if (rth->fl.fl4_dst != daddr ||
1166				    rth->fl.fl4_src != skeys[i] ||
1167				    rth->fl.oif != ikeys[k] ||
1168				    rth->fl.iif != 0) {
1169					rthp = &rth->u.dst.rt_next;
1170					continue;
1171				}
1172
1173				if (rth->rt_dst != daddr ||
1174				    rth->rt_src != saddr ||
1175				    rth->u.dst.error ||
1176				    rth->rt_gateway != old_gw ||
1177				    rth->u.dst.dev != dev)
1178					break;
1179
1180				dst_hold(&rth->u.dst);
1181				rcu_read_unlock();
1182
1183				rt = dst_alloc(&ipv4_dst_ops);
1184				if (rt == NULL) {
1185					ip_rt_put(rth);
1186					in_dev_put(in_dev);
1187					return;
1188				}
1189
1190				/* Copy all the information. */
1191				*rt = *rth;
1192				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1193				rt->u.dst.__use		= 1;
1194				atomic_set(&rt->u.dst.__refcnt, 1);
1195				rt->u.dst.child		= NULL;
1196				if (rt->u.dst.dev)
1197					dev_hold(rt->u.dst.dev);
1198				if (rt->idev)
1199					in_dev_hold(rt->idev);
1200				rt->u.dst.obsolete	= 0;
1201				rt->u.dst.lastuse	= jiffies;
1202				rt->u.dst.path		= &rt->u.dst;
1203				rt->u.dst.neighbour	= NULL;
1204				rt->u.dst.hh		= NULL;
1205				rt->u.dst.xfrm		= NULL;
1206
1207				rt->rt_flags		|= RTCF_REDIRECTED;
1208
1209				/* Gateway is different ... */
1210				rt->rt_gateway		= new_gw;
1211
1212				/* Redirect received -> path was valid */
1213				dst_confirm(&rth->u.dst);
1214
1215				if (rt->peer)
1216					atomic_inc(&rt->peer->refcnt);
1217
1218				if (arp_bind_neighbour(&rt->u.dst) ||
1219				    !(rt->u.dst.neighbour->nud_state &
1220					    NUD_VALID)) {
1221					if (rt->u.dst.neighbour)
1222						neigh_event_send(rt->u.dst.neighbour, NULL);
1223					ip_rt_put(rth);
1224					rt_drop(rt);
1225					goto do_next;
1226				}
1227
1228				netevent.old = &rth->u.dst;
1229				netevent.new = &rt->u.dst;
1230				call_netevent_notifiers(NETEVENT_REDIRECT,
1231							&netevent);
1232
1233				rt_del(hash, rth);
1234				if (!rt_intern_hash(hash, rt, &rt))
1235					ip_rt_put(rt);
1236				goto do_next;
1237			}
1238			rcu_read_unlock();
1239		do_next:
1240			;
1241		}
1242	}
1243	in_dev_put(in_dev);
1244	return;
1245
1246reject_redirect:
1247#ifdef CONFIG_IP_ROUTE_VERBOSE
1248	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1249		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1250			"%u.%u.%u.%u ignored.\n"
1251			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1252		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1253		       NIPQUAD(saddr), NIPQUAD(daddr));
1254#endif
1255	in_dev_put(in_dev);
1256}
1257
1258static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1259{
1260	struct rtable *rt = (struct rtable*)dst;
1261	struct dst_entry *ret = dst;
1262
1263	if (rt) {
1264		if (dst->obsolete) {
1265			ip_rt_put(rt);
1266			ret = NULL;
1267		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1268			   rt->u.dst.expires) {
1269			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1270						rt->fl.oif);
1271#if RT_CACHE_DEBUG >= 1
1272			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1273					  "%u.%u.%u.%u/%02x dropped\n",
1274				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1275#endif
1276			rt_del(hash, rt);
1277			ret = NULL;
1278		}
1279	}
1280	return ret;
1281}
1282
1283/*
1284 * Algorithm:
1285 *	1. The first ip_rt_redirect_number redirects are sent
1286 *	   with exponential backoff, then we stop sending them at all,
1287 *	   assuming that the host ignores our redirects.
1288 *	2. If we did not see packets requiring redirects
1289 *	   during ip_rt_redirect_silence, we assume that the host
1290 *	   forgot the redirected route and start sending redirects again.
1291 *
1292 * This algorithm is much cheaper and more intelligent than dumb load limiting
1293 * in icmp.c.
1294 *
1295 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1296 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1297 */
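
/*
 * Worked numbers with the defaults above (illustrative): after the first
 * redirect, the next one is sent only once ip_rt_redirect_load <<
 * rate_tokens jiffies have passed, giving gaps of roughly 40 ms, 80 ms,
 * 160 ms, ... doubling each time, and at most ip_rt_redirect_number (9)
 * redirects are sent.  After that nothing goes out until a silence of
 * ip_rt_redirect_silence, (HZ/50) << 10 jiffies or about 20 seconds with
 * no redirect-worthy packets, resets rate_tokens to zero.
 */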
1298
1299void ip_rt_send_redirect(struct sk_buff *skb)
1300{
1301	struct rtable *rt = (struct rtable*)skb->dst;
1302	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1303
1304	if (!in_dev)
1305		return;
1306
1307	if (!IN_DEV_TX_REDIRECTS(in_dev))
1308		goto out;
1309
1310	/* No redirected packets during ip_rt_redirect_silence;
1311	 * reset the algorithm.
1312	 */
1313	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1314		rt->u.dst.rate_tokens = 0;
1315
1316	/* Too many ignored redirects; do not send anything.
1317	 * Set u.dst.rate_last to the time of the last seen redirected packet.
1318	 */
1319	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1320		rt->u.dst.rate_last = jiffies;
1321		goto out;
1322	}
1323
1324	/* Check for load limit; set rate_last to the latest sent
1325	 * redirect.
1326	 */
1327	if (rt->u.dst.rate_tokens == 0 ||
1328	    time_after(jiffies,
1329		       (rt->u.dst.rate_last +
1330			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332		rt->u.dst.rate_last = jiffies;
1333		++rt->u.dst.rate_tokens;
1334#ifdef CONFIG_IP_ROUTE_VERBOSE
1335		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337		    net_ratelimit())
1338			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340				NIPQUAD(rt->rt_src), rt->rt_iif,
1341				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342#endif
1343	}
1344out:
1345	in_dev_put(in_dev);
1346}
1347
1348static int ip_error(struct sk_buff *skb)
1349{
1350	struct rtable *rt = (struct rtable*)skb->dst;
1351	unsigned long now;
1352	int code;
1353
1354	switch (rt->u.dst.error) {
1355		case EINVAL:
1356		default:
1357			goto out;
1358		case EHOSTUNREACH:
1359			code = ICMP_HOST_UNREACH;
1360			break;
1361		case ENETUNREACH:
1362			code = ICMP_NET_UNREACH;
1363			break;
1364		case EACCES:
1365			code = ICMP_PKT_FILTERED;
1366			break;
1367	}
1368
1369	now = jiffies;
1370	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372		rt->u.dst.rate_tokens = ip_rt_error_burst;
1373	rt->u.dst.rate_last = now;
1374	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377	}
1378
1379out:	kfree_skb(skb);
1380	return 0;
1381}
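
/*
 * The rate limiting above is a small token bucket.  Worked numbers with
 * the defaults: tokens accrue one per jiffy since rate_last, are capped at
 * ip_rt_error_burst (5*HZ), and each ICMP error sent costs ip_rt_error_cost
 * (HZ) tokens, so a given dst can emit a burst of up to five destination
 * unreachable messages and then averages at most one per second.
 */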
1382
1383/*
1384 *	The last two values are not from the RFC but
1385 *	are needed for AMPRnet AX.25 paths.
1386 */
1387
1388static const unsigned short mtu_plateau[] =
1389{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1390
1391static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1392{
1393	int i;
1394
1395	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1396		if (old_mtu > mtu_plateau[i])
1397			return mtu_plateau[i];
1398	return 68;
1399}
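
/*
 * Example (illustrative): guess_mtu() returns the largest plateau value
 * strictly below the old MTU, so an old MTU of 1500 guesses 1492, 576
 * guesses 296, and anything at or below 128 falls through to the minimum
 * of 68.
 */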
1400
1401unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402{
1403	int i;
1404	unsigned short old_mtu = ntohs(iph->tot_len);
1405	struct rtable *rth;
1406	__be32  skeys[2] = { iph->saddr, 0, };
1407	__be32  daddr = iph->daddr;
1408	unsigned short est_mtu = 0;
1409
1410	if (ipv4_config.no_pmtu_disc)
1411		return 0;
1412
1413	for (i = 0; i < 2; i++) {
1414		unsigned hash = rt_hash(daddr, skeys[i], 0);
1415
1416		rcu_read_lock();
1417		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418		     rth = rcu_dereference(rth->u.dst.rt_next)) {
1419			if (rth->fl.fl4_dst == daddr &&
1420			    rth->fl.fl4_src == skeys[i] &&
1421			    rth->rt_dst  == daddr &&
1422			    rth->rt_src  == iph->saddr &&
1423			    rth->fl.iif == 0 &&
1424			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425				unsigned short mtu = new_mtu;
1426
1427				if (new_mtu < 68 || new_mtu >= old_mtu) {
1428
1429					/* BSD 4.2 compatibility hack :-( */
1430					if (mtu == 0 &&
1431					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432					    old_mtu >= 68 + (iph->ihl << 2))
1433						old_mtu -= iph->ihl << 2;
1434
1435					mtu = guess_mtu(old_mtu);
1436				}
1437				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1439						dst_confirm(&rth->u.dst);
1440						if (mtu < ip_rt_min_pmtu) {
1441							mtu = ip_rt_min_pmtu;
1442							rth->u.dst.metrics[RTAX_LOCK-1] |=
1443								(1 << RTAX_MTU);
1444						}
1445						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446						dst_set_expires(&rth->u.dst,
1447							ip_rt_mtu_expires);
1448					}
1449					est_mtu = mtu;
1450				}
1451			}
1452		}
1453		rcu_read_unlock();
1454	}
1455	return est_mtu ? : new_mtu;
1456}
1457
1458static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459{
1460	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461	    !(dst_metric_locked(dst, RTAX_MTU))) {
1462		if (mtu < ip_rt_min_pmtu) {
1463			mtu = ip_rt_min_pmtu;
1464			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465		}
1466		dst->metrics[RTAX_MTU-1] = mtu;
1467		dst_set_expires(dst, ip_rt_mtu_expires);
1468		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1469	}
1470}
1471
1472static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1473{
1474	return NULL;
1475}
1476
1477static void ipv4_dst_destroy(struct dst_entry *dst)
1478{
1479	struct rtable *rt = (struct rtable *) dst;
1480	struct inet_peer *peer = rt->peer;
1481	struct in_device *idev = rt->idev;
1482
1483	if (peer) {
1484		rt->peer = NULL;
1485		inet_putpeer(peer);
1486	}
1487
1488	if (idev) {
1489		rt->idev = NULL;
1490		in_dev_put(idev);
1491	}
1492}
1493
1494static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495			    int how)
1496{
1497	struct rtable *rt = (struct rtable *) dst;
1498	struct in_device *idev = rt->idev;
1499	if (dev != &loopback_dev && idev && idev->dev == dev) {
1500		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501		if (loopback_idev) {
1502			rt->idev = loopback_idev;
1503			in_dev_put(idev);
1504		}
1505	}
1506}
1507
1508static void ipv4_link_failure(struct sk_buff *skb)
1509{
1510	struct rtable *rt;
1511
1512	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513
1514	rt = (struct rtable *) skb->dst;
1515	if (rt)
1516		dst_set_expires(&rt->u.dst, 0);
1517}
1518
1519static int ip_rt_bug(struct sk_buff *skb)
1520{
1521	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1523		skb->dev ? skb->dev->name : "?");
1524	kfree_skb(skb);
1525	return 0;
1526}
1527
1528/*
1529   We do not cache the source address of the outgoing interface,
1530   because it is used only by the IP RR, TS and SRR options,
1531   so it is out of the fast path.
1532
1533   BTW remember: "addr" is allowed to be unaligned
1534   in IP options!
1535 */
1536
1537void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538{
1539	__be32 src;
1540	struct fib_result res;
1541
1542	if (rt->fl.iif == 0)
1543		src = rt->rt_src;
1544	else if (fib_lookup(&rt->fl, &res) == 0) {
1545		src = FIB_RES_PREFSRC(res);
1546		fib_res_put(&res);
1547	} else
1548		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549					RT_SCOPE_UNIVERSE);
1550	memcpy(addr, &src, 4);
1551}
1552
1553#ifdef CONFIG_NET_CLS_ROUTE
1554static void set_class_tag(struct rtable *rt, u32 tag)
1555{
1556	if (!(rt->u.dst.tclassid & 0xFFFF))
1557		rt->u.dst.tclassid |= tag & 0xFFFF;
1558	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1559		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1560}
1561#endif
1562
1563static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564{
1565	struct fib_info *fi = res->fi;
1566
1567	if (fi) {
1568		if (FIB_RES_GW(*res) &&
1569		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570			rt->rt_gateway = FIB_RES_GW(*res);
1571		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572		       sizeof(rt->u.dst.metrics));
1573		if (fi->fib_mtu == 0) {
1574			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576			    rt->rt_gateway != rt->rt_dst &&
1577			    rt->u.dst.dev->mtu > 576)
1578				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579		}
1580#ifdef CONFIG_NET_CLS_ROUTE
1581		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582#endif
1583	} else
1584		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585
1586	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592				       ip_rt_min_advmss);
1593	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595
1596#ifdef CONFIG_NET_CLS_ROUTE
1597#ifdef CONFIG_IP_MULTIPLE_TABLES
1598	set_class_tag(rt, fib_rules_tclass(res));
1599#endif
1600	set_class_tag(rt, itag);
1601#endif
1602	rt->rt_type = res->type;
1603}
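
/*
 * Worked example (illustrative): when the FIB supplies no RTAX_ADVMSS
 * metric, it defaults above to max(dev->mtu - 40, ip_rt_min_advmss); on a
 * standard 1500-byte Ethernet device that gives 1460, the usual TCP MSS
 * over IPv4 (20 bytes of IP header plus 20 bytes of TCP header).
 */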
1604
1605static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606				u8 tos, struct net_device *dev, int our)
1607{
1608	unsigned hash;
1609	struct rtable *rth;
1610	__be32 spec_dst;
1611	struct in_device *in_dev = in_dev_get(dev);
1612	u32 itag = 0;
1613
1614	/* Primary sanity checks. */
1615
1616	if (in_dev == NULL)
1617		return -EINVAL;
1618
1619	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620	    skb->protocol != htons(ETH_P_IP))
1621		goto e_inval;
1622
1623	if (ZERONET(saddr)) {
1624		if (!LOCAL_MCAST(daddr))
1625			goto e_inval;
1626		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627	} else if (fib_validate_source(saddr, 0, tos, 0,
1628					dev, &spec_dst, &itag) < 0)
1629		goto e_inval;
1630
1631	rth = dst_alloc(&ipv4_dst_ops);
1632	if (!rth)
1633		goto e_nobufs;
1634
1635	rth->u.dst.output= ip_rt_bug;
1636
1637	atomic_set(&rth->u.dst.__refcnt, 1);
1638	rth->u.dst.flags= DST_HOST;
1639	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1640		rth->u.dst.flags |= DST_NOPOLICY;
1641	rth->fl.fl4_dst	= daddr;
1642	rth->rt_dst	= daddr;
1643	rth->fl.fl4_tos	= tos;
1644	rth->fl.mark    = skb->mark;
1645	rth->fl.fl4_src	= saddr;
1646	rth->rt_src	= saddr;
1647#ifdef CONFIG_NET_CLS_ROUTE
1648	rth->u.dst.tclassid = itag;
1649#endif
1650	rth->rt_iif	=
1651	rth->fl.iif	= dev->ifindex;
1652	rth->u.dst.dev	= &loopback_dev;
1653	dev_hold(rth->u.dst.dev);
1654	rth->idev	= in_dev_get(rth->u.dst.dev);
1655	rth->fl.oif	= 0;
1656	rth->rt_gateway	= daddr;
1657	rth->rt_spec_dst= spec_dst;
1658	rth->rt_type	= RTN_MULTICAST;
1659	rth->rt_flags	= RTCF_MULTICAST;
1660	if (our) {
1661		rth->u.dst.input= ip_local_deliver;
1662		rth->rt_flags |= RTCF_LOCAL;
1663	}
1664
1665#ifdef CONFIG_IP_MROUTE
1666	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667		rth->u.dst.input = ip_mr_input;
1668#endif
1669	RT_CACHE_STAT_INC(in_slow_mc);
1670
1671	in_dev_put(in_dev);
1672	hash = rt_hash(daddr, saddr, dev->ifindex);
1673	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674
1675e_nobufs:
1676	in_dev_put(in_dev);
1677	return -ENOBUFS;
1678
1679e_inval:
1680	in_dev_put(in_dev);
1681	return -EINVAL;
1682}
1683
1684
1685static void ip_handle_martian_source(struct net_device *dev,
1686				     struct in_device *in_dev,
1687				     struct sk_buff *skb,
1688				     __be32 daddr,
1689				     __be32 saddr)
1690{
1691	RT_CACHE_STAT_INC(in_martian_src);
1692#ifdef CONFIG_IP_ROUTE_VERBOSE
1693	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694		/*
1695		 *	RFC1812 recommendation: if the source is martian,
1696		 *	the only hint is the MAC header.
1697		 */
1698		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699			"%u.%u.%u.%u, on dev %s\n",
1700			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1702			int i;
1703			const unsigned char *p = skb_mac_header(skb);
1704			printk(KERN_WARNING "ll header: ");
1705			for (i = 0; i < dev->hard_header_len; i++, p++) {
1706				printk("%02x", *p);
1707				if (i < (dev->hard_header_len - 1))
1708					printk(":");
1709			}
1710			printk("\n");
1711		}
1712	}
1713#endif
1714}
1715
1716static inline int __mkroute_input(struct sk_buff *skb,
1717				  struct fib_result* res,
1718				  struct in_device *in_dev,
1719				  __be32 daddr, __be32 saddr, u32 tos,
1720				  struct rtable **result)
1721{
1722
1723	struct rtable *rth;
1724	int err;
1725	struct in_device *out_dev;
1726	unsigned flags = 0;
1727	__be32 spec_dst;
1728	u32 itag;
1729
1730	/* get a working reference to the output device */
1731	out_dev = in_dev_get(FIB_RES_DEV(*res));
1732	if (out_dev == NULL) {
1733		if (net_ratelimit())
1734			printk(KERN_CRIT "Bug in ip_route_input" \
1735			       "_slow(). Please, report\n");
1736		return -EINVAL;
1737	}
1738
1739
1740	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1741				  in_dev->dev, &spec_dst, &itag);
1742	if (err < 0) {
1743		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1744					 saddr);
1745
1746		err = -EINVAL;
1747		goto cleanup;
1748	}
1749
1750	if (err)
1751		flags |= RTCF_DIRECTSRC;
1752
1753	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1755	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756		flags |= RTCF_DOREDIRECT;
1757
1758	if (skb->protocol != htons(ETH_P_IP)) {
1759		/* Not IP (i.e. ARP). Do not create a route if it is
1760		 * invalid for proxy ARP. DNAT routes are always valid.
1761		 */
1762		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1763			err = -EINVAL;
1764			goto cleanup;
1765		}
1766	}
1767
1768
1769	rth = dst_alloc(&ipv4_dst_ops);
1770	if (!rth) {
1771		err = -ENOBUFS;
1772		goto cleanup;
1773	}
1774
1775	atomic_set(&rth->u.dst.__refcnt, 1);
1776	rth->u.dst.flags= DST_HOST;
1777#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778	if (res->fi->fib_nhs > 1)
1779		rth->u.dst.flags |= DST_BALANCED;
1780#endif
1781	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1782		rth->u.dst.flags |= DST_NOPOLICY;
1783	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1784		rth->u.dst.flags |= DST_NOXFRM;
1785	rth->fl.fl4_dst	= daddr;
1786	rth->rt_dst	= daddr;
1787	rth->fl.fl4_tos	= tos;
1788	rth->fl.mark    = skb->mark;
1789	rth->fl.fl4_src	= saddr;
1790	rth->rt_src	= saddr;
1791	rth->rt_gateway	= daddr;
1792	rth->rt_iif 	=
1793		rth->fl.iif	= in_dev->dev->ifindex;
1794	rth->u.dst.dev	= (out_dev)->dev;
1795	dev_hold(rth->u.dst.dev);
1796	rth->idev	= in_dev_get(rth->u.dst.dev);
1797	rth->fl.oif 	= 0;
1798	rth->rt_spec_dst= spec_dst;
1799
1800	rth->u.dst.input = ip_forward;
1801	rth->u.dst.output = ip_output;
1802
1803	rt_set_nexthop(rth, res, itag);
1804
1805	rth->rt_flags = flags;
1806
1807	*result = rth;
1808	err = 0;
1809 cleanup:
1810	/* release the working reference to the output device */
1811	in_dev_put(out_dev);
1812	return err;
1813}
1814
1815static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816				       struct fib_result* res,
1817				       const struct flowi *fl,
1818				       struct in_device *in_dev,
1819				       __be32 daddr, __be32 saddr, u32 tos)
1820{
1821	struct rtable* rth = NULL;
1822	int err;
1823	unsigned hash;
1824
1825#ifdef CONFIG_IP_ROUTE_MULTIPATH
1826	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827		fib_select_multipath(fl, res);
1828#endif
1829
1830	/* create a routing cache entry */
1831	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832	if (err)
1833		return err;
1834
1835	/* put it into the cache */
1836	hash = rt_hash(daddr, saddr, fl->iif);
1837	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1838}
1839
1840static inline int ip_mkroute_input(struct sk_buff *skb,
1841				   struct fib_result* res,
1842				   const struct flowi *fl,
1843				   struct in_device *in_dev,
1844				   __be32 daddr, __be32 saddr, u32 tos)
1845{
1846#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847	struct rtable* rth = NULL, *rtres;
1848	unsigned char hop, hopcount;
1849	int err = -EINVAL;
1850	unsigned int hash;
1851
1852	if (res->fi)
1853		hopcount = res->fi->fib_nhs;
1854	else
1855		hopcount = 1;
1856
1857	/* distinguish between multipath and singlepath */
1858	if (hopcount < 2)
1859		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860					    saddr, tos);
1861
1862	/* add all alternatives to the routing cache */
1863	for (hop = 0; hop < hopcount; hop++) {
1864		res->nh_sel = hop;
1865
1866		/* put reference to previous result */
1867		if (hop)
1868			ip_rt_put(rtres);
1869
1870		/* create a routing cache entry */
1871		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872				      &rth);
1873		if (err)
1874			return err;
1875
1876		/* put it into the cache */
1877		hash = rt_hash(daddr, saddr, fl->iif);
1878		err = rt_intern_hash(hash, rth, &rtres);
1879		if (err)
1880			return err;
1881
1882		/* forward hop information to multipath impl. */
1883		multipath_set_nhinfo(rth,
1884				     FIB_RES_NETWORK(*res),
1885				     FIB_RES_NETMASK(*res),
1886				     res->prefixlen,
1887				     &FIB_RES_NH(*res));
1888	}
1889	skb->dst = &rtres->u.dst;
1890	return err;
1891#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1892	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1894}
1895
1896
1897/*
1898 *	NOTE. We drop all packets that have local source
1899 *	addresses, because every properly looped-back packet
1900 *	must already have the correct destination attached by the output routine.
1901 *
1902 *	This approach solves two big problems:
1903 *	1. Non-simplex devices are handled properly.
1904 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1905 */
1906
1907static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908			       u8 tos, struct net_device *dev)
1909{
1910	struct fib_result res;
1911	struct in_device *in_dev = in_dev_get(dev);
1912	struct flowi fl = { .nl_u = { .ip4_u =
1913				      { .daddr = daddr,
1914					.saddr = saddr,
1915					.tos = tos,
1916					.scope = RT_SCOPE_UNIVERSE,
1917				      } },
1918			    .mark = skb->mark,
1919			    .iif = dev->ifindex };
1920	unsigned	flags = 0;
1921	u32		itag = 0;
1922	struct rtable * rth;
1923	unsigned	hash;
1924	__be32		spec_dst;
1925	int		err = -EINVAL;
1926	int		free_res = 0;
1927
1928	/* IP on this device is disabled. */
1929
1930	if (!in_dev)
1931		goto out;
1932
1933	/* Check for the weirdest martians, which cannot be detected
1934	   by fib_lookup.
1935	 */
1936
1937	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938		goto martian_source;
1939
1940	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1941		goto brd_input;
1942
1943	/* Accept zero addresses only for limited broadcast;
1944	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1945	 */
1946	if (ZERONET(saddr))
1947		goto martian_source;
1948
1949	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950		goto martian_destination;
1951
1952	/*
1953	 *	Now we are ready to route the packet.
1954	 */
1955	if ((err = fib_lookup(&fl, &res)) != 0) {
1956		if (!IN_DEV_FORWARD(in_dev))
1957			goto e_hostunreach;
1958		goto no_route;
1959	}
1960	free_res = 1;
1961
1962	RT_CACHE_STAT_INC(in_slow_tot);
1963
1964	if (res.type == RTN_BROADCAST)
1965		goto brd_input;
1966
1967	if (res.type == RTN_LOCAL) {
1968		int result;
1969		result = fib_validate_source(saddr, daddr, tos,
1970					     loopback_dev.ifindex,
1971					     dev, &spec_dst, &itag);
1972		if (result < 0)
1973			goto martian_source;
1974		if (result)
1975			flags |= RTCF_DIRECTSRC;
1976		spec_dst = daddr;
1977		goto local_input;
1978	}
1979
1980	if (!IN_DEV_FORWARD(in_dev))
1981		goto e_hostunreach;
1982	if (res.type != RTN_UNICAST)
1983		goto martian_destination;
1984
1985	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986	if (err == -ENOBUFS)
1987		goto e_nobufs;
1988	if (err == -EINVAL)
1989		goto e_inval;
1990
1991done:
1992	in_dev_put(in_dev);
1993	if (free_res)
1994		fib_res_put(&res);
1995out:	return err;
1996
1997brd_input:
1998	if (skb->protocol != htons(ETH_P_IP))
1999		goto e_inval;
2000
2001	if (ZERONET(saddr))
2002		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2003	else {
2004		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2005					  &itag);
2006		if (err < 0)
2007			goto martian_source;
2008		if (err)
2009			flags |= RTCF_DIRECTSRC;
2010	}
2011	flags |= RTCF_BROADCAST;
2012	res.type = RTN_BROADCAST;
2013	RT_CACHE_STAT_INC(in_brd);
2014
2015local_input:
2016	rth = dst_alloc(&ipv4_dst_ops);
2017	if (!rth)
2018		goto e_nobufs;
2019
2020	rth->u.dst.output= ip_rt_bug;
2021
2022	atomic_set(&rth->u.dst.__refcnt, 1);
2023	rth->u.dst.flags= DST_HOST;
2024	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2025		rth->u.dst.flags |= DST_NOPOLICY;
2026	rth->fl.fl4_dst	= daddr;
2027	rth->rt_dst	= daddr;
2028	rth->fl.fl4_tos	= tos;
2029	rth->fl.mark    = skb->mark;
2030	rth->fl.fl4_src	= saddr;
2031	rth->rt_src	= saddr;
2032#ifdef CONFIG_NET_CLS_ROUTE
2033	rth->u.dst.tclassid = itag;
2034#endif
2035	rth->rt_iif	=
2036	rth->fl.iif	= dev->ifindex;
2037	rth->u.dst.dev	= &loopback_dev;
2038	dev_hold(rth->u.dst.dev);
2039	rth->idev	= in_dev_get(rth->u.dst.dev);
2040	rth->rt_gateway	= daddr;
2041	rth->rt_spec_dst= spec_dst;
2042	rth->u.dst.input= ip_local_deliver;
2043	rth->rt_flags 	= flags|RTCF_LOCAL;
2044	if (res.type == RTN_UNREACHABLE) {
2045		rth->u.dst.input= ip_error;
2046		rth->u.dst.error= -err;
2047		rth->rt_flags 	&= ~RTCF_LOCAL;
2048	}
2049	rth->rt_type	= res.type;
2050	hash = rt_hash(daddr, saddr, fl.iif);
2051	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052	goto done;
2053
2054no_route:
2055	RT_CACHE_STAT_INC(in_no_route);
2056	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057	res.type = RTN_UNREACHABLE;
2058	goto local_input;
2059
2060	/*
2061	 *	Do not cache martian addresses: they should be logged (RFC1812)
2062	 */
2063martian_destination:
2064	RT_CACHE_STAT_INC(in_martian_dst);
2065#ifdef CONFIG_IP_ROUTE_VERBOSE
2066	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068			"%u.%u.%u.%u, dev %s\n",
2069			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070#endif
2071
2072e_hostunreach:
2073	err = -EHOSTUNREACH;
2074	goto done;
2075
2076e_inval:
2077	err = -EINVAL;
2078	goto done;
2079
2080e_nobufs:
2081	err = -ENOBUFS;
2082	goto done;
2083
2084martian_source:
2085	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086	goto e_inval;
2087}
2088
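/*
 *	Fast path for input route resolution: look the flow up in the route
 *	cache under RCU and reuse a cached entry on a hit.  Multicast
 *	destinations are checked against the local filter here; everything
 *	else that misses the cache falls through to ip_route_input_slow().
 */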
2089int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090		   u8 tos, struct net_device *dev)
2091{
2092	struct rtable * rth;
2093	unsigned	hash;
2094	int iif = dev->ifindex;
2095
2096	tos &= IPTOS_RT_MASK;
2097	hash = rt_hash(daddr, saddr, iif);
2098
2099	rcu_read_lock();
2100	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2102		if (rth->fl.fl4_dst == daddr &&
2103		    rth->fl.fl4_src == saddr &&
2104		    rth->fl.iif == iif &&
2105		    rth->fl.oif == 0 &&
2106		    rth->fl.mark == skb->mark &&
2107		    rth->fl.fl4_tos == tos) {
2108			rth->u.dst.lastuse = jiffies;
2109			dst_hold(&rth->u.dst);
2110			rth->u.dst.__use++;
2111			RT_CACHE_STAT_INC(in_hit);
2112			rcu_read_unlock();
2113			skb->dst = (struct dst_entry*)rth;
2114			return 0;
2115		}
2116		RT_CACHE_STAT_INC(in_hlist_search);
2117	}
2118	rcu_read_unlock();
2119
2120	/* Multicast recognition logic is moved from the route cache to here.
2121	   The problem was that too many Ethernet cards have broken/missing
2122	   hardware multicast filters :-( As a result, a host on a multicast
2123	   network acquires a lot of useless route cache entries, e.g. for
2124	   SDR messages from all over the world. Now we try to get rid of them.
2125	   Really, provided the software IP multicast filter is organized
2126	   reasonably (at least, hashed), it does not result in a slowdown
2127	   compared with route cache reject entries.
2128	   Note that multicast routers are not affected, because
2129	   a route cache entry is created eventually.
2130	 */
2131	if (MULTICAST(daddr)) {
2132		struct in_device *in_dev;
2133
2134		rcu_read_lock();
2135		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136			int our = ip_check_mc(in_dev, daddr, saddr,
2137				ip_hdr(skb)->protocol);
2138			if (our
2139#ifdef CONFIG_IP_MROUTE
2140			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141#endif
2142			    ) {
2143				rcu_read_unlock();
2144				return ip_route_input_mc(skb, daddr, saddr,
2145							 tos, dev, our);
2146			}
2147		}
2148		rcu_read_unlock();
2149		return -EINVAL;
2150	}
2151	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152}
2153
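/* Allocate and fill a single output route cache entry for the given FIB
 * result, flow and output device.  The caller is responsible for interning
 * the entry into the hash table.
 */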
2154static inline int __mkroute_output(struct rtable **result,
2155				   struct fib_result* res,
2156				   const struct flowi *fl,
2157				   const struct flowi *oldflp,
2158				   struct net_device *dev_out,
2159				   unsigned flags)
2160{
2161	struct rtable *rth;
2162	struct in_device *in_dev;
2163	u32 tos = RT_FL_TOS(oldflp);
2164	int err = 0;
2165
2166	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167		return -EINVAL;
2168
2169	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2170		res->type = RTN_BROADCAST;
2171	else if (MULTICAST(fl->fl4_dst))
2172		res->type = RTN_MULTICAST;
2173	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174		return -EINVAL;
2175
2176	if (dev_out->flags & IFF_LOOPBACK)
2177		flags |= RTCF_LOCAL;
2178
2179	/* get work reference to inet device */
2180	in_dev = in_dev_get(dev_out);
2181	if (!in_dev)
2182		return -EINVAL;
2183
2184	if (res->type == RTN_BROADCAST) {
2185		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186		if (res->fi) {
2187			fib_info_put(res->fi);
2188			res->fi = NULL;
2189		}
2190	} else if (res->type == RTN_MULTICAST) {
2191		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2192		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2193				 oldflp->proto))
2194			flags &= ~RTCF_LOCAL;
2195		/* If a multicast route does not exist, use the
2196		   default one, but do not gateway in this case.
2197		   Yes, it is a hack.
2198		 */
2199		if (res->fi && res->prefixlen < 4) {
2200			fib_info_put(res->fi);
2201			res->fi = NULL;
2202		}
2203	}
2204
2205
2206	rth = dst_alloc(&ipv4_dst_ops);
2207	if (!rth) {
2208		err = -ENOBUFS;
2209		goto cleanup;
2210	}
2211
2212	atomic_set(&rth->u.dst.__refcnt, 1);
2213	rth->u.dst.flags= DST_HOST;
2214#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215	if (res->fi) {
2216		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217		if (res->fi->fib_nhs > 1)
2218			rth->u.dst.flags |= DST_BALANCED;
2219	}
2220#endif
2221	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2222		rth->u.dst.flags |= DST_NOXFRM;
2223	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2224		rth->u.dst.flags |= DST_NOPOLICY;
2225
2226	rth->fl.fl4_dst	= oldflp->fl4_dst;
2227	rth->fl.fl4_tos	= tos;
2228	rth->fl.fl4_src	= oldflp->fl4_src;
2229	rth->fl.oif	= oldflp->oif;
2230	rth->fl.mark    = oldflp->mark;
2231	rth->rt_dst	= fl->fl4_dst;
2232	rth->rt_src	= fl->fl4_src;
2233	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2234	/* get references to the devices that are to be held by the routing
2235	   cache entry */
2236	rth->u.dst.dev	= dev_out;
2237	dev_hold(dev_out);
2238	rth->idev	= in_dev_get(dev_out);
2239	rth->rt_gateway = fl->fl4_dst;
2240	rth->rt_spec_dst= fl->fl4_src;
2241
2242	rth->u.dst.output=ip_output;
2243
2244	RT_CACHE_STAT_INC(out_slow_tot);
2245
2246	if (flags & RTCF_LOCAL) {
2247		rth->u.dst.input = ip_local_deliver;
2248		rth->rt_spec_dst = fl->fl4_dst;
2249	}
2250	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251		rth->rt_spec_dst = fl->fl4_src;
2252		if (flags & RTCF_LOCAL &&
2253		    !(dev_out->flags & IFF_LOOPBACK)) {
2254			rth->u.dst.output = ip_mc_output;
2255			RT_CACHE_STAT_INC(out_slow_mc);
2256		}
2257#ifdef CONFIG_IP_MROUTE
2258		if (res->type == RTN_MULTICAST) {
2259			if (IN_DEV_MFORWARD(in_dev) &&
2260			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2261				rth->u.dst.input = ip_mr_input;
2262				rth->u.dst.output = ip_mc_output;
2263			}
2264		}
2265#endif
2266	}
2267
2268	rt_set_nexthop(rth, res, 0);
2269
2270	rth->rt_flags = flags;
2271
2272	*result = rth;
2273 cleanup:
2274	/* release work reference to inet device */
2275	in_dev_put(in_dev);
2276
2277	return err;
2278}
2279
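/* Default (single nexthop) output case: build one cache entry and intern
 * it under the hash of the original flow key.
 */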
2280static inline int ip_mkroute_output_def(struct rtable **rp,
2281					struct fib_result* res,
2282					const struct flowi *fl,
2283					const struct flowi *oldflp,
2284					struct net_device *dev_out,
2285					unsigned flags)
2286{
2287	struct rtable *rth = NULL;
2288	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289	unsigned hash;
2290	if (err == 0) {
2291		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2292		err = rt_intern_hash(hash, rth, rp);
2293	}
2294
2295	return err;
2296}
2297
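/* When CONFIG_IP_ROUTE_MULTIPATH_CACHED is set and the route has several
 * nexthops, build and intern one cache entry per nexthop and hand the
 * nexthop details to the multipath implementation; otherwise fall back to
 * ip_mkroute_output_def().
 */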
2298static inline int ip_mkroute_output(struct rtable** rp,
2299				    struct fib_result* res,
2300				    const struct flowi *fl,
2301				    const struct flowi *oldflp,
2302				    struct net_device *dev_out,
2303				    unsigned flags)
2304{
2305#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2306	unsigned char hop;
2307	unsigned hash;
2308	int err = -EINVAL;
2309	struct rtable *rth = NULL;
2310
2311	if (res->fi && res->fi->fib_nhs > 1) {
2312		unsigned char hopcount = res->fi->fib_nhs;
2313
2314		for (hop = 0; hop < hopcount; hop++) {
2315			struct net_device *dev2nexthop;
2316
2317			res->nh_sel = hop;
2318
2319			/* hold a work reference to the output device */
2320			dev2nexthop = FIB_RES_DEV(*res);
2321			dev_hold(dev2nexthop);
2322
2323			/* release the reference to the previous result */
2324			if (hop)
2325				ip_rt_put(*rp);
2326
2327			err = __mkroute_output(&rth, res, fl, oldflp,
2328					       dev2nexthop, flags);
2329
2330			if (err != 0)
2331				goto cleanup;
2332
2333			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2334					oldflp->oif);
2335			err = rt_intern_hash(hash, rth, rp);
2336
2337			/* forward hop information to multipath impl. */
2338			multipath_set_nhinfo(rth,
2339					     FIB_RES_NETWORK(*res),
2340					     FIB_RES_NETMASK(*res),
2341					     res->prefixlen,
2342					     &FIB_RES_NH(*res));
2343		cleanup:
2344			/* release work reference to output device */
2345			dev_put(dev2nexthop);
2346
2347			if (err != 0)
2348				return err;
2349		}
2350		return err;
2351	} else {
2352		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2353					     flags);
2354	}
2355#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2356	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2357#endif
2358}
2359
2360/*
2361 * Major route resolver routine.
2362 */
2363
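/* The sequence below: validate the requested source address, pick an
 * output device, consult the FIB, handle local, broadcast and multicast
 * destinations specially, and finally let ip_mkroute_output() build the
 * cache entry.
 */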
2364static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365{
2366	u32 tos	= RT_FL_TOS(oldflp);
2367	struct flowi fl = { .nl_u = { .ip4_u =
2368				      { .daddr = oldflp->fl4_dst,
2369					.saddr = oldflp->fl4_src,
2370					.tos = tos & IPTOS_RT_MASK,
2371					.scope = ((tos & RTO_ONLINK) ?
2372						  RT_SCOPE_LINK :
2373						  RT_SCOPE_UNIVERSE),
2374				      } },
2375			    .mark = oldflp->mark,
2376			    .iif = loopback_dev.ifindex,
2377			    .oif = oldflp->oif };
2378	struct fib_result res;
2379	unsigned flags = 0;
2380	struct net_device *dev_out = NULL;
2381	int free_res = 0;
2382	int err;
2383
2384
2385	res.fi		= NULL;
2386#ifdef CONFIG_IP_MULTIPLE_TABLES
2387	res.r		= NULL;
2388#endif
2389
2390	if (oldflp->fl4_src) {
2391		err = -EINVAL;
2392		if (MULTICAST(oldflp->fl4_src) ||
2393		    BADCLASS(oldflp->fl4_src) ||
2394		    ZERONET(oldflp->fl4_src))
2395			goto out;
2396
2397		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398		dev_out = ip_dev_find(oldflp->fl4_src);
2399		if (dev_out == NULL)
2400			goto out;
2401
2402		/* I removed the check for oif == dev_out->oif here.
2403		   It was wrong for two reasons:
2404		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2405		      assigned to multiple interfaces.
2406		   2. Moreover, we are allowed to send packets with the saddr
2407		      of another iface. --ANK
2408		 */
2409
2410		if (oldflp->oif == 0
2411		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412
2413			fl.oif = dev_out->ifindex;
2414			goto make_route;
2415		}
2416		if (dev_out)
2417			dev_put(dev_out);
2418		dev_out = NULL;
2419	}
2420
2421
2422	if (oldflp->oif) {
2423		dev_out = dev_get_by_index(oldflp->oif);
2424		err = -ENODEV;
2425		if (dev_out == NULL)
2426			goto out;
2427
2428		/* RACE: Check return value of inet_select_addr instead. */
2429		if (__in_dev_get_rtnl(dev_out) == NULL) {
2430			dev_put(dev_out);
2431			goto out;	/* Wrong error code */
2432		}
2433
2434		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2435			if (!fl.fl4_src)
2436				fl.fl4_src = inet_select_addr(dev_out, 0,
2437							      RT_SCOPE_LINK);
2438			goto make_route;
2439		}
2440		if (!fl.fl4_src) {
2441			if (MULTICAST(oldflp->fl4_dst))
2442				fl.fl4_src = inet_select_addr(dev_out, 0,
2443							      fl.fl4_scope);
2444			else if (!oldflp->fl4_dst)
2445				fl.fl4_src = inet_select_addr(dev_out, 0,
2446							      RT_SCOPE_HOST);
2447		}
2448	}
2449
2450	if (!fl.fl4_dst) {
2451		fl.fl4_dst = fl.fl4_src;
2452		if (!fl.fl4_dst)
2453			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2454		if (dev_out)
2455			dev_put(dev_out);
2456		dev_out = &loopback_dev;
2457		dev_hold(dev_out);
2458		fl.oif = loopback_dev.ifindex;
2459		res.type = RTN_LOCAL;
2460		flags |= RTCF_LOCAL;
2461		goto make_route;
2462	}
2463
2464	if (fib_lookup(&fl, &res)) {
2465		res.fi = NULL;
2466		if (oldflp->oif) {
2467			/* Apparently, the routing tables are wrong. Assume
2468			   that the destination is on-link.
2469
2470			   WHY? DW.
2471			   Because we are allowed to send to an iface
2472			   even if it has NO routes and NO assigned
2473			   addresses. When oif is specified, the routing
2474			   tables are looked up with only one purpose:
2475			   to check whether the destination is gatewayed rather
2476			   than direct. Moreover, if MSG_DONTROUTE is set,
2477			   we send the packet, ignoring both routing tables
2478			   and ifaddr state. --ANK
2479
2480
2481			   We could do this even if oif is unknown,
2482			   likely as in IPv6, but we do not.
2483			 */
2484
2485			if (fl.fl4_src == 0)
2486				fl.fl4_src = inet_select_addr(dev_out, 0,
2487							      RT_SCOPE_LINK);
2488			res.type = RTN_UNICAST;
2489			goto make_route;
2490		}
2491		if (dev_out)
2492			dev_put(dev_out);
2493		err = -ENETUNREACH;
2494		goto out;
2495	}
2496	free_res = 1;
2497
2498	if (res.type == RTN_LOCAL) {
2499		if (!fl.fl4_src)
2500			fl.fl4_src = fl.fl4_dst;
2501		if (dev_out)
2502			dev_put(dev_out);
2503		dev_out = &loopback_dev;
2504		dev_hold(dev_out);
2505		fl.oif = dev_out->ifindex;
2506		if (res.fi)
2507			fib_info_put(res.fi);
2508		res.fi = NULL;
2509		flags |= RTCF_LOCAL;
2510		goto make_route;
2511	}
2512
2513#ifdef CONFIG_IP_ROUTE_MULTIPATH
2514	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2515		fib_select_multipath(&fl, &res);
2516	else
2517#endif
2518	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2519		fib_select_default(&fl, &res);
2520
2521	if (!fl.fl4_src)
2522		fl.fl4_src = FIB_RES_PREFSRC(res);
2523
2524	if (dev_out)
2525		dev_put(dev_out);
2526	dev_out = FIB_RES_DEV(res);
2527	dev_hold(dev_out);
2528	fl.oif = dev_out->ifindex;
2529
2530
2531make_route:
2532	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2533
2534
2535	if (free_res)
2536		fib_res_put(&res);
2537	if (dev_out)
2538		dev_put(dev_out);
2539out:	return err;
2540}
2541
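/* Fast path for output route resolution: look the flow up in the route
 * cache and fall back to ip_route_output_slow() on a miss.  Cached
 * multipath routes have one of their alternatives selected here.
 */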
2542int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2543{
2544	unsigned hash;
2545	struct rtable *rth;
2546
2547	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2548
2549	rcu_read_lock_bh();
2550	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2551		rth = rcu_dereference(rth->u.dst.rt_next)) {
2552		if (rth->fl.fl4_dst == flp->fl4_dst &&
2553		    rth->fl.fl4_src == flp->fl4_src &&
2554		    rth->fl.iif == 0 &&
2555		    rth->fl.oif == flp->oif &&
2556		    rth->fl.mark == flp->mark &&
2557		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2558			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2559
2560			/* check for multipath routes and choose one if
2561			 * necessary
2562			 */
2563			if (multipath_select_route(flp, rth, rp)) {
2564				dst_hold(&(*rp)->u.dst);
2565				RT_CACHE_STAT_INC(out_hit);
2566				rcu_read_unlock_bh();
2567				return 0;
2568			}
2569
2570			rth->u.dst.lastuse = jiffies;
2571			dst_hold(&rth->u.dst);
2572			rth->u.dst.__use++;
2573			RT_CACHE_STAT_INC(out_hit);
2574			rcu_read_unlock_bh();
2575			*rp = rth;
2576			return 0;
2577		}
2578		RT_CACHE_STAT_INC(out_hlist_search);
2579	}
2580	rcu_read_unlock_bh();
2581
2582	return ip_route_output_slow(rp, flp);
2583}
2584
2585EXPORT_SYMBOL_GPL(__ip_route_output_key);
2586
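/* Blackhole dst, used when the xfrm lookup reports -EREMOTE (see
 * ip_route_output_flow() below): PMTU updates are ignored and any packet
 * sent through it is silently dropped.
 */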
2587static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2588{
2589}
2590
2591static struct dst_ops ipv4_dst_blackhole_ops = {
2592	.family			=	AF_INET,
2593	.protocol		=	__constant_htons(ETH_P_IP),
2594	.destroy		=	ipv4_dst_destroy,
2595	.check			=	ipv4_dst_check,
2596	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2597	.entry_size		=	sizeof(struct rtable),
2598};
2599
2600
2601static int ipv4_blackhole_output(struct sk_buff *skb)
2602{
2603	kfree_skb(skb);
2604	return 0;
2605}
2606
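/* Replace *rp with a blackhole copy of the route: the flow key, metrics,
 * addresses and device references are preserved so callers can keep
 * inspecting them, but input and output simply drop packets.
 */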
2607static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2608{
2609	struct rtable *ort = *rp;
2610	struct rtable *rt = (struct rtable *)
2611		dst_alloc(&ipv4_dst_blackhole_ops);
2612
2613	if (rt) {
2614		struct dst_entry *new = &rt->u.dst;
2615
2616		atomic_set(&new->__refcnt, 1);
2617		new->__use = 1;
2618		new->input = ipv4_blackhole_output;
2619		new->output = ipv4_blackhole_output;
2620		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2621
2622		new->dev = ort->u.dst.dev;
2623		if (new->dev)
2624			dev_hold(new->dev);
2625
2626		rt->fl = ort->fl;
2627
2628		rt->idev = ort->idev;
2629		if (rt->idev)
2630			in_dev_hold(rt->idev);
2631		rt->rt_flags = ort->rt_flags;
2632		rt->rt_type = ort->rt_type;
2633		rt->rt_dst = ort->rt_dst;
2634		rt->rt_src = ort->rt_src;
2635		rt->rt_iif = ort->rt_iif;
2636		rt->rt_gateway = ort->rt_gateway;
2637		rt->rt_spec_dst = ort->rt_spec_dst;
2638		rt->peer = ort->peer;
2639		if (rt->peer)
2640			atomic_inc(&rt->peer->refcnt);
2641
2642		dst_free(new);
2643	}
2644
2645	dst_release(&(*rp)->u.dst);
2646	*rp = rt;
2647	return (rt ? 0 : -ENOMEM);
2648}
2649
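/* Resolve an output route; if the flow specifies a transport protocol,
 * fill in any still-missing addresses and run the result through the xfrm
 * policy lookup, substituting a blackhole route on -EREMOTE.
 */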
2650int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2651{
2652	int err;
2653
2654	if ((err = __ip_route_output_key(rp, flp)) != 0)
2655		return err;
2656
2657	if (flp->proto) {
2658		if (!flp->fl4_src)
2659			flp->fl4_src = (*rp)->rt_src;
2660		if (!flp->fl4_dst)
2661			flp->fl4_dst = (*rp)->rt_dst;
2662		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2663		if (err == -EREMOTE)
2664			err = ipv4_dst_blackhole(rp, flp, sk);
2665
2666		return err;
2667	}
2668
2669	return 0;
2670}
2671
2672EXPORT_SYMBOL_GPL(ip_route_output_flow);
2673
2674int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2675{
2676	return ip_route_output_flow(rp, flp, NULL, 0);
2677}
2678
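/* Fill a netlink route message describing the rtable attached to
 * skb->dst.
 */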
2679static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2680			int nowait, unsigned int flags)
2681{
2682	struct rtable *rt = (struct rtable*)skb->dst;
2683	struct rtmsg *r;
2684	struct nlmsghdr *nlh;
2685	long expires;
2686	u32 id = 0, ts = 0, tsage = 0, error;
2687
2688	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2689	if (nlh == NULL)
2690		return -EMSGSIZE;
2691
2692	r = nlmsg_data(nlh);
2693	r->rtm_family	 = AF_INET;
2694	r->rtm_dst_len	= 32;
2695	r->rtm_src_len	= 0;
2696	r->rtm_tos	= rt->fl.fl4_tos;
2697	r->rtm_table	= RT_TABLE_MAIN;
2698	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2699	r->rtm_type	= rt->rt_type;
2700	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2701	r->rtm_protocol = RTPROT_UNSPEC;
2702	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2703	if (rt->rt_flags & RTCF_NOTIFY)
2704		r->rtm_flags |= RTM_F_NOTIFY;
2705
2706	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2707
2708	if (rt->fl.fl4_src) {
2709		r->rtm_src_len = 32;
2710		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2711	}
2712	if (rt->u.dst.dev)
2713		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2714#ifdef CONFIG_NET_CLS_ROUTE
2715	if (rt->u.dst.tclassid)
2716		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2717#endif
2718#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2719	if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2720		NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2721#endif
2722	if (rt->fl.iif)
2723		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2724	else if (rt->rt_src != rt->fl.fl4_src)
2725		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2726
2727	if (rt->rt_dst != rt->rt_gateway)
2728		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2729
2730	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2731		goto nla_put_failure;
2732
2733	error = rt->u.dst.error;
2734	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2735	if (rt->peer) {
2736		id = rt->peer->ip_id_count;
2737		if (rt->peer->tcp_ts_stamp) {
2738			ts = rt->peer->tcp_ts;
2739			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2740		}
2741	}
2742
2743	if (rt->fl.iif) {
2744#ifdef CONFIG_IP_MROUTE
2745		__be32 dst = rt->rt_dst;
2746
2747		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2748		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2749			int err = ipmr_get_route(skb, r, nowait);
2750			if (err <= 0) {
2751				if (!nowait) {
2752					if (err == 0)
2753						return 0;
2754					goto nla_put_failure;
2755				} else {
2756					if (err == -EMSGSIZE)
2757						goto nla_put_failure;
2758					error = err;
2759				}
2760			}
2761		} else
2762#endif
2763			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2764	}
2765
2766	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2767			       expires, error) < 0)
2768		goto nla_put_failure;
2769
2770	return nlmsg_end(skb, nlh);
2771
2772nla_put_failure:
2773	nlmsg_cancel(skb, nlh);
2774	return -EMSGSIZE;
2775}
2776
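/* RTM_GETROUTE handler: build a dummy skb, resolve the requested route
 * (via ip_route_input() when an input interface is given, otherwise via
 * ip_route_output_key()) and unicast the result back with rt_fill_info().
 */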
2777static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2778{
2779	struct rtmsg *rtm;
2780	struct nlattr *tb[RTA_MAX+1];
2781	struct rtable *rt = NULL;
2782	__be32 dst = 0;
2783	__be32 src = 0;
2784	u32 iif;
2785	int err;
2786	struct sk_buff *skb;
2787
2788	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2789	if (err < 0)
2790		goto errout;
2791
2792	rtm = nlmsg_data(nlh);
2793
2794	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2795	if (skb == NULL) {
2796		err = -ENOBUFS;
2797		goto errout;
2798	}
2799
2800	/* Reserve room for dummy headers; this skb can pass
2801	   through a good chunk of the routing engine.
2802	 */
2803	skb_reset_mac_header(skb);
2804	skb_reset_network_header(skb);
2805
2806	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2807	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2808	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2809
2810	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2811	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2812	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2813
2814	if (iif) {
2815		struct net_device *dev;
2816
2817		dev = __dev_get_by_index(iif);
2818		if (dev == NULL) {
2819			err = -ENODEV;
2820			goto errout_free;
2821		}
2822
2823		skb->protocol	= htons(ETH_P_IP);
2824		skb->dev	= dev;
2825		local_bh_disable();
2826		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2827		local_bh_enable();
2828
2829		rt = (struct rtable*) skb->dst;
2830		if (err == 0 && rt->u.dst.error)
2831			err = -rt->u.dst.error;
2832	} else {
2833		struct flowi fl = {
2834			.nl_u = {
2835				.ip4_u = {
2836					.daddr = dst,
2837					.saddr = src,
2838					.tos = rtm->rtm_tos,
2839				},
2840			},
2841			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2842		};
2843		err = ip_route_output_key(&rt, &fl);
2844	}
2845
2846	if (err)
2847		goto errout_free;
2848
2849	skb->dst = &rt->u.dst;
2850	if (rtm->rtm_flags & RTM_F_NOTIFY)
2851		rt->rt_flags |= RTCF_NOTIFY;
2852
2853	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2854				RTM_NEWROUTE, 0, 0);
2855	if (err <= 0)
2856		goto errout_free;
2857
2858	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2859errout:
2860	return err;
2861
2862errout_free:
2863	kfree_skb(skb);
2864	goto errout;
2865}
2866
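/* Dump the whole route cache to netlink; cb->args[] records the hash
 * chain and index so an interrupted dump can be resumed.
 */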
2867int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2868{
2869	struct rtable *rt;
2870	int h, s_h;
2871	int idx, s_idx;
2872
2873	s_h = cb->args[0];
2874	s_idx = idx = cb->args[1];
2875	for (h = 0; h <= rt_hash_mask; h++) {
2876		if (h < s_h) continue;
2877		if (h > s_h)
2878			s_idx = 0;
2879		rcu_read_lock_bh();
2880		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2881		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2882			if (idx < s_idx)
2883				continue;
2884			skb->dst = dst_clone(&rt->u.dst);
2885			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2886					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2887					 1, NLM_F_MULTI) <= 0) {
2888				dst_release(xchg(&skb->dst, NULL));
2889				rcu_read_unlock_bh();
2890				goto done;
2891			}
2892			dst_release(xchg(&skb->dst, NULL));
2893		}
2894		rcu_read_unlock_bh();
2895	}
2896
2897done:
2898	cb->args[0] = h;
2899	cb->args[1] = idx;
2900	return skb->len;
2901}
2902
2903void ip_rt_multicast_event(struct in_device *in_dev)
2904{
2905	rt_cache_flush(0);
2906}
2907
2908#ifdef CONFIG_SYSCTL
2909static int flush_delay;
2910
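/* The "flush" sysctl is write-only: writing a delay value triggers
 * rt_cache_flush(), reading returns -EINVAL.
 */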
2911static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2912					struct file *filp, void __user *buffer,
2913					size_t *lenp, loff_t *ppos)
2914{
2915	if (write) {
2916		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2917		rt_cache_flush(flush_delay);
2918		return 0;
2919	}
2920
2921	return -EINVAL;
2922}
2923
2924static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2925						int __user *name,
2926						int nlen,
2927						void __user *oldval,
2928						size_t __user *oldlenp,
2929						void __user *newval,
2930						size_t newlen)
2931{
2932	int delay;
2933	if (newlen != sizeof(int))
2934		return -EINVAL;
2935	if (get_user(delay, (int __user *)newval))
2936		return -EFAULT;
2937	rt_cache_flush(delay);
2938	return 0;
2939}
2940
2941ctl_table ipv4_route_table[] = {
2942	{
2943		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2944		.procname	= "flush",
2945		.data		= &flush_delay,
2946		.maxlen		= sizeof(int),
2947		.mode		= 0200,
2948		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2949		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2950	},
2951	{
2952		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2953		.procname	= "min_delay",
2954		.data		= &ip_rt_min_delay,
2955		.maxlen		= sizeof(int),
2956		.mode		= 0644,
2957		.proc_handler	= &proc_dointvec_jiffies,
2958		.strategy	= &sysctl_jiffies,
2959	},
2960	{
2961		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2962		.procname	= "max_delay",
2963		.data		= &ip_rt_max_delay,
2964		.maxlen		= sizeof(int),
2965		.mode		= 0644,
2966		.proc_handler	= &proc_dointvec_jiffies,
2967		.strategy	= &sysctl_jiffies,
2968	},
2969	{
2970		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2971		.procname	= "gc_thresh",
2972		.data		= &ipv4_dst_ops.gc_thresh,
2973		.maxlen		= sizeof(int),
2974		.mode		= 0644,
2975		.proc_handler	= &proc_dointvec,
2976	},
2977	{
2978		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2979		.procname	= "max_size",
2980		.data		= &ip_rt_max_size,
2981		.maxlen		= sizeof(int),
2982		.mode		= 0644,
2983		.proc_handler	= &proc_dointvec,
2984	},
2985	{
2986		/*  Deprecated. Use gc_min_interval_ms */
2987
2988		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2989		.procname	= "gc_min_interval",
2990		.data		= &ip_rt_gc_min_interval,
2991		.maxlen		= sizeof(int),
2992		.mode		= 0644,
2993		.proc_handler	= &proc_dointvec_jiffies,
2994		.strategy	= &sysctl_jiffies,
2995	},
2996	{
2997		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2998		.procname	= "gc_min_interval_ms",
2999		.data		= &ip_rt_gc_min_interval,
3000		.maxlen		= sizeof(int),
3001		.mode		= 0644,
3002		.proc_handler	= &proc_dointvec_ms_jiffies,
3003		.strategy	= &sysctl_ms_jiffies,
3004	},
3005	{
3006		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
3007		.procname	= "gc_timeout",
3008		.data		= &ip_rt_gc_timeout,
3009		.maxlen		= sizeof(int),
3010		.mode		= 0644,
3011		.proc_handler	= &proc_dointvec_jiffies,
3012		.strategy	= &sysctl_jiffies,
3013	},
3014	{
3015		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
3016		.procname	= "gc_interval",
3017		.data		= &ip_rt_gc_interval,
3018		.maxlen		= sizeof(int),
3019		.mode		= 0644,
3020		.proc_handler	= &proc_dointvec_jiffies,
3021		.strategy	= &sysctl_jiffies,
3022	},
3023	{
3024		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
3025		.procname	= "redirect_load",
3026		.data		= &ip_rt_redirect_load,
3027		.maxlen		= sizeof(int),
3028		.mode		= 0644,
3029		.proc_handler	= &proc_dointvec,
3030	},
3031	{
3032		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
3033		.procname	= "redirect_number",
3034		.data		= &ip_rt_redirect_number,
3035		.maxlen		= sizeof(int),
3036		.mode		= 0644,
3037		.proc_handler	= &proc_dointvec,
3038	},
3039	{
3040		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
3041		.procname	= "redirect_silence",
3042		.data		= &ip_rt_redirect_silence,
3043		.maxlen		= sizeof(int),
3044		.mode		= 0644,
3045		.proc_handler	= &proc_dointvec,
3046	},
3047	{
3048		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3049		.procname	= "error_cost",
3050		.data		= &ip_rt_error_cost,
3051		.maxlen		= sizeof(int),
3052		.mode		= 0644,
3053		.proc_handler	= &proc_dointvec,
3054	},
3055	{
3056		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3057		.procname	= "error_burst",
3058		.data		= &ip_rt_error_burst,
3059		.maxlen		= sizeof(int),
3060		.mode		= 0644,
3061		.proc_handler	= &proc_dointvec,
3062	},
3063	{
3064		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3065		.procname	= "gc_elasticity",
3066		.data		= &ip_rt_gc_elasticity,
3067		.maxlen		= sizeof(int),
3068		.mode		= 0644,
3069		.proc_handler	= &proc_dointvec,
3070	},
3071	{
3072		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3073		.procname	= "mtu_expires",
3074		.data		= &ip_rt_mtu_expires,
3075		.maxlen		= sizeof(int),
3076		.mode		= 0644,
3077		.proc_handler	= &proc_dointvec_jiffies,
3078		.strategy	= &sysctl_jiffies,
3079	},
3080	{
3081		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3082		.procname	= "min_pmtu",
3083		.data		= &ip_rt_min_pmtu,
3084		.maxlen		= sizeof(int),
3085		.mode		= 0644,
3086		.proc_handler	= &proc_dointvec,
3087	},
3088	{
3089		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3090		.procname	= "min_adv_mss",
3091		.data		= &ip_rt_min_advmss,
3092		.maxlen		= sizeof(int),
3093		.mode		= 0644,
3094		.proc_handler	= &proc_dointvec,
3095	},
3096	{
3097		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3098		.procname	= "secret_interval",
3099		.data		= &ip_rt_secret_interval,
3100		.maxlen		= sizeof(int),
3101		.mode		= 0644,
3102		.proc_handler	= &proc_dointvec_jiffies,
3103		.strategy	= &sysctl_jiffies,
3104	},
3105	{ .ctl_name = 0 }
3106};
3107#endif
3108
3109#ifdef CONFIG_NET_CLS_ROUTE
3110struct ip_rt_acct *ip_rt_acct;
3111
3112/* This code sucks.  But you should have seen it before! --RR */
3113
3114/* IP route accounting ptr for this logical cpu number. */
3115#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3116
3117#ifdef CONFIG_PROC_FS
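/* Read handler for /proc/net/rt_acct: return the route accounting
 * counters accumulated over the possible CPUs.
 */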
3118static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3119			   int length, int *eof, void *data)
3120{
3121	unsigned int i;
3122
3123	if ((offset & 3) || (length & 3))
3124		return -EIO;
3125
3126	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3127		*eof = 1;
3128		return 0;
3129	}
3130
3131	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3132		length = sizeof(struct ip_rt_acct) * 256 - offset;
3133		*eof = 1;
3134	}
3135
3136	offset /= sizeof(u32);
3137
3138	if (length > 0) {
3139		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3140		u32 *dst = (u32 *) buffer;
3141
3142		/* Copy first cpu. */
3143		*start = buffer;
3144		memcpy(dst, src, length);
3145
3146		/* Add the other cpus in, one int at a time */
3147		for_each_possible_cpu(i) {
3148			unsigned int j;
3149
3150			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3151
3152			for (j = 0; j < length/4; j++)
3153				dst[j] += src[j];
3154		}
3155	}
3156	return length;
3157}
3158#endif /* CONFIG_PROC_FS */
3159#endif /* CONFIG_NET_CLS_ROUTE */
3160
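/* "rhash_entries=" boot parameter: override the default sizing of the
 * route cache hash table allocated in ip_rt_init().
 */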
3161static __initdata unsigned long rhash_entries;
3162static int __init set_rhash_entries(char *str)
3163{
3164	if (!str)
3165		return 0;
3166	rhash_entries = simple_strtoul(str, &str, 0);
3167	return 1;
3168}
3169__setup("rhash_entries=", set_rhash_entries);
3170
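/* Route cache initialization: allocate the dst slab cache and the hash
 * table, initialize devinet and the FIB, start the periodic flush, expire
 * and secret-rebuild timers, create the /proc entries and register the
 * RTM_GETROUTE handler.
 */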
3171int __init ip_rt_init(void)
3172{
3173	int rc = 0;
3174
3175	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3176			     (jiffies ^ (jiffies >> 7)));
3177
3178#ifdef CONFIG_NET_CLS_ROUTE
3179	{
3180	int order;
3181	for (order = 0;
3182	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3183		/* NOTHING */;
3184	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3185	if (!ip_rt_acct)
3186		panic("IP: failed to allocate ip_rt_acct\n");
3187	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3188	}
3189#endif
3190
3191	ipv4_dst_ops.kmem_cachep =
3192		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3193				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3194
3195	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3196
3197	rt_hash_table = (struct rt_hash_bucket *)
3198		alloc_large_system_hash("IP route cache",
3199					sizeof(struct rt_hash_bucket),
3200					rhash_entries,
3201					(num_physpages >= 128 * 1024) ?
3202					15 : 17,
3203					0,
3204					&rt_hash_log,
3205					&rt_hash_mask,
3206					0);
3207	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3208	rt_hash_lock_init();
3209
3210	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3211	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3212
3213	devinet_init();
3214	ip_fib_init();
3215
3216	init_timer(&rt_flush_timer);
3217	rt_flush_timer.function = rt_run_flush;
3218	init_timer(&rt_periodic_timer);
3219	rt_periodic_timer.function = rt_check_expire;
3220	init_timer(&rt_secret_timer);
3221	rt_secret_timer.function = rt_secret_rebuild;
3222
3223	/* All the timers started at system startup tend
3224	   to synchronize. Perturb them a bit.
3225	 */
3226	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3227					ip_rt_gc_interval;
3228	add_timer(&rt_periodic_timer);
3229
3230	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3231		ip_rt_secret_interval;
3232	add_timer(&rt_secret_timer);
3233
3234#ifdef CONFIG_PROC_FS
3235	{
3236	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3237	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3238	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3239					     proc_net_stat))) {
3240		return -ENOMEM;
3241	}
3242	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3243	}
3244#ifdef CONFIG_NET_CLS_ROUTE
3245	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3246#endif
3247#endif
3248#ifdef CONFIG_XFRM
3249	xfrm_init();
3250	xfrm4_init();
3251#endif
3252	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3253
3254	return rc;
3255}
3256
3257EXPORT_SYMBOL(__ip_select_ident);
3258EXPORT_SYMBOL(ip_route_input);
3259EXPORT_SYMBOL(ip_route_output_key);
3260