/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.1.1.1 2007/08/03 18:53:51 Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;
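/*
 * Illustrative note on the defaults above: ip_rt_redirect_silence is
 * ip_rt_redirect_load shifted left by (ip_rt_redirect_number + 1), i.e.
 * (HZ/50) << 10, roughly 20 seconds at any HZ.  That is about the point
 * where the exponential backoff in ip_rt_send_redirect() would have
 * spaced redirects further apart than the silence window itself, so the
 * counter is reset instead of backed off further.
 */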
#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
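/*
 * Worked example of the lock striping (illustrative): with NR_CPUS = 8,
 * RT_HASH_LOCK_SZ is 1024, and rt_hash_lock_addr(slot) simply masks the
 * bucket index with RT_HASH_LOCK_SZ - 1, so buckets 5, 1029, 2053, ...
 * all share lock 5.  Colliding buckets merely serialize their writers;
 * readers never take these locks at all, they rely on RCU.
 */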
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
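/*
 * Sketch of the hashing scheme: jhash_2words() mixes daddr with
 * (saddr ^ (iif-or-oif << 5)) under the random key rt_hash_rnd, and the
 * result is masked down to the table size.  rt_hash_rnd is re-drawn in
 * rt_run_flush() whenever the cache is flushed, and the rt_secret_timer
 * forces such a flush every ip_rt_secret_interval, which makes it hard
 * for a remote sender to aim traffic at one bucket and degrade a chain.
 */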
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc)
		goto out_kfree;
	seq          = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	multipath_remove(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	multipath_remove(rt);
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
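/*
 * Illustrative reading of rt_score(): the low 30 bits hold ~age, so a
 * just-used entry scores high and an idle one low; bit 30 marks output
 * routes (and input routes that are not broadcast/multicast/local);
 * bit 31 marks redirected/notify/expiring entries.  rt_intern_hash()
 * later evicts the unreferenced entry with the *lowest* score when a
 * chain grows too long.
 */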
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
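/*
 * compare_keys() relies on a branch-free identity: a == b exactly when
 * (a ^ b) == 0, so OR-ing together the XORs of every field of the two
 * flow keys yields zero iff all fields match.  The u16 load starting at
 * fl4_tos appears to cover the adjacent scope byte as well, folding two
 * fields into one compare.
 */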
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	int passedexpired = 0;
	struct rtable **nextstep = NULL;
	struct rtable **rthp = chain_head;
	struct rtable *rth;

	if (removed_count)
		*removed_count = 0;

	while ((rth = *rthp) != NULL) {
		if (rth == expentry)
			passedexpired = 1;

		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
			if (*rthp == expentry) {
				*rthp = rth->u.dst.rt_next;
				continue;
			} else {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				if (removed_count)
					++(*removed_count);
			}
		} else {
			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
			    passedexpired && !nextstep)
				nextstep = &rth->u.dst.rt_next;

			rthp = &rth->u.dst.rt_next;
		}
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */


/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (*rthp == 0)
			continue;
		spin_lock(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
			/* remove all related balanced entries if necessary */
			if (rth->u.dst.flags & DST_BALANCED) {
				rthp = rt_remove_balanced_route(
					&rt_hash_table[i].chain,
					rth, NULL);
				if (!rthp)
					break;
			} else {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
			}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
		}
		spin_unlock(rt_hash_lock_addr(i));

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
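/*
 * Scan-budget arithmetic, worked through with the defaults: goal =
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout, i.e.
 * buckets * 60/300, so each timer run walks one fifth of the table and
 * every bucket is revisited roughly once per ip_rt_gc_timeout.  Within
 * a chain, "tmo" halves for every entry kept, which spares long-lived
 * chains from being emptied in a single pass.
 */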
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	/* flush existing multipath state*/
	multipath_flush();

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline is not reached, prolong the timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}
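/*
 * Flush-delay semantics in brief (a reader's sketch): a negative delay
 * means "use ip_rt_min_delay"; zero means flush synchronously right
 * here; a positive delay arms rt_flush_timer, but never later than the
 * rt_deadline established ip_rt_max_delay after the first pending
 * request, so a stream of deferred flush requests cannot postpone the
 * actual flush forever.
 */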
static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle,
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
				/* remove all related balanced entries
				 * if necessary
				 */
				if (rth->u.dst.flags & DST_BALANCED) {
					int r;

					rthp = rt_remove_balanced_route(
						&rt_hash_table[k].chain,
						rth,
						&r);
					goal -= r;
					if (!rthp)
						break;
				} else {
					*rthp = rth->u.dst.rt_next;
					rt_free(rth);
					goal--;
				}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
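/*
 * Example of the "expire" feedback loop above: each successful pass adds
 * ip_rt_gc_min_interval back onto expire (capped at ip_rt_gc_timeout),
 * while every missed goal halves it.  Under sustained pressure expire
 * decays geometrically toward zero and entries of almost any age become
 * eligible; once the load subsides it climbs back up and warm entries
 * survive again.
 */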
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
		if (!(rth->u.dst.flags & DST_BALANCED) &&
		    compare_keys(&rth->fl, &rt->fl)) {
#else
		if (compare_keys(&rth->fl, &rt->fl)) {
#endif
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
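/*
 * Eviction note (a reader's summary): while scanning for a duplicate
 * key, rt_intern_hash() also remembers the lowest-scoring unreferenced
 * entry (cand).  Only when the finished chain is longer than
 * ip_rt_gc_elasticity is that candidate dropped, so short chains keep
 * even their least valuable entries.
 */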
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have little chance
 * of selecting an ID that stays unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
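/*
 * Sketch of IP ID selection: with a bound inet_peer the ID is, roughly,
 * a per-destination counter advanced by inet_getid(), which is intended
 * to keep IDs unique per peer.  The fallback path above keeps no such
 * per-destination state, hence the warning in its comment: it hashes the
 * previous value with the destination and hopes collisions within a
 * fragment lifetime are rare.
 */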
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.dst.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.dst.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
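/*
 * Worked numbers for the backoff (HZ = 1000, default tunables): the
 * first redirect goes out immediately; each following one must wait
 * ip_rt_redirect_load << rate_tokens, i.e. 40 ms, 80 ms, 160 ms, ...
 * roughly 5 s before the ninth.  After ip_rt_redirect_number redirects
 * have been ignored we go silent until ip_rt_redirect_silence (~20 s)
 * passes with no redirect-worthy traffic.
 */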
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
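/*
 * Token-bucket arithmetic for ip_error(), with the defaults: tokens
 * accrue one per elapsed jiffy up to a burst of ip_rt_error_burst
 * (5*HZ), and each ICMP error costs ip_rt_error_cost (HZ) of them, so
 * the steady-state rate is one ICMP per second with bursts of at most
 * five per destination.
 */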
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
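/*
 * Plateau example: a Fragmentation Needed message that reports old_mtu =
 * 1500 with no usable next-hop MTU makes guess_mtu() return 1492, the
 * next plateau strictly below it; a 600-byte report yields 576; anything
 * at or below 128 falls through to the IPv4 minimum of 68.
 */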
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
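/*
 * Clamp example: should a peer advertise a next-hop MTU of 300, both
 * paths above raise it to ip_rt_min_pmtu (552 = 512 + 20 + 20) and set
 * the RTAX_MTU lock bit, so later reports cannot drag the cached PMTU
 * below that floor either.
 */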
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != &loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of outgoing interfaces,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" may be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
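/*
 * Metric defaulting in rt_set_nexthop(), summarized: an unset MTU falls
 * back to the device MTU (capped to 576 for locked-MTU routes via a
 * gateway), an unset hoplimit falls back to sysctl_ip_default_ttl, and
 * an unset advmss defaults to max(device MTU - 40, ip_rt_min_advmss);
 * everything is then bounded by IP_MAX_MTU and 65535 - 40 respectively.
 */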
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static inline int ip_mkroute_input_def(struct sk_buff *skb,
				       struct fib_result* res,
				       const struct flowi *fl,
				       struct in_device *in_dev,
				       __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}

static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL, *rtres;
	unsigned char hop, hopcount;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* put reference to previous result */
		if (hop)
			ip_rt_put(rtres);

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash(daddr, saddr, fl->iif);
		err = rt_intern_hash(hash, rth, &rtres);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));
	}
	skb->dst = &rtres->u.dst;
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
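/*
 * Note on the cached-multipath variant above (a sketch): every nexthop
 * of the fib_info is interned separately, each pass dropping the
 * reference taken for the previous alternative, and the final rtable is
 * the one attached to the skb.  multipath_set_nhinfo() hands the
 * configured mp algorithm the network/netmask data it needs to tell the
 * sibling entries apart later.
 */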
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have a correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
		    rth->fl.mark == skb->mark &&
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result a host on a multicasting
	   network acquires a lot of useless route cache entries, a sort of
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   a route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
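/*
 * Fast-path lookup key, spelled out: an input route hits only when
 * daddr, saddr, iif, mark and tos all match and oif is zero.  tos has
 * already been masked with IPTOS_RT_MASK, so the low (ECN) bits of the
 * TOS byte never cause a spurious miss.
 */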
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
		    rth->fl.mark == skb->mark &&
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry *)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic was moved from the route cache to
	   here.  The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-(  As a result, a
	   host on a multicast network collects a lot of useless route
	   cache entries, e.g. for SDR messages from all over the world,
	   so we try to get rid of them here.  Provided the software IP
	   multicast filter is organized reasonably (at least hashed),
	   this is no slower than keeping reject entries in the route
	   cache.
	   Note that multicast routers are not affected, because a
	   route cache entry is created for them eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
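/*
 * Editor's sketch: how the receive path is expected to call
 * ip_route_input().  This mirrors the ip_rcv_finish() pattern; the
 * function below is illustrative only and not part of route.c.
 */
#if 0
static int example_rx_route(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	/* On a hit this sets skb->dst straight from the cache; on a
	 * miss ip_route_input_slow() consults the FIB and interns a
	 * new entry. */
	err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* dst_input() dispatches to rth->u.dst.input, i.e. one of
	 * ip_local_deliver, ip_forward, ip_error or ip_mr_input. */
	return dst_input(skb);
}
#endif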
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result *res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If no specific multicast route exists, fall back to
		   the default route, but do not use its gateway in
		   that case.  Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi) {
		rth->rt_multipath_alg = res->fi->fib_mp_alg;
		if (res->fi->fib_nhs > 1)
			rth->u.dst.flags |= DST_BALANCED;
	}
#endif
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	   routing cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
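/*
 * Editor's note: __mkroute_output() only allocates and fills the
 * rtable; the callers below intern the entry into rt_hash_table via
 * rt_intern_hash().  The handlers are chosen here once: ip_output for
 * plain unicast, ip_mc_output for broadcast/multicast leaving a real
 * device, and ip_local_deliver as the input hook when RTCF_LOCAL is
 * set.
 */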
static inline int ip_mkroute_output_def(struct rtable **rp,
					struct fib_result *res,
					const struct flowi *fl,
					const struct flowi *oldflp,
					struct net_device *dev_out,
					unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}

static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result *res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
				       oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
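/*
 * Editor's note: with CONFIG_IP_ROUTE_MULTIPATH_CACHED the loop above
 * creates one cache entry per next hop, all interned under the same
 * (daddr, saddr, oif) hash, and multipath_set_nhinfo() tags each entry
 * so the configured algorithm can pick among them at lookup time (see
 * multipath_select_route() in __ip_route_output_key() below).
 */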
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* The check for oif == dev_out->oif was removed here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the
		      saddr of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong.
			   Assume that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses.  When oif is specified, the
			   routing tables are looked up with only one
			   purpose: to catch whether the destination
			   is gatewayed rather than direct.  Moreover,
			   if MSG_DONTROUTE is set, we send the packet
			   ignoring both the routing tables and the
			   ifaddr state. --ANK


			   We could do this even when oif is unknown,
			   as IPv6 likely would, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
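/*
 * Editor's note: the resolver above works in four steps: (1) validate
 * an explicit source address and honour the multicast/broadcast
 * shortcut; (2) pin down dev_out when an output interface was given;
 * (3) default a missing destination to the loopback route; (4) consult
 * the FIB, falling back to an on-link RTN_UNICAST guess when a lookup
 * with an explicit oif fails.  Only then is a cache entry built via
 * ip_mkroute_output().
 */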
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK))) {

			/* check for multipath routes and choose one if
			 * necessary
			 */
			if (multipath_select_route(flp, rth, rp)) {
				dst_hold(&(*rp)->u.dst);
				RT_CACHE_STAT_INC(out_hit);
				rcu_read_unlock_bh();
				return 0;
			}

			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= __constant_htons(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_dst_check,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.entry_size		= sizeof(struct rtable),
};


static int ipv4_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = ipv4_blackhole_output;
		new->output = ipv4_blackhole_output;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
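/*
 * Editor's note: the blackhole dst above serves the -EREMOTE case in
 * ip_route_output_flow() below: when __xfrm_lookup() cannot resolve
 * IPsec state yet, the caller still gets a valid-looking route, but
 * every packet sent over it is silently freed by
 * ipv4_blackhole_output().  dst_free() on the fresh entry hands it to
 * the dst garbage collector instead of the route cache, so it dies
 * with its last reference.
 */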
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp, sk);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}
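/*
 * Editor's sketch: typical in-kernel use of ip_route_output_key().
 * "example_resolve" and its parameters are made up for illustration;
 * the flowi/ip_rt_put() usage follows the callers of this API.
 */
#if 0
static int example_resolve(__be32 daddr, __be32 saddr, int oif)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr } },
			    .oif = oif };
	struct rtable *rt;
	int err;

	err = ip_route_output_key(&rt, &fl);	/* cache, then slow path */
	if (err)
		return err;			/* e.g. -ENETUNREACH */

	/* ... use rt->rt_gateway, attach &rt->u.dst to an skb, ... */

	ip_rt_put(rt);				/* drop our reference */
	return 0;
}
#endif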
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
		NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
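/*
 * Editor's note: rt_fill_info() emits one rtnetlink message: an
 * nlmsghdr, a struct rtmsg header, then RTA_* attributes (RTA_DST,
 * RTA_SRC, RTA_OIF, RTA_GATEWAY, ...).  The NLA_PUT_* macros jump to
 * nla_put_failure when the skb runs out of tailroom, and
 * nlmsg_cancel() then trims the half-built message so the caller can
 * report -EMSGSIZE cleanly.
 */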
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass through a
	   good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header
	 * to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = (struct rtable *)skb->dst;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(&rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h) continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
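/*
 * Editor's sketch: the userspace side of inet_rtm_getroute(), roughly
 * what "ip route get 10.0.0.1" sends over a NETLINK_ROUTE socket.
 * Illustrative only (no error handling); the kernel replies with the
 * RTM_NEWROUTE message that rt_fill_info() builds.
 */
#if 0
struct {
	struct nlmsghdr	nlh;
	struct rtmsg	rtm;
	struct rtattr	rta;
	__be32		dst;
} req = {
	.nlh = {
		.nlmsg_len	= sizeof(req),
		.nlmsg_type	= RTM_GETROUTE,
		.nlmsg_flags	= NLM_F_REQUEST,
	},
	.rtm = { .rtm_family = AF_INET },
	.rta = {
		.rta_len	= RTA_LENGTH(sizeof(__be32)),
		.rta_type	= RTA_DST,
	},
	/* .dst = destination address, network byte order */
};
/* send(fd, &req, sizeof(req), 0) on a PF_NETLINK/NETLINK_ROUTE
 * socket, then recv() the RTM_NEWROUTE reply. */
#endif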
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
				     struct file *filp, void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
					      int __user *name,
					      int nlen,
					      void __user *oldval,
					      size_t __user *oldlenp,
					      void __user *newval,
					      size_t newlen)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}

ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
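/*
 * Editor's note: the table above appears under
 * /proc/sys/net/ipv4/route/.  For example, writing to the write-only
 * "flush" file ("echo 0 > /proc/sys/net/ipv4/route/flush") goes
 * through ipv4_sysctl_rtcache_flush() and forces a flush of the
 * routing cache; the jiffies-based entries (gc_timeout, mtu_expires,
 * ...) are converted from seconds by proc_dointvec_jiffies.
 */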
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time */
		for_each_possible_cpu(i) {
			unsigned int j;

			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	{
	int order;
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
	}
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers started at system boot tend to synchronize;
	   perturb them a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     proc_net_stat))) {
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
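/*
 * Editor's note: ip_rt_init() runs once during IPv4 initialization at
 * boot.  The hash sizing heuristic can be overridden with the
 * "rhash_entries=" boot parameter parsed by set_rhash_entries() above,
 * e.g. "rhash_entries=262144" on a busy router; gc_thresh and
 * ip_rt_max_size are then derived from the resulting table size.
 */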