/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize this to get rid of the dummy loop */

#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
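
/*
 * Illustrative usage of the macro pair above: for_nexthops() opens a
 * block declaring nhsel/nh together with the loop header, and
 * endfor_nexthops() supplies the closing brace, e.g.
 *
 *	for_nexthops(fi) {
 *		if (nh->nh_flags & RTNH_F_DEAD)
 *			continue;
 *		...
 *	} endfor_nexthops(fi);
 */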


static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
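
/*
 * Example (illustrative): fib_semantic_match() below consults this table,
 * so a lookup that resolves to an RTN_UNREACHABLE alias fails with
 * fib_props[RTN_UNREACHABLE].error, i.e. -EHOSTUNREACH, while the
 * routable types proceed with error == 0.
 */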


/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}
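
/*
 * Example: with DEVINDEX_HASHBITS == 8, an ifindex of 0x00012345 hashes
 * to (0x12345 ^ 0x123 ^ 0x1) & 0xFF == 0x67, folding all of the index
 * bits into the 8-bit table slot.
 */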

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
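
/*
 * fib_create_info() below relies on this lookup to share a single
 * fib_info between identical routes: on a hit the existing entry's
 * fib_treeref is bumped and the freshly built duplicate is freed.
 */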

/* Check that the gateway is already configured.
   Used only by the redirect accept routine.
   Returns 0 if the gateway is found, -1 otherwise.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
/* Return the first fib alias matching TOS whose priority is greater
 * than or equal to PRIO (i.e. the insertion point for a new alias
 * with the given tos/prio), or the first alias with a smaller TOS.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}
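
/*
 * Example (illustrative): with aliases ordered (tos 16, prio 0),
 * (tos 0, prio 0), (tos 0, prio 10), a call fib_find_alias(fah, 0, 5)
 * skips the tos-16 entry, passes the prio-0 entry and returns the
 * (tos 0, prio 10) alias, the insertion point for a new tos-0,
 * prio-5 alias.
 */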

int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}

#endif
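
/*
 * Illustrative layout of the RTA_MULTIPATH payload parsed above and
 * matched below: a stream of rtnexthop headers, each optionally
 * followed by nested attributes such as RTA_GATEWAY (and RTA_FLOW
 * with CONFIG_NET_CLS_ROUTE):
 *
 *	struct rtnexthop { rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex }
 *		[ RTA_GATEWAY 10.0.0.1 ]
 *	struct rtnexthop { ... }
 *		[ RTA_GATEWAY 10.0.0.2 ]
 *
 * rtnh_next() advances by rtnh_len, so rtnh_len must cover the header
 * plus its nested attributes; fib_dump_info() below shows the writer side.
 */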

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp == NULL)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_be32(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
   Picture
   -------

   The semantics of nexthops are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr but by a direct route.
   c) if both the gateway and the interface are specified, they must not
      contradict each other.
   d) with tunnel routes, the gateway could be not on-link.

   Attempting to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of the code practically does not increase, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes to
   coexist in peace.

   Normally it looks like this:

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
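
/*
 * Example (illustrative, in iproute2 terms): the picture above matches
 * what one builds with
 *
 *	ip route add 10.0.0.0/24 dev eth0		(link-scope prefix)
 *	ip route add default via 10.0.0.1 dev eth0	(universe prefix)
 *
 * where the gateway 10.0.0.1 resolves through the link-scope prefix.
 */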

static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* The clamp below is not strictly necessary, but seeing why requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}

/* Note! fib_semantic_match intentionally uses RCU list functions. */
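/*
 * Return convention (summarizing the code below): 0 means res was
 * filled in and holds a reference on the matched fib_info, a positive
 * value means no alias matched, and a negative value is the fib_props
 * error of the matched route type (e.g. -EHOSTUNREACH for
 * RTN_UNREACHABLE).
 */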
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}

/* Find an appropriate source address for this destination */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct hlist_node *node;
	struct fib_info *fi;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

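/*
 * Note on the 'force' argument below: when it is set, even nexthops
 * with nh_scope == RT_SCOPE_NOWHERE (local routes), which are normally
 * spared, are marked dead; force > 1 (device unregistration) further
 * kills a whole multipath route as soon as one of its nexthops uses
 * the departing device.
 */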
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up dead nexthops.
   This makes sense only for multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nexthop_nh->nh_dev == NULL ||
			    !(nexthop_nh->nh_dev->flags&IFF_UP))
				continue;
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
   The algorithm is suboptimal, but it provides a really
   fair weighted route distribution.
 */
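
/*
 * Worked example: with two alive nexthops of weight 2 and 1, the
 * replenish step below sets fib_power = 3 and nh_power = {2, 1}; each
 * selection then consumes one unit of the chosen nexthop's credit, so
 * every window of three selections distributes traffic 2:1 before the
 * credits are replenished.
 */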

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
				power += nexthop_nh->nh_weight;
				nexthop_nh->nh_power = nexthop_nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	   jiffies is a pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
		    nexthop_nh->nh_power) {
			if ((w -= nexthop_nh->nh_power) <= 0) {
				nexthop_nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif
