1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *	Linux INET6 implementation
4 *	FIB front-end.
5 *
6 *	Authors:
7 *	Pedro Roque		<roque@di.fc.ul.pt>
8 */
9
10/*	Changes:
11 *
12 *	YOSHIFUJI Hideaki @USAGI
13 *		reworked default router selection.
14 *		- respect outgoing interface
15 *		- select from (probably) reachable routers (i.e.
16 *		routers in REACHABLE, STALE, DELAY or PROBE states).
17 *		- always select the same router if it is (probably)
18 *		reachable.  otherwise, round-robin the list.
19 *	Ville Nuorvala
20 *		Fixed routing subtrees.
21 */
22
23#define pr_fmt(fmt) "IPv6: " fmt
24
25#include <linux/capability.h>
26#include <linux/errno.h>
27#include <linux/export.h>
28#include <linux/types.h>
29#include <linux/times.h>
30#include <linux/socket.h>
31#include <linux/sockios.h>
32#include <linux/net.h>
33#include <linux/route.h>
34#include <linux/netdevice.h>
35#include <linux/in6.h>
36#include <linux/mroute6.h>
37#include <linux/init.h>
38#include <linux/if_arp.h>
39#include <linux/proc_fs.h>
40#include <linux/seq_file.h>
41#include <linux/nsproxy.h>
42#include <linux/slab.h>
43#include <linux/jhash.h>
44#include <linux/siphash.h>
45#include <net/net_namespace.h>
46#include <net/snmp.h>
47#include <net/ipv6.h>
48#include <net/ip6_fib.h>
49#include <net/ip6_route.h>
50#include <net/ndisc.h>
51#include <net/addrconf.h>
52#include <net/tcp.h>
53#include <linux/rtnetlink.h>
54#include <net/dst.h>
55#include <net/dst_metadata.h>
56#include <net/xfrm.h>
57#include <net/netevent.h>
58#include <net/netlink.h>
59#include <net/rtnh.h>
60#include <net/lwtunnel.h>
61#include <net/ip_tunnels.h>
62#include <net/l3mdev.h>
63#include <net/ip.h>
64#include <linux/uaccess.h>
65#include <linux/btf_ids.h>
66
67#ifdef CONFIG_SYSCTL
68#include <linux/sysctl.h>
69#endif
70
71static int ip6_rt_type_to_error(u8 fib6_type);
72
73#define CREATE_TRACE_POINTS
74#include <trace/events/fib6.h>
75EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
76#undef CREATE_TRACE_POINTS
77
78enum rt6_nud_state {
79	RT6_NUD_FAIL_HARD = -3,
80	RT6_NUD_FAIL_PROBE = -2,
81	RT6_NUD_FAIL_DO_RR = -1,
82	RT6_NUD_SUCCEED = 1
83};
84
85INDIRECT_CALLABLE_SCOPE
86struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
87static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
88INDIRECT_CALLABLE_SCOPE
89unsigned int		ip6_mtu(const struct dst_entry *dst);
90static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91static void		ip6_dst_destroy(struct dst_entry *);
92static void		ip6_dst_ifdown(struct dst_entry *,
93				       struct net_device *dev);
94static void		 ip6_dst_gc(struct dst_ops *ops);
95
96static int		ip6_pkt_discard(struct sk_buff *skb);
97static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98static int		ip6_pkt_prohibit(struct sk_buff *skb);
99static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100static void		ip6_link_failure(struct sk_buff *skb);
101static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102					   struct sk_buff *skb, u32 mtu,
103					   bool confirm_neigh);
104static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
105					struct sk_buff *skb);
106static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
107			   int strict);
108static size_t rt6_nlmsg_size(struct fib6_info *f6i);
109static int rt6_fill_node(struct net *net, struct sk_buff *skb,
110			 struct fib6_info *rt, struct dst_entry *dst,
111			 struct in6_addr *dest, struct in6_addr *src,
112			 int iif, int type, u32 portid, u32 seq,
113			 unsigned int flags);
114static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
115					   const struct in6_addr *daddr,
116					   const struct in6_addr *saddr);
117
118#ifdef CONFIG_IPV6_ROUTE_INFO
119static struct fib6_info *rt6_add_route_info(struct net *net,
120					   const struct in6_addr *prefix, int prefixlen,
121					   const struct in6_addr *gwaddr,
122					   struct net_device *dev,
123					   unsigned int pref);
124static struct fib6_info *rt6_get_route_info(struct net *net,
125					   const struct in6_addr *prefix, int prefixlen,
126					   const struct in6_addr *gwaddr,
127					   struct net_device *dev);
128#endif
129
130struct uncached_list {
131	spinlock_t		lock;
132	struct list_head	head;
133	struct list_head	quarantine;
134};
135
136static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
137
138void rt6_uncached_list_add(struct rt6_info *rt)
139{
140	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
141
142	rt->dst.rt_uncached_list = ul;
143
144	spin_lock_bh(&ul->lock);
145	list_add_tail(&rt->dst.rt_uncached, &ul->head);
146	spin_unlock_bh(&ul->lock);
147}
148
149void rt6_uncached_list_del(struct rt6_info *rt)
150{
151	if (!list_empty(&rt->dst.rt_uncached)) {
152		struct uncached_list *ul = rt->dst.rt_uncached_list;
153
154		spin_lock_bh(&ul->lock);
155		list_del_init(&rt->dst.rt_uncached);
156		spin_unlock_bh(&ul->lock);
157	}
158}
159
160static void rt6_uncached_list_flush_dev(struct net_device *dev)
161{
162	int cpu;
163
164	for_each_possible_cpu(cpu) {
165		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
166		struct rt6_info *rt, *safe;
167
168		if (list_empty(&ul->head))
169			continue;
170
171		spin_lock_bh(&ul->lock);
172		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
173			struct inet6_dev *rt_idev = rt->rt6i_idev;
174			struct net_device *rt_dev = rt->dst.dev;
175			bool handled = false;
176
177			if (rt_idev->dev == dev) {
178				rt->rt6i_idev = in6_dev_get(blackhole_netdev);
179				in6_dev_put(rt_idev);
180				handled = true;
181			}
182
183			if (rt_dev == dev) {
184				rt->dst.dev = blackhole_netdev;
185				netdev_ref_replace(rt_dev, blackhole_netdev,
186						   &rt->dst.dev_tracker,
187						   GFP_ATOMIC);
188				handled = true;
189			}
190			if (handled)
191				list_move(&rt->dst.rt_uncached,
192					  &ul->quarantine);
193		}
194		spin_unlock_bh(&ul->lock);
195	}
196}
197
198static inline const void *choose_neigh_daddr(const struct in6_addr *p,
199					     struct sk_buff *skb,
200					     const void *daddr)
201{
202	if (!ipv6_addr_any(p))
203		return (const void *) p;
204	else if (skb)
205		return &ipv6_hdr(skb)->daddr;
206	return daddr;
207}
208
209struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
210				   struct net_device *dev,
211				   struct sk_buff *skb,
212				   const void *daddr)
213{
214	struct neighbour *n;
215
216	daddr = choose_neigh_daddr(gw, skb, daddr);
217	n = __ipv6_neigh_lookup(dev, daddr);
218	if (n)
219		return n;
220
221	n = neigh_create(&nd_tbl, daddr, dev);
222	return IS_ERR(n) ? NULL : n;
223}
224
225static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
226					      struct sk_buff *skb,
227					      const void *daddr)
228{
229	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
230
231	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
232				dst->dev, skb, daddr);
233}
234
235static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
236{
237	struct net_device *dev = dst->dev;
238	struct rt6_info *rt = (struct rt6_info *)dst;
239
240	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
241	if (!daddr)
242		return;
243	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
244		return;
245	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
246		return;
247	__ipv6_confirm_neigh(dev, daddr);
248}
249
250static struct dst_ops ip6_dst_ops_template = {
251	.family			=	AF_INET6,
252	.gc			=	ip6_dst_gc,
253	.gc_thresh		=	1024,
254	.check			=	ip6_dst_check,
255	.default_advmss		=	ip6_default_advmss,
256	.mtu			=	ip6_mtu,
257	.cow_metrics		=	dst_cow_metrics_generic,
258	.destroy		=	ip6_dst_destroy,
259	.ifdown			=	ip6_dst_ifdown,
260	.negative_advice	=	ip6_negative_advice,
261	.link_failure		=	ip6_link_failure,
262	.update_pmtu		=	ip6_rt_update_pmtu,
263	.redirect		=	rt6_do_redirect,
264	.local_out		=	__ip6_local_out,
265	.neigh_lookup		=	ip6_dst_neigh_lookup,
266	.confirm_neigh		=	ip6_confirm_neigh,
267};
268
269static struct dst_ops ip6_dst_blackhole_ops = {
270	.family			= AF_INET6,
271	.default_advmss		= ip6_default_advmss,
272	.neigh_lookup		= ip6_dst_neigh_lookup,
273	.check			= ip6_dst_check,
274	.destroy		= ip6_dst_destroy,
275	.cow_metrics		= dst_cow_metrics_generic,
276	.update_pmtu		= dst_blackhole_update_pmtu,
277	.redirect		= dst_blackhole_redirect,
278	.mtu			= dst_blackhole_mtu,
279};
280
281static const u32 ip6_template_metrics[RTAX_MAX] = {
282	[RTAX_HOPLIMIT - 1] = 0,
283};
284
285static const struct fib6_info fib6_null_entry_template = {
286	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
287	.fib6_protocol  = RTPROT_KERNEL,
288	.fib6_metric	= ~(u32)0,
289	.fib6_ref	= REFCOUNT_INIT(1),
290	.fib6_type	= RTN_UNREACHABLE,
291	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
292};
293
294static const struct rt6_info ip6_null_entry_template = {
295	.dst = {
296		.__rcuref	= RCUREF_INIT(1),
297		.__use		= 1,
298		.obsolete	= DST_OBSOLETE_FORCE_CHK,
299		.error		= -ENETUNREACH,
300		.input		= ip6_pkt_discard,
301		.output		= ip6_pkt_discard_out,
302	},
303	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
304};
305
306#ifdef CONFIG_IPV6_MULTIPLE_TABLES
307
308static const struct rt6_info ip6_prohibit_entry_template = {
309	.dst = {
310		.__rcuref	= RCUREF_INIT(1),
311		.__use		= 1,
312		.obsolete	= DST_OBSOLETE_FORCE_CHK,
313		.error		= -EACCES,
314		.input		= ip6_pkt_prohibit,
315		.output		= ip6_pkt_prohibit_out,
316	},
317	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
318};
319
320static const struct rt6_info ip6_blk_hole_entry_template = {
321	.dst = {
322		.__rcuref	= RCUREF_INIT(1),
323		.__use		= 1,
324		.obsolete	= DST_OBSOLETE_FORCE_CHK,
325		.error		= -EINVAL,
326		.input		= dst_discard,
327		.output		= dst_discard_out,
328	},
329	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
330};
331
332#endif
333
334static void rt6_info_init(struct rt6_info *rt)
335{
336	memset_after(rt, 0, dst);
337}
338
339/* allocate dst with ip6_dst_ops */
340struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
341			       int flags)
342{
343	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
344					DST_OBSOLETE_FORCE_CHK, flags);
345
346	if (rt) {
347		rt6_info_init(rt);
348		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
349	}
350
351	return rt;
352}
353EXPORT_SYMBOL(ip6_dst_alloc);
354
355static void ip6_dst_destroy(struct dst_entry *dst)
356{
357	struct rt6_info *rt = (struct rt6_info *)dst;
358	struct fib6_info *from;
359	struct inet6_dev *idev;
360
361	ip_dst_metrics_put(dst);
362	rt6_uncached_list_del(rt);
363
364	idev = rt->rt6i_idev;
365	if (idev) {
366		rt->rt6i_idev = NULL;
367		in6_dev_put(idev);
368	}
369
370	from = xchg((__force struct fib6_info **)&rt->from, NULL);
371	fib6_info_release(from);
372}
373
374static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
375{
376	struct rt6_info *rt = (struct rt6_info *)dst;
377	struct inet6_dev *idev = rt->rt6i_idev;
378
379	if (idev && idev->dev != blackhole_netdev) {
380		struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
381
382		if (blackhole_idev) {
383			rt->rt6i_idev = blackhole_idev;
384			in6_dev_put(idev);
385		}
386	}
387}
388
389static bool __rt6_check_expired(const struct rt6_info *rt)
390{
391	if (rt->rt6i_flags & RTF_EXPIRES)
392		return time_after(jiffies, rt->dst.expires);
393	else
394		return false;
395}
396
397static bool rt6_check_expired(const struct rt6_info *rt)
398{
399	struct fib6_info *from;
400
401	from = rcu_dereference(rt->from);
402
403	if (rt->rt6i_flags & RTF_EXPIRES) {
404		if (time_after(jiffies, rt->dst.expires))
405			return true;
406	} else if (from) {
407		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
408			fib6_check_expired(from);
409	}
410	return false;
411}
412
413void fib6_select_path(const struct net *net, struct fib6_result *res,
414		      struct flowi6 *fl6, int oif, bool have_oif_match,
415		      const struct sk_buff *skb, int strict)
416{
417	struct fib6_info *sibling, *next_sibling;
418	struct fib6_info *match = res->f6i;
419
420	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
421		goto out;
422
423	if (match->nh && have_oif_match && res->nh)
424		return;
425
426	if (skb)
427		IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
428
429	/* We might have already computed the hash for ICMPv6 errors. In such
430	 * case it will always be non-zero. Otherwise now is the time to do it.
431	 */
432	if (!fl6->mp_hash &&
433	    (!match->nh || nexthop_is_multipath(match->nh)))
434		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
435
436	if (unlikely(match->nh)) {
437		nexthop_path_fib6_result(res, fl6->mp_hash);
438		return;
439	}
440
441	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
442		goto out;
443
444	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
445				 fib6_siblings) {
446		const struct fib6_nh *nh = sibling->fib6_nh;
447		int nh_upper_bound;
448
449		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
450		if (fl6->mp_hash > nh_upper_bound)
451			continue;
452		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
453			break;
454		match = sibling;
455		break;
456	}
457
458out:
459	res->f6i = match;
460	res->nh = match->fib6_nh;
461}
462
463/*
464 *	Route lookup. rcu_read_lock() should be held.
465 */
466
467static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
468			       const struct in6_addr *saddr, int oif, int flags)
469{
470	const struct net_device *dev;
471
472	if (nh->fib_nh_flags & RTNH_F_DEAD)
473		return false;
474
475	dev = nh->fib_nh_dev;
476	if (oif) {
477		if (dev->ifindex == oif)
478			return true;
479	} else {
480		if (ipv6_chk_addr(net, saddr, dev,
481				  flags & RT6_LOOKUP_F_IFACE))
482			return true;
483	}
484
485	return false;
486}
487
488struct fib6_nh_dm_arg {
489	struct net		*net;
490	const struct in6_addr	*saddr;
491	int			oif;
492	int			flags;
493	struct fib6_nh		*nh;
494};
495
496static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
497{
498	struct fib6_nh_dm_arg *arg = _arg;
499
500	arg->nh = nh;
501	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
502				  arg->flags);
503}
504
505/* returns fib6_nh from nexthop or NULL */
506static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
507					struct fib6_result *res,
508					const struct in6_addr *saddr,
509					int oif, int flags)
510{
511	struct fib6_nh_dm_arg arg = {
512		.net   = net,
513		.saddr = saddr,
514		.oif   = oif,
515		.flags = flags,
516	};
517
518	if (nexthop_is_blackhole(nh))
519		return NULL;
520
521	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
522		return arg.nh;
523
524	return NULL;
525}
526
527static void rt6_device_match(struct net *net, struct fib6_result *res,
528			     const struct in6_addr *saddr, int oif, int flags)
529{
530	struct fib6_info *f6i = res->f6i;
531	struct fib6_info *spf6i;
532	struct fib6_nh *nh;
533
534	if (!oif && ipv6_addr_any(saddr)) {
535		if (unlikely(f6i->nh)) {
536			nh = nexthop_fib6_nh(f6i->nh);
537			if (nexthop_is_blackhole(f6i->nh))
538				goto out_blackhole;
539		} else {
540			nh = f6i->fib6_nh;
541		}
542		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
543			goto out;
544	}
545
546	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
547		bool matched = false;
548
549		if (unlikely(spf6i->nh)) {
550			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
551					      oif, flags);
552			if (nh)
553				matched = true;
554		} else {
555			nh = spf6i->fib6_nh;
556			if (__rt6_device_match(net, nh, saddr, oif, flags))
557				matched = true;
558		}
559		if (matched) {
560			res->f6i = spf6i;
561			goto out;
562		}
563	}
564
565	if (oif && flags & RT6_LOOKUP_F_IFACE) {
566		res->f6i = net->ipv6.fib6_null_entry;
567		nh = res->f6i->fib6_nh;
568		goto out;
569	}
570
571	if (unlikely(f6i->nh)) {
572		nh = nexthop_fib6_nh(f6i->nh);
573		if (nexthop_is_blackhole(f6i->nh))
574			goto out_blackhole;
575	} else {
576		nh = f6i->fib6_nh;
577	}
578
579	if (nh->fib_nh_flags & RTNH_F_DEAD) {
580		res->f6i = net->ipv6.fib6_null_entry;
581		nh = res->f6i->fib6_nh;
582	}
583out:
584	res->nh = nh;
585	res->fib6_type = res->f6i->fib6_type;
586	res->fib6_flags = res->f6i->fib6_flags;
587	return;
588
589out_blackhole:
590	res->fib6_flags |= RTF_REJECT;
591	res->fib6_type = RTN_BLACKHOLE;
592	res->nh = nh;
593}
594
595#ifdef CONFIG_IPV6_ROUTER_PREF
596struct __rt6_probe_work {
597	struct work_struct work;
598	struct in6_addr target;
599	struct net_device *dev;
600	netdevice_tracker dev_tracker;
601};
602
603static void rt6_probe_deferred(struct work_struct *w)
604{
605	struct in6_addr mcaddr;
606	struct __rt6_probe_work *work =
607		container_of(w, struct __rt6_probe_work, work);
608
609	addrconf_addr_solict_mult(&work->target, &mcaddr);
610	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
611	netdev_put(work->dev, &work->dev_tracker);
612	kfree(work);
613}
614
615static void rt6_probe(struct fib6_nh *fib6_nh)
616{
617	struct __rt6_probe_work *work = NULL;
618	const struct in6_addr *nh_gw;
619	unsigned long last_probe;
620	struct neighbour *neigh;
621	struct net_device *dev;
622	struct inet6_dev *idev;
623
624	/*
625	 * Okay, this does not seem to be appropriate
626	 * for now, however, we need to check if it
627	 * is really so; aka Router Reachability Probing.
628	 *
629	 * Router Reachability Probe MUST be rate-limited
630	 * to no more than one per minute.
631	 */
632	if (!fib6_nh->fib_nh_gw_family)
633		return;
634
635	nh_gw = &fib6_nh->fib_nh_gw6;
636	dev = fib6_nh->fib_nh_dev;
637	rcu_read_lock();
638	last_probe = READ_ONCE(fib6_nh->last_probe);
639	idev = __in6_dev_get(dev);
640	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
641	if (neigh) {
642		if (READ_ONCE(neigh->nud_state) & NUD_VALID)
643			goto out;
644
645		write_lock_bh(&neigh->lock);
646		if (!(neigh->nud_state & NUD_VALID) &&
647		    time_after(jiffies,
648			       neigh->updated +
649			       READ_ONCE(idev->cnf.rtr_probe_interval))) {
650			work = kmalloc(sizeof(*work), GFP_ATOMIC);
651			if (work)
652				__neigh_set_probe_once(neigh);
653		}
654		write_unlock_bh(&neigh->lock);
655	} else if (time_after(jiffies, last_probe +
656				       READ_ONCE(idev->cnf.rtr_probe_interval))) {
657		work = kmalloc(sizeof(*work), GFP_ATOMIC);
658	}
659
660	if (!work || cmpxchg(&fib6_nh->last_probe,
661			     last_probe, jiffies) != last_probe) {
662		kfree(work);
663	} else {
664		INIT_WORK(&work->work, rt6_probe_deferred);
665		work->target = *nh_gw;
666		netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
667		work->dev = dev;
668		schedule_work(&work->work);
669	}
670
671out:
672	rcu_read_unlock();
673}
674#else
675static inline void rt6_probe(struct fib6_nh *fib6_nh)
676{
677}
678#endif
679
680/*
681 * Default Router Selection (RFC 2461 6.3.6)
682 */
683static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
684{
685	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
686	struct neighbour *neigh;
687
688	rcu_read_lock();
689	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
690					  &fib6_nh->fib_nh_gw6);
691	if (neigh) {
692		u8 nud_state = READ_ONCE(neigh->nud_state);
693
694		if (nud_state & NUD_VALID)
695			ret = RT6_NUD_SUCCEED;
696#ifdef CONFIG_IPV6_ROUTER_PREF
697		else if (!(nud_state & NUD_FAILED))
698			ret = RT6_NUD_SUCCEED;
699		else
700			ret = RT6_NUD_FAIL_PROBE;
701#endif
702	} else {
703		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
704		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
705	}
706	rcu_read_unlock();
707
708	return ret;
709}
710
711static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
712			   int strict)
713{
714	int m = 0;
715
716	if (!oif || nh->fib_nh_dev->ifindex == oif)
717		m = 2;
718
719	if (!m && (strict & RT6_LOOKUP_F_IFACE))
720		return RT6_NUD_FAIL_HARD;
721#ifdef CONFIG_IPV6_ROUTER_PREF
722	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
723#endif
724	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
725	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
726		int n = rt6_check_neigh(nh);
727		if (n < 0)
728			return n;
729	}
730	return m;
731}
732
733static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
734		       int oif, int strict, int *mpri, bool *do_rr)
735{
736	bool match_do_rr = false;
737	bool rc = false;
738	int m;
739
740	if (nh->fib_nh_flags & RTNH_F_DEAD)
741		goto out;
742
743	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
744	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
745	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
746		goto out;
747
748	m = rt6_score_route(nh, fib6_flags, oif, strict);
749	if (m == RT6_NUD_FAIL_DO_RR) {
750		match_do_rr = true;
751		m = 0; /* lowest valid score */
752	} else if (m == RT6_NUD_FAIL_HARD) {
753		goto out;
754	}
755
756	if (strict & RT6_LOOKUP_F_REACHABLE)
757		rt6_probe(nh);
758
759	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
760	if (m > *mpri) {
761		*do_rr = match_do_rr;
762		*mpri = m;
763		rc = true;
764	}
765out:
766	return rc;
767}
768
769struct fib6_nh_frl_arg {
770	u32		flags;
771	int		oif;
772	int		strict;
773	int		*mpri;
774	bool		*do_rr;
775	struct fib6_nh	*nh;
776};
777
778static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
779{
780	struct fib6_nh_frl_arg *arg = _arg;
781
782	arg->nh = nh;
783	return find_match(nh, arg->flags, arg->oif, arg->strict,
784			  arg->mpri, arg->do_rr);
785}
786
787static void __find_rr_leaf(struct fib6_info *f6i_start,
788			   struct fib6_info *nomatch, u32 metric,
789			   struct fib6_result *res, struct fib6_info **cont,
790			   int oif, int strict, bool *do_rr, int *mpri)
791{
792	struct fib6_info *f6i;
793
794	for (f6i = f6i_start;
795	     f6i && f6i != nomatch;
796	     f6i = rcu_dereference(f6i->fib6_next)) {
797		bool matched = false;
798		struct fib6_nh *nh;
799
800		if (cont && f6i->fib6_metric != metric) {
801			*cont = f6i;
802			return;
803		}
804
805		if (fib6_check_expired(f6i))
806			continue;
807
808		if (unlikely(f6i->nh)) {
809			struct fib6_nh_frl_arg arg = {
810				.flags  = f6i->fib6_flags,
811				.oif    = oif,
812				.strict = strict,
813				.mpri   = mpri,
814				.do_rr  = do_rr
815			};
816
817			if (nexthop_is_blackhole(f6i->nh)) {
818				res->fib6_flags = RTF_REJECT;
819				res->fib6_type = RTN_BLACKHOLE;
820				res->f6i = f6i;
821				res->nh = nexthop_fib6_nh(f6i->nh);
822				return;
823			}
824			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
825						     &arg)) {
826				matched = true;
827				nh = arg.nh;
828			}
829		} else {
830			nh = f6i->fib6_nh;
831			if (find_match(nh, f6i->fib6_flags, oif, strict,
832				       mpri, do_rr))
833				matched = true;
834		}
835		if (matched) {
836			res->f6i = f6i;
837			res->nh = nh;
838			res->fib6_flags = f6i->fib6_flags;
839			res->fib6_type = f6i->fib6_type;
840		}
841	}
842}
843
844static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
845			 struct fib6_info *rr_head, int oif, int strict,
846			 bool *do_rr, struct fib6_result *res)
847{
848	u32 metric = rr_head->fib6_metric;
849	struct fib6_info *cont = NULL;
850	int mpri = -1;
851
852	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
853		       oif, strict, do_rr, &mpri);
854
855	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
856		       oif, strict, do_rr, &mpri);
857
858	if (res->f6i || !cont)
859		return;
860
861	__find_rr_leaf(cont, NULL, metric, res, NULL,
862		       oif, strict, do_rr, &mpri);
863}
864
865static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
866		       struct fib6_result *res, int strict)
867{
868	struct fib6_info *leaf = rcu_dereference(fn->leaf);
869	struct fib6_info *rt0;
870	bool do_rr = false;
871	int key_plen;
872
873	/* make sure this function or its helpers sets f6i */
874	res->f6i = NULL;
875
876	if (!leaf || leaf == net->ipv6.fib6_null_entry)
877		goto out;
878
879	rt0 = rcu_dereference(fn->rr_ptr);
880	if (!rt0)
881		rt0 = leaf;
882
883	/* Double check to make sure fn is not an intermediate node
884	 * and fn->leaf does not points to its child's leaf
885	 * (This might happen if all routes under fn are deleted from
886	 * the tree and fib6_repair_tree() is called on the node.)
887	 */
888	key_plen = rt0->fib6_dst.plen;
889#ifdef CONFIG_IPV6_SUBTREES
890	if (rt0->fib6_src.plen)
891		key_plen = rt0->fib6_src.plen;
892#endif
893	if (fn->fn_bit != key_plen)
894		goto out;
895
896	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
897	if (do_rr) {
898		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
899
900		/* no entries matched; do round-robin */
901		if (!next || next->fib6_metric != rt0->fib6_metric)
902			next = leaf;
903
904		if (next != rt0) {
905			spin_lock_bh(&leaf->fib6_table->tb6_lock);
906			/* make sure next is not being deleted from the tree */
907			if (next->fib6_node)
908				rcu_assign_pointer(fn->rr_ptr, next);
909			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
910		}
911	}
912
913out:
914	if (!res->f6i) {
915		res->f6i = net->ipv6.fib6_null_entry;
916		res->nh = res->f6i->fib6_nh;
917		res->fib6_flags = res->f6i->fib6_flags;
918		res->fib6_type = res->f6i->fib6_type;
919	}
920}
921
922static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
923{
924	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
925	       res->nh->fib_nh_gw_family;
926}
927
928#ifdef CONFIG_IPV6_ROUTE_INFO
929int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
930		  const struct in6_addr *gwaddr)
931{
932	struct net *net = dev_net(dev);
933	struct route_info *rinfo = (struct route_info *) opt;
934	struct in6_addr prefix_buf, *prefix;
935	struct fib6_table *table;
936	unsigned int pref;
937	unsigned long lifetime;
938	struct fib6_info *rt;
939
940	if (len < sizeof(struct route_info)) {
941		return -EINVAL;
942	}
943
944	/* Sanity check for prefix_len and length */
945	if (rinfo->length > 3) {
946		return -EINVAL;
947	} else if (rinfo->prefix_len > 128) {
948		return -EINVAL;
949	} else if (rinfo->prefix_len > 64) {
950		if (rinfo->length < 2) {
951			return -EINVAL;
952		}
953	} else if (rinfo->prefix_len > 0) {
954		if (rinfo->length < 1) {
955			return -EINVAL;
956		}
957	}
958
959	pref = rinfo->route_pref;
960	if (pref == ICMPV6_ROUTER_PREF_INVALID)
961		return -EINVAL;
962
963	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
964
965	if (rinfo->length == 3)
966		prefix = (struct in6_addr *)rinfo->prefix;
967	else {
968		/* this function is safe */
969		ipv6_addr_prefix(&prefix_buf,
970				 (struct in6_addr *)rinfo->prefix,
971				 rinfo->prefix_len);
972		prefix = &prefix_buf;
973	}
974
975	if (rinfo->prefix_len == 0)
976		rt = rt6_get_dflt_router(net, gwaddr, dev);
977	else
978		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
979					gwaddr, dev);
980
981	if (rt && !lifetime) {
982		ip6_del_rt(net, rt, false);
983		rt = NULL;
984	}
985
986	if (!rt && lifetime)
987		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
988					dev, pref);
989	else if (rt)
990		rt->fib6_flags = RTF_ROUTEINFO |
991				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
992
993	if (rt) {
994		table = rt->fib6_table;
995		spin_lock_bh(&table->tb6_lock);
996
997		if (!addrconf_finite_timeout(lifetime)) {
998			fib6_clean_expires(rt);
999			fib6_remove_gc_list(rt);
1000		} else {
1001			fib6_set_expires(rt, jiffies + HZ * lifetime);
1002			fib6_add_gc_list(rt);
1003		}
1004
1005		spin_unlock_bh(&table->tb6_lock);
1006
1007		fib6_info_release(rt);
1008	}
1009	return 0;
1010}
1011#endif
1012
1013/*
1014 *	Misc support functions
1015 */
1016
1017/* called with rcu_lock held */
1018static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
1019{
1020	struct net_device *dev = res->nh->fib_nh_dev;
1021
1022	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1023		/* for copies of local routes, dst->dev needs to be the
1024		 * device if it is a master device, the master device if
1025		 * device is enslaved, and the loopback as the default
1026		 */
1027		if (netif_is_l3_slave(dev) &&
1028		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
1029			dev = l3mdev_master_dev_rcu(dev);
1030		else if (!netif_is_l3_master(dev))
1031			dev = dev_net(dev)->loopback_dev;
1032		/* last case is netif_is_l3_master(dev) is true in which
1033		 * case we want dev returned to be dev
1034		 */
1035	}
1036
1037	return dev;
1038}
1039
1040static const int fib6_prop[RTN_MAX + 1] = {
1041	[RTN_UNSPEC]	= 0,
1042	[RTN_UNICAST]	= 0,
1043	[RTN_LOCAL]	= 0,
1044	[RTN_BROADCAST]	= 0,
1045	[RTN_ANYCAST]	= 0,
1046	[RTN_MULTICAST]	= 0,
1047	[RTN_BLACKHOLE]	= -EINVAL,
1048	[RTN_UNREACHABLE] = -EHOSTUNREACH,
1049	[RTN_PROHIBIT]	= -EACCES,
1050	[RTN_THROW]	= -EAGAIN,
1051	[RTN_NAT]	= -EINVAL,
1052	[RTN_XRESOLVE]	= -EINVAL,
1053};
1054
1055static int ip6_rt_type_to_error(u8 fib6_type)
1056{
1057	return fib6_prop[fib6_type];
1058}
1059
1060static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
1061{
1062	unsigned short flags = 0;
1063
1064	if (rt->dst_nocount)
1065		flags |= DST_NOCOUNT;
1066	if (rt->dst_nopolicy)
1067		flags |= DST_NOPOLICY;
1068
1069	return flags;
1070}
1071
1072static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
1073{
1074	rt->dst.error = ip6_rt_type_to_error(fib6_type);
1075
1076	switch (fib6_type) {
1077	case RTN_BLACKHOLE:
1078		rt->dst.output = dst_discard_out;
1079		rt->dst.input = dst_discard;
1080		break;
1081	case RTN_PROHIBIT:
1082		rt->dst.output = ip6_pkt_prohibit_out;
1083		rt->dst.input = ip6_pkt_prohibit;
1084		break;
1085	case RTN_THROW:
1086	case RTN_UNREACHABLE:
1087	default:
1088		rt->dst.output = ip6_pkt_discard_out;
1089		rt->dst.input = ip6_pkt_discard;
1090		break;
1091	}
1092}
1093
1094static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
1095{
1096	struct fib6_info *f6i = res->f6i;
1097
1098	if (res->fib6_flags & RTF_REJECT) {
1099		ip6_rt_init_dst_reject(rt, res->fib6_type);
1100		return;
1101	}
1102
1103	rt->dst.error = 0;
1104	rt->dst.output = ip6_output;
1105
1106	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
1107		rt->dst.input = ip6_input;
1108	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
1109		rt->dst.input = ip6_mc_input;
1110	} else {
1111		rt->dst.input = ip6_forward;
1112	}
1113
1114	if (res->nh->fib_nh_lws) {
1115		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
1116		lwtunnel_set_redirect(&rt->dst);
1117	}
1118
1119	rt->dst.lastuse = jiffies;
1120}
1121
1122/* Caller must already hold reference to @from */
1123static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1124{
1125	rt->rt6i_flags &= ~RTF_EXPIRES;
1126	rcu_assign_pointer(rt->from, from);
1127	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1128}
1129
1130/* Caller must already hold reference to f6i in result */
1131static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1132{
1133	const struct fib6_nh *nh = res->nh;
1134	const struct net_device *dev = nh->fib_nh_dev;
1135	struct fib6_info *f6i = res->f6i;
1136
1137	ip6_rt_init_dst(rt, res);
1138
1139	rt->rt6i_dst = f6i->fib6_dst;
1140	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1141	rt->rt6i_flags = res->fib6_flags;
1142	if (nh->fib_nh_gw_family) {
1143		rt->rt6i_gateway = nh->fib_nh_gw6;
1144		rt->rt6i_flags |= RTF_GATEWAY;
1145	}
1146	rt6_set_from(rt, f6i);
1147#ifdef CONFIG_IPV6_SUBTREES
1148	rt->rt6i_src = f6i->fib6_src;
1149#endif
1150}
1151
1152static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1153					struct in6_addr *saddr)
1154{
1155	struct fib6_node *pn, *sn;
1156	while (1) {
1157		if (fn->fn_flags & RTN_TL_ROOT)
1158			return NULL;
1159		pn = rcu_dereference(fn->parent);
1160		sn = FIB6_SUBTREE(pn);
1161		if (sn && sn != fn)
1162			fn = fib6_node_lookup(sn, NULL, saddr);
1163		else
1164			fn = pn;
1165		if (fn->fn_flags & RTN_RTINFO)
1166			return fn;
1167	}
1168}
1169
1170static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1171{
1172	struct rt6_info *rt = *prt;
1173
1174	if (dst_hold_safe(&rt->dst))
1175		return true;
1176	if (net) {
1177		rt = net->ipv6.ip6_null_entry;
1178		dst_hold(&rt->dst);
1179	} else {
1180		rt = NULL;
1181	}
1182	*prt = rt;
1183	return false;
1184}
1185
1186/* called with rcu_lock held */
1187static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1188{
1189	struct net_device *dev = res->nh->fib_nh_dev;
1190	struct fib6_info *f6i = res->f6i;
1191	unsigned short flags;
1192	struct rt6_info *nrt;
1193
1194	if (!fib6_info_hold_safe(f6i))
1195		goto fallback;
1196
1197	flags = fib6_info_dst_flags(f6i);
1198	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1199	if (!nrt) {
1200		fib6_info_release(f6i);
1201		goto fallback;
1202	}
1203
1204	ip6_rt_copy_init(nrt, res);
1205	return nrt;
1206
1207fallback:
1208	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1209	dst_hold(&nrt->dst);
1210	return nrt;
1211}
1212
1213INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
1214					     struct fib6_table *table,
1215					     struct flowi6 *fl6,
1216					     const struct sk_buff *skb,
1217					     int flags)
1218{
1219	struct fib6_result res = {};
1220	struct fib6_node *fn;
1221	struct rt6_info *rt;
1222
1223	rcu_read_lock();
1224	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1225restart:
1226	res.f6i = rcu_dereference(fn->leaf);
1227	if (!res.f6i)
1228		res.f6i = net->ipv6.fib6_null_entry;
1229	else
1230		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1231				 flags);
1232
1233	if (res.f6i == net->ipv6.fib6_null_entry) {
1234		fn = fib6_backtrack(fn, &fl6->saddr);
1235		if (fn)
1236			goto restart;
1237
1238		rt = net->ipv6.ip6_null_entry;
1239		dst_hold(&rt->dst);
1240		goto out;
1241	} else if (res.fib6_flags & RTF_REJECT) {
1242		goto do_create;
1243	}
1244
1245	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1246			 fl6->flowi6_oif != 0, skb, flags);
1247
1248	/* Search through exception table */
1249	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1250	if (rt) {
1251		if (ip6_hold_safe(net, &rt))
1252			dst_use_noref(&rt->dst, jiffies);
1253	} else {
1254do_create:
1255		rt = ip6_create_rt_rcu(&res);
1256	}
1257
1258out:
1259	trace_fib6_table_lookup(net, &res, table, fl6);
1260
1261	rcu_read_unlock();
1262
1263	return rt;
1264}
1265
1266struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1267				   const struct sk_buff *skb, int flags)
1268{
1269	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1270}
1271EXPORT_SYMBOL_GPL(ip6_route_lookup);
1272
1273struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1274			    const struct in6_addr *saddr, int oif,
1275			    const struct sk_buff *skb, int strict)
1276{
1277	struct flowi6 fl6 = {
1278		.flowi6_oif = oif,
1279		.daddr = *daddr,
1280	};
1281	struct dst_entry *dst;
1282	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1283
1284	if (saddr) {
1285		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1286		flags |= RT6_LOOKUP_F_HAS_SADDR;
1287	}
1288
1289	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1290	if (dst->error == 0)
1291		return (struct rt6_info *) dst;
1292
1293	dst_release(dst);
1294
1295	return NULL;
1296}
1297EXPORT_SYMBOL(rt6_lookup);
1298
1299/* ip6_ins_rt is called with FREE table->tb6_lock.
1300 * It takes new route entry, the addition fails by any reason the
1301 * route is released.
1302 * Caller must hold dst before calling it.
1303 */
1304
1305static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1306			struct netlink_ext_ack *extack)
1307{
1308	int err;
1309	struct fib6_table *table;
1310
1311	table = rt->fib6_table;
1312	spin_lock_bh(&table->tb6_lock);
1313	err = fib6_add(&table->tb6_root, rt, info, extack);
1314	spin_unlock_bh(&table->tb6_lock);
1315
1316	return err;
1317}
1318
1319int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1320{
1321	struct nl_info info = {	.nl_net = net, };
1322
1323	return __ip6_ins_rt(rt, &info, NULL);
1324}
1325
1326static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1327					   const struct in6_addr *daddr,
1328					   const struct in6_addr *saddr)
1329{
1330	struct fib6_info *f6i = res->f6i;
1331	struct net_device *dev;
1332	struct rt6_info *rt;
1333
1334	/*
1335	 *	Clone the route.
1336	 */
1337
1338	if (!fib6_info_hold_safe(f6i))
1339		return NULL;
1340
1341	dev = ip6_rt_get_dev_rcu(res);
1342	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1343	if (!rt) {
1344		fib6_info_release(f6i);
1345		return NULL;
1346	}
1347
1348	ip6_rt_copy_init(rt, res);
1349	rt->rt6i_flags |= RTF_CACHE;
1350	rt->rt6i_dst.addr = *daddr;
1351	rt->rt6i_dst.plen = 128;
1352
1353	if (!rt6_is_gw_or_nonexthop(res)) {
1354		if (f6i->fib6_dst.plen != 128 &&
1355		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1356			rt->rt6i_flags |= RTF_ANYCAST;
1357#ifdef CONFIG_IPV6_SUBTREES
1358		if (rt->rt6i_src.plen && saddr) {
1359			rt->rt6i_src.addr = *saddr;
1360			rt->rt6i_src.plen = 128;
1361		}
1362#endif
1363	}
1364
1365	return rt;
1366}
1367
1368static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1369{
1370	struct fib6_info *f6i = res->f6i;
1371	unsigned short flags = fib6_info_dst_flags(f6i);
1372	struct net_device *dev;
1373	struct rt6_info *pcpu_rt;
1374
1375	if (!fib6_info_hold_safe(f6i))
1376		return NULL;
1377
1378	rcu_read_lock();
1379	dev = ip6_rt_get_dev_rcu(res);
1380	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
1381	rcu_read_unlock();
1382	if (!pcpu_rt) {
1383		fib6_info_release(f6i);
1384		return NULL;
1385	}
1386	ip6_rt_copy_init(pcpu_rt, res);
1387	pcpu_rt->rt6i_flags |= RTF_PCPU;
1388
1389	if (f6i->nh)
1390		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
1391
1392	return pcpu_rt;
1393}
1394
1395static bool rt6_is_valid(const struct rt6_info *rt6)
1396{
1397	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
1398}
1399
1400/* It should be called with rcu_read_lock() acquired */
1401static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1402{
1403	struct rt6_info *pcpu_rt;
1404
1405	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
1406
1407	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
1408		struct rt6_info *prev, **p;
1409
1410		p = this_cpu_ptr(res->nh->rt6i_pcpu);
1411		prev = xchg(p, NULL);
1412		if (prev) {
1413			dst_dev_put(&prev->dst);
1414			dst_release(&prev->dst);
1415		}
1416
1417		pcpu_rt = NULL;
1418	}
1419
1420	return pcpu_rt;
1421}
1422
1423static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1424					    const struct fib6_result *res)
1425{
1426	struct rt6_info *pcpu_rt, *prev, **p;
1427
1428	pcpu_rt = ip6_rt_pcpu_alloc(res);
1429	if (!pcpu_rt)
1430		return NULL;
1431
1432	p = this_cpu_ptr(res->nh->rt6i_pcpu);
1433	prev = cmpxchg(p, NULL, pcpu_rt);
1434	BUG_ON(prev);
1435
1436	if (res->f6i->fib6_destroying) {
1437		struct fib6_info *from;
1438
1439		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1440		fib6_info_release(from);
1441	}
1442
1443	return pcpu_rt;
1444}
1445
1446/* exception hash table implementation
1447 */
1448static DEFINE_SPINLOCK(rt6_exception_lock);
1449
1450/* Remove rt6_ex from hash table and free the memory
1451 * Caller must hold rt6_exception_lock
1452 */
1453static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1454				 struct rt6_exception *rt6_ex)
1455{
1456	struct fib6_info *from;
1457	struct net *net;
1458
1459	if (!bucket || !rt6_ex)
1460		return;
1461
1462	net = dev_net(rt6_ex->rt6i->dst.dev);
1463	net->ipv6.rt6_stats->fib_rt_cache--;
1464
1465	/* purge completely the exception to allow releasing the held resources:
1466	 * some [sk] cache may keep the dst around for unlimited time
1467	 */
1468	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1469	fib6_info_release(from);
1470	dst_dev_put(&rt6_ex->rt6i->dst);
1471
1472	hlist_del_rcu(&rt6_ex->hlist);
1473	dst_release(&rt6_ex->rt6i->dst);
1474	kfree_rcu(rt6_ex, rcu);
1475	WARN_ON_ONCE(!bucket->depth);
1476	bucket->depth--;
1477}
1478
1479/* Remove oldest rt6_ex in bucket and free the memory
1480 * Caller must hold rt6_exception_lock
1481 */
1482static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1483{
1484	struct rt6_exception *rt6_ex, *oldest = NULL;
1485
1486	if (!bucket)
1487		return;
1488
1489	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1490		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1491			oldest = rt6_ex;
1492	}
1493	rt6_remove_exception(bucket, oldest);
1494}
1495
1496static u32 rt6_exception_hash(const struct in6_addr *dst,
1497			      const struct in6_addr *src)
1498{
1499	static siphash_aligned_key_t rt6_exception_key;
1500	struct {
1501		struct in6_addr dst;
1502		struct in6_addr src;
1503	} __aligned(SIPHASH_ALIGNMENT) combined = {
1504		.dst = *dst,
1505	};
1506	u64 val;
1507
1508	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
1509
1510#ifdef CONFIG_IPV6_SUBTREES
1511	if (src)
1512		combined.src = *src;
1513#endif
1514	val = siphash(&combined, sizeof(combined), &rt6_exception_key);
1515
1516	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1517}
1518
1519/* Helper function to find the cached rt in the hash table
1520 * and update bucket pointer to point to the bucket for this
1521 * (daddr, saddr) pair
1522 * Caller must hold rt6_exception_lock
1523 */
1524static struct rt6_exception *
1525__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1526			      const struct in6_addr *daddr,
1527			      const struct in6_addr *saddr)
1528{
1529	struct rt6_exception *rt6_ex;
1530	u32 hval;
1531
1532	if (!(*bucket) || !daddr)
1533		return NULL;
1534
1535	hval = rt6_exception_hash(daddr, saddr);
1536	*bucket += hval;
1537
1538	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1539		struct rt6_info *rt6 = rt6_ex->rt6i;
1540		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1541
1542#ifdef CONFIG_IPV6_SUBTREES
1543		if (matched && saddr)
1544			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1545#endif
1546		if (matched)
1547			return rt6_ex;
1548	}
1549	return NULL;
1550}
1551
1552/* Helper function to find the cached rt in the hash table
1553 * and update bucket pointer to point to the bucket for this
1554 * (daddr, saddr) pair
1555 * Caller must hold rcu_read_lock()
1556 */
1557static struct rt6_exception *
1558__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1559			 const struct in6_addr *daddr,
1560			 const struct in6_addr *saddr)
1561{
1562	struct rt6_exception *rt6_ex;
1563	u32 hval;
1564
1565	WARN_ON_ONCE(!rcu_read_lock_held());
1566
1567	if (!(*bucket) || !daddr)
1568		return NULL;
1569
1570	hval = rt6_exception_hash(daddr, saddr);
1571	*bucket += hval;
1572
1573	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1574		struct rt6_info *rt6 = rt6_ex->rt6i;
1575		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1576
1577#ifdef CONFIG_IPV6_SUBTREES
1578		if (matched && saddr)
1579			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1580#endif
1581		if (matched)
1582			return rt6_ex;
1583	}
1584	return NULL;
1585}
1586
1587static unsigned int fib6_mtu(const struct fib6_result *res)
1588{
1589	const struct fib6_nh *nh = res->nh;
1590	unsigned int mtu;
1591
1592	if (res->f6i->fib6_pmtu) {
1593		mtu = res->f6i->fib6_pmtu;
1594	} else {
1595		struct net_device *dev = nh->fib_nh_dev;
1596		struct inet6_dev *idev;
1597
1598		rcu_read_lock();
1599		idev = __in6_dev_get(dev);
1600		mtu = READ_ONCE(idev->cnf.mtu6);
1601		rcu_read_unlock();
1602	}
1603
1604	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1605
1606	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1607}
1608
1609#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL
1610
1611/* used when the flushed bit is not relevant, only access to the bucket
1612 * (ie., all bucket users except rt6_insert_exception);
1613 *
1614 * called under rcu lock; sometimes called with rt6_exception_lock held
1615 */
1616static
1617struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
1618						       spinlock_t *lock)
1619{
1620	struct rt6_exception_bucket *bucket;
1621
1622	if (lock)
1623		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1624						   lockdep_is_held(lock));
1625	else
1626		bucket = rcu_dereference(nh->rt6i_exception_bucket);
1627
1628	/* remove bucket flushed bit if set */
1629	if (bucket) {
1630		unsigned long p = (unsigned long)bucket;
1631
1632		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
1633		bucket = (struct rt6_exception_bucket *)p;
1634	}
1635
1636	return bucket;
1637}
1638
1639static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
1640{
1641	unsigned long p = (unsigned long)bucket;
1642
1643	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
1644}
1645
1646/* called with rt6_exception_lock held */
1647static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
1648					      spinlock_t *lock)
1649{
1650	struct rt6_exception_bucket *bucket;
1651	unsigned long p;
1652
1653	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1654					   lockdep_is_held(lock));
1655
1656	p = (unsigned long)bucket;
1657	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
1658	bucket = (struct rt6_exception_bucket *)p;
1659	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1660}
1661
1662static int rt6_insert_exception(struct rt6_info *nrt,
1663				const struct fib6_result *res)
1664{
1665	struct net *net = dev_net(nrt->dst.dev);
1666	struct rt6_exception_bucket *bucket;
1667	struct fib6_info *f6i = res->f6i;
1668	struct in6_addr *src_key = NULL;
1669	struct rt6_exception *rt6_ex;
1670	struct fib6_nh *nh = res->nh;
1671	int max_depth;
1672	int err = 0;
1673
1674	spin_lock_bh(&rt6_exception_lock);
1675
1676	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1677					  lockdep_is_held(&rt6_exception_lock));
1678	if (!bucket) {
1679		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1680				 GFP_ATOMIC);
1681		if (!bucket) {
1682			err = -ENOMEM;
1683			goto out;
1684		}
1685		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1686	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
1687		err = -EINVAL;
1688		goto out;
1689	}
1690
1691#ifdef CONFIG_IPV6_SUBTREES
1692	/* fib6_src.plen != 0 indicates f6i is in subtree
1693	 * and exception table is indexed by a hash of
1694	 * both fib6_dst and fib6_src.
1695	 * Otherwise, the exception table is indexed by
1696	 * a hash of only fib6_dst.
1697	 */
1698	if (f6i->fib6_src.plen)
1699		src_key = &nrt->rt6i_src.addr;
1700#endif
1701	/* rt6_mtu_change() might lower mtu on f6i.
1702	 * Only insert this exception route if its mtu
1703	 * is less than f6i's mtu value.
1704	 */
1705	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1706		err = -EINVAL;
1707		goto out;
1708	}
1709
1710	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1711					       src_key);
1712	if (rt6_ex)
1713		rt6_remove_exception(bucket, rt6_ex);
1714
1715	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1716	if (!rt6_ex) {
1717		err = -ENOMEM;
1718		goto out;
1719	}
1720	rt6_ex->rt6i = nrt;
1721	rt6_ex->stamp = jiffies;
1722	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1723	bucket->depth++;
1724	net->ipv6.rt6_stats->fib_rt_cache++;
1725
1726	/* Randomize max depth to avoid some side channels attacks. */
1727	max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
1728	while (bucket->depth > max_depth)
1729		rt6_exception_remove_oldest(bucket);
1730
1731out:
1732	spin_unlock_bh(&rt6_exception_lock);
1733
1734	/* Update fn->fn_sernum to invalidate all cached dst */
1735	if (!err) {
1736		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1737		fib6_update_sernum(net, f6i);
1738		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1739		fib6_force_start_gc(net);
1740	}
1741
1742	return err;
1743}
1744
1745static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
1746{
1747	struct rt6_exception_bucket *bucket;
1748	struct rt6_exception *rt6_ex;
1749	struct hlist_node *tmp;
1750	int i;
1751
1752	spin_lock_bh(&rt6_exception_lock);
1753
1754	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1755	if (!bucket)
1756		goto out;
1757
1758	/* Prevent rt6_insert_exception() to recreate the bucket list */
1759	if (!from)
1760		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
1761
1762	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1763		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
1764			if (!from ||
1765			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
1766				rt6_remove_exception(bucket, rt6_ex);
1767		}
1768		WARN_ON_ONCE(!from && bucket->depth);
1769		bucket++;
1770	}
1771out:
1772	spin_unlock_bh(&rt6_exception_lock);
1773}
1774
1775static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
1776{
1777	struct fib6_info *f6i = arg;
1778
1779	fib6_nh_flush_exceptions(nh, f6i);
1780
1781	return 0;
1782}
1783
1784void rt6_flush_exceptions(struct fib6_info *f6i)
1785{
1786	if (f6i->nh)
1787		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
1788					 f6i);
1789	else
1790		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
1791}
1792
1793/* Find cached rt in the hash table inside passed in rt
1794 * Caller has to hold rcu_read_lock()
1795 */
1796static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1797					   const struct in6_addr *daddr,
1798					   const struct in6_addr *saddr)
1799{
1800	const struct in6_addr *src_key = NULL;
1801	struct rt6_exception_bucket *bucket;
1802	struct rt6_exception *rt6_ex;
1803	struct rt6_info *ret = NULL;
1804
1805#ifdef CONFIG_IPV6_SUBTREES
1806	/* fib6i_src.plen != 0 indicates f6i is in subtree
1807	 * and exception table is indexed by a hash of
1808	 * both fib6_dst and fib6_src.
1809	 * However, the src addr used to create the hash
1810	 * might not be exactly the passed in saddr which
1811	 * is a /128 addr from the flow.
1812	 * So we need to use f6i->fib6_src to redo lookup
1813	 * if the passed in saddr does not find anything.
1814	 * (See the logic in ip6_rt_cache_alloc() on how
1815	 * rt->rt6i_src is updated.)
1816	 */
1817	if (res->f6i->fib6_src.plen)
1818		src_key = saddr;
1819find_ex:
1820#endif
1821	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
1822	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1823
1824	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1825		ret = rt6_ex->rt6i;
1826
1827#ifdef CONFIG_IPV6_SUBTREES
1828	/* Use fib6_src as src_key and redo lookup */
1829	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1830		src_key = &res->f6i->fib6_src.addr;
1831		goto find_ex;
1832	}
1833#endif
1834
1835	return ret;
1836}
1837
1838/* Remove the passed in cached rt from the hash table that contains it */
1839static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
1840				    const struct rt6_info *rt)
1841{
1842	const struct in6_addr *src_key = NULL;
1843	struct rt6_exception_bucket *bucket;
1844	struct rt6_exception *rt6_ex;
1845	int err;
1846
1847	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1848		return -ENOENT;
1849
1850	spin_lock_bh(&rt6_exception_lock);
1851	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1852
1853#ifdef CONFIG_IPV6_SUBTREES
1854	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1855	 * and exception table is indexed by a hash of
1856	 * both rt6i_dst and rt6i_src.
1857	 * Otherwise, the exception table is indexed by
1858	 * a hash of only rt6i_dst.
1859	 */
1860	if (plen)
1861		src_key = &rt->rt6i_src.addr;
1862#endif
1863	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1864					       &rt->rt6i_dst.addr,
1865					       src_key);
1866	if (rt6_ex) {
1867		rt6_remove_exception(bucket, rt6_ex);
1868		err = 0;
1869	} else {
1870		err = -ENOENT;
1871	}
1872
1873	spin_unlock_bh(&rt6_exception_lock);
1874	return err;
1875}
1876
1877struct fib6_nh_excptn_arg {
1878	struct rt6_info	*rt;
1879	int		plen;
1880};
1881
1882static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
1883{
1884	struct fib6_nh_excptn_arg *arg = _arg;
1885	int err;
1886
1887	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
1888	if (err == 0)
1889		return 1;
1890
1891	return 0;
1892}
1893
1894static int rt6_remove_exception_rt(struct rt6_info *rt)
1895{
1896	struct fib6_info *from;
1897
1898	from = rcu_dereference(rt->from);
1899	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1900		return -EINVAL;
1901
1902	if (from->nh) {
1903		struct fib6_nh_excptn_arg arg = {
1904			.rt = rt,
1905			.plen = from->fib6_src.plen
1906		};
1907		int rc;
1908
1909		/* rc = 1 means an entry was found */
1910		rc = nexthop_for_each_fib6_nh(from->nh,
1911					      rt6_nh_remove_exception_rt,
1912					      &arg);
1913		return rc ? 0 : -ENOENT;
1914	}
1915
1916	return fib6_nh_remove_exception(from->fib6_nh,
1917					from->fib6_src.plen, rt);
1918}
1919
1920/* Find rt6_ex which contains the passed in rt cache and
1921 * refresh its stamp
1922 */
1923static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
1924				     const struct rt6_info *rt)
1925{
1926	const struct in6_addr *src_key = NULL;
1927	struct rt6_exception_bucket *bucket;
1928	struct rt6_exception *rt6_ex;
1929
1930	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
1931#ifdef CONFIG_IPV6_SUBTREES
1932	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1933	 * and exception table is indexed by a hash of
1934	 * both rt6i_dst and rt6i_src.
1935	 * Otherwise, the exception table is indexed by
1936	 * a hash of only rt6i_dst.
1937	 */
1938	if (plen)
1939		src_key = &rt->rt6i_src.addr;
1940#endif
1941	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
1942	if (rt6_ex)
1943		rt6_ex->stamp = jiffies;
1944}
1945
1946struct fib6_nh_match_arg {
1947	const struct net_device *dev;
1948	const struct in6_addr	*gw;
1949	struct fib6_nh		*match;
1950};
1951
1952/* determine if fib6_nh has given device and gateway */
1953static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
1954{
1955	struct fib6_nh_match_arg *arg = _arg;
1956
1957	if (arg->dev != nh->fib_nh_dev ||
1958	    (arg->gw && !nh->fib_nh_gw_family) ||
1959	    (!arg->gw && nh->fib_nh_gw_family) ||
1960	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
1961		return 0;
1962
1963	arg->match = nh;
1964
1965	/* found a match, break the loop */
1966	return 1;
1967}
1968
1969static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1970{
1971	struct fib6_info *from;
1972	struct fib6_nh *fib6_nh;
1973
1974	rcu_read_lock();
1975
1976	from = rcu_dereference(rt->from);
1977	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1978		goto unlock;
1979
1980	if (from->nh) {
1981		struct fib6_nh_match_arg arg = {
1982			.dev = rt->dst.dev,
1983			.gw = &rt->rt6i_gateway,
1984		};
1985
1986		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
1987
1988		if (!arg.match)
1989			goto unlock;
1990		fib6_nh = arg.match;
1991	} else {
1992		fib6_nh = from->fib6_nh;
1993	}
1994	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
1995unlock:
1996	rcu_read_unlock();
1997}
1998
1999static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
2000					 struct rt6_info *rt, int mtu)
2001{
2002	/* If the new MTU is lower than the route PMTU, this new MTU will be the
2003	 * lowest MTU in the path: always allow updating the route PMTU to
2004	 * reflect PMTU decreases.
2005	 *
2006	 * If the new MTU is higher, and the route PMTU is equal to the local
2007	 * MTU, this means the old MTU is the lowest in the path, so allow
2008	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
2009	 * handle this.
2010	 */
2011
2012	if (dst_mtu(&rt->dst) >= mtu)
2013		return true;
2014
2015	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
2016		return true;
2017
2018	return false;
2019}
2020
2021static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
2022				       const struct fib6_nh *nh, int mtu)
2023{
2024	struct rt6_exception_bucket *bucket;
2025	struct rt6_exception *rt6_ex;
2026	int i;
2027
2028	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2029	if (!bucket)
2030		return;
2031
2032	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2033		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
2034			struct rt6_info *entry = rt6_ex->rt6i;
2035
2036			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
2037			 * route), the metrics of its rt->from have already
2038			 * been updated.
2039			 */
2040			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
2041			    rt6_mtu_change_route_allowed(idev, entry, mtu))
2042				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
2043		}
2044		bucket++;
2045	}
2046}
2047
2048#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2049
2050static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
2051					    const struct in6_addr *gateway)
2052{
2053	struct rt6_exception_bucket *bucket;
2054	struct rt6_exception *rt6_ex;
2055	struct hlist_node *tmp;
2056	int i;
2057
2058	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2059		return;
2060
2061	spin_lock_bh(&rt6_exception_lock);
2062	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2063	if (bucket) {
2064		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2065			hlist_for_each_entry_safe(rt6_ex, tmp,
2066						  &bucket->chain, hlist) {
2067				struct rt6_info *entry = rt6_ex->rt6i;
2068
2069				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
2070				    RTF_CACHE_GATEWAY &&
2071				    ipv6_addr_equal(gateway,
2072						    &entry->rt6i_gateway)) {
2073					rt6_remove_exception(bucket, rt6_ex);
2074				}
2075			}
2076			bucket++;
2077		}
2078	}
2079
2080	spin_unlock_bh(&rt6_exception_lock);
2081}
2082
2083static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
2084				      struct rt6_exception *rt6_ex,
2085				      struct fib6_gc_args *gc_args,
2086				      unsigned long now)
2087{
2088	struct rt6_info *rt = rt6_ex->rt6i;
2089
2090	/* we are pruning and obsoleting aged-out and non gateway exceptions
2091	 * even if others have still references to them, so that on next
2092	 * dst_check() such references can be dropped.
2093	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
2094	 * expired, independently from their aging, as per RFC 8201 section 4
2095	 */
2096	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
2097		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
2098			pr_debug("aging clone %p\n", rt);
2099			rt6_remove_exception(bucket, rt6_ex);
2100			return;
2101		}
2102	} else if (time_after(jiffies, rt->dst.expires)) {
2103		pr_debug("purging expired route %p\n", rt);
2104		rt6_remove_exception(bucket, rt6_ex);
2105		return;
2106	}
2107
2108	if (rt->rt6i_flags & RTF_GATEWAY) {
2109		struct neighbour *neigh;
2110
2111		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
2112
2113		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
2114			pr_debug("purging route %p via non-router but gateway\n",
2115				 rt);
2116			rt6_remove_exception(bucket, rt6_ex);
2117			return;
2118		}
2119	}
2120
2121	gc_args->more++;
2122}
2123
2124static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
2125				   struct fib6_gc_args *gc_args,
2126				   unsigned long now)
2127{
2128	struct rt6_exception_bucket *bucket;
2129	struct rt6_exception *rt6_ex;
2130	struct hlist_node *tmp;
2131	int i;
2132
2133	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2134		return;
2135
2136	rcu_read_lock_bh();
2137	spin_lock(&rt6_exception_lock);
2138	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2139	if (bucket) {
2140		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2141			hlist_for_each_entry_safe(rt6_ex, tmp,
2142						  &bucket->chain, hlist) {
2143				rt6_age_examine_exception(bucket, rt6_ex,
2144							  gc_args, now);
2145			}
2146			bucket++;
2147		}
2148	}
2149	spin_unlock(&rt6_exception_lock);
2150	rcu_read_unlock_bh();
2151}
2152
2153struct fib6_nh_age_excptn_arg {
2154	struct fib6_gc_args	*gc_args;
2155	unsigned long		now;
2156};
2157
2158static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
2159{
2160	struct fib6_nh_age_excptn_arg *arg = _arg;
2161
2162	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
2163	return 0;
2164}
2165
2166void rt6_age_exceptions(struct fib6_info *f6i,
2167			struct fib6_gc_args *gc_args,
2168			unsigned long now)
2169{
2170	if (f6i->nh) {
2171		struct fib6_nh_age_excptn_arg arg = {
2172			.gc_args = gc_args,
2173			.now = now
2174		};
2175
2176		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
2177					 &arg);
2178	} else {
2179		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
2180	}
2181}
2182
2183/* must be called with rcu lock held */
2184int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
2185		      struct flowi6 *fl6, struct fib6_result *res, int strict)
2186{
2187	struct fib6_node *fn, *saved_fn;
2188
2189	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2190	saved_fn = fn;
2191
2192redo_rt6_select:
2193	rt6_select(net, fn, oif, res, strict);
2194	if (res->f6i == net->ipv6.fib6_null_entry) {
2195		fn = fib6_backtrack(fn, &fl6->saddr);
2196		if (fn)
2197			goto redo_rt6_select;
2198		else if (strict & RT6_LOOKUP_F_REACHABLE) {
2199			/* also consider unreachable route */
2200			strict &= ~RT6_LOOKUP_F_REACHABLE;
2201			fn = saved_fn;
2202			goto redo_rt6_select;
2203		}
2204	}
2205
2206	trace_fib6_table_lookup(net, res, table, fl6);
2207
2208	return 0;
2209}
2210
2211struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
2212			       int oif, struct flowi6 *fl6,
2213			       const struct sk_buff *skb, int flags)
2214{
2215	struct fib6_result res = {};
2216	struct rt6_info *rt = NULL;
2217	int strict = 0;
2218
2219	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
2220		     !rcu_read_lock_held());
2221
2222	strict |= flags & RT6_LOOKUP_F_IFACE;
2223	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
2224	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
2225		strict |= RT6_LOOKUP_F_REACHABLE;
2226
2227	rcu_read_lock();
2228
2229	fib6_table_lookup(net, table, oif, fl6, &res, strict);
2230	if (res.f6i == net->ipv6.fib6_null_entry)
2231		goto out;
2232
2233	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
2234
2235	/*Search through exception table */
2236	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
2237	if (rt) {
2238		goto out;
2239	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
2240			    !res.nh->fib_nh_gw_family)) {
2241		/* Create a RTF_CACHE clone which will not be
2242		 * owned by the fib6 tree.  It is for the special case where
2243		 * the daddr in the skb during the neighbor look-up is different
2244		 * from the fl6->daddr used to look-up route here.
2245		 */
2246		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
2247
2248		if (rt) {
2249			/* 1 refcnt is taken during ip6_rt_cache_alloc().
2250			 * As rt6_uncached_list_add() does not consume refcnt,
2251			 * this refcnt is always returned to the caller even
2252			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
2253			 */
2254			rt6_uncached_list_add(rt);
2255			rcu_read_unlock();
2256
2257			return rt;
2258		}
2259	} else {
2260		/* Get a percpu copy */
2261		local_bh_disable();
2262		rt = rt6_get_pcpu_route(&res);
2263
2264		if (!rt)
2265			rt = rt6_make_pcpu_route(net, &res);
2266
2267		local_bh_enable();
2268	}
2269out:
2270	if (!rt)
2271		rt = net->ipv6.ip6_null_entry;
2272	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
2273		ip6_hold_safe(net, &rt);
2274	rcu_read_unlock();
2275
2276	return rt;
2277}
2278EXPORT_SYMBOL_GPL(ip6_pol_route);
2279
2280INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
2281					    struct fib6_table *table,
2282					    struct flowi6 *fl6,
2283					    const struct sk_buff *skb,
2284					    int flags)
2285{
2286	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
2287}
2288
2289struct dst_entry *ip6_route_input_lookup(struct net *net,
2290					 struct net_device *dev,
2291					 struct flowi6 *fl6,
2292					 const struct sk_buff *skb,
2293					 int flags)
2294{
2295	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
2296		flags |= RT6_LOOKUP_F_IFACE;
2297
2298	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2299}
2300EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2301
2302static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2303				  struct flow_keys *keys,
2304				  struct flow_keys *flkeys)
2305{
2306	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2307	const struct ipv6hdr *key_iph = outer_iph;
2308	struct flow_keys *_flkeys = flkeys;
2309	const struct ipv6hdr *inner_iph;
2310	const struct icmp6hdr *icmph;
2311	struct ipv6hdr _inner_iph;
2312	struct icmp6hdr _icmph;
2313
2314	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2315		goto out;
2316
2317	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2318				   sizeof(_icmph), &_icmph);
2319	if (!icmph)
2320		goto out;
2321
2322	if (!icmpv6_is_err(icmph->icmp6_type))
2323		goto out;
2324
2325	inner_iph = skb_header_pointer(skb,
2326				       skb_transport_offset(skb) + sizeof(*icmph),
2327				       sizeof(_inner_iph), &_inner_iph);
2328	if (!inner_iph)
2329		goto out;
2330
2331	key_iph = inner_iph;
2332	_flkeys = NULL;
2333out:
2334	if (_flkeys) {
2335		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2336		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2337		keys->tags.flow_label = _flkeys->tags.flow_label;
2338		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2339	} else {
2340		keys->addrs.v6addrs.src = key_iph->saddr;
2341		keys->addrs.v6addrs.dst = key_iph->daddr;
2342		keys->tags.flow_label = ip6_flowlabel(key_iph);
2343		keys->basic.ip_proto = key_iph->nexthdr;
2344	}
2345}
2346
2347static u32 rt6_multipath_custom_hash_outer(const struct net *net,
2348					   const struct sk_buff *skb,
2349					   bool *p_has_inner)
2350{
2351	u32 hash_fields = ip6_multipath_hash_fields(net);
2352	struct flow_keys keys, hash_keys;
2353
2354	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2355		return 0;
2356
2357	memset(&hash_keys, 0, sizeof(hash_keys));
2358	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
2359
2360	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2361	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2362		hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2363	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2364		hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2365	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2366		hash_keys.basic.ip_proto = keys.basic.ip_proto;
2367	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
2368		hash_keys.tags.flow_label = keys.tags.flow_label;
2369	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2370		hash_keys.ports.src = keys.ports.src;
2371	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2372		hash_keys.ports.dst = keys.ports.dst;
2373
2374	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
2375	return flow_hash_from_keys(&hash_keys);
2376}
2377
2378static u32 rt6_multipath_custom_hash_inner(const struct net *net,
2379					   const struct sk_buff *skb,
2380					   bool has_inner)
2381{
2382	u32 hash_fields = ip6_multipath_hash_fields(net);
2383	struct flow_keys keys, hash_keys;
2384
2385	/* We assume the packet carries an encapsulation, but if none was
2386	 * encountered during dissection of the outer flow, then there is no
2387	 * point in calling the flow dissector again.
2388	 */
2389	if (!has_inner)
2390		return 0;
2391
2392	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
2393		return 0;
2394
2395	memset(&hash_keys, 0, sizeof(hash_keys));
2396	skb_flow_dissect_flow_keys(skb, &keys, 0);
2397
2398	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
2399		return 0;
2400
2401	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2402		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2403		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
2404			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2405		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
2406			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2407	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2408		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2409		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
2410			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2411		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
2412			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2413		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
2414			hash_keys.tags.flow_label = keys.tags.flow_label;
2415	}
2416
2417	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
2418		hash_keys.basic.ip_proto = keys.basic.ip_proto;
2419	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
2420		hash_keys.ports.src = keys.ports.src;
2421	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
2422		hash_keys.ports.dst = keys.ports.dst;
2423
2424	return flow_hash_from_keys(&hash_keys);
2425}
2426
2427static u32 rt6_multipath_custom_hash_skb(const struct net *net,
2428					 const struct sk_buff *skb)
2429{
2430	u32 mhash, mhash_inner;
2431	bool has_inner = true;
2432
2433	mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
2434	mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);
2435
2436	return jhash_2words(mhash, mhash_inner, 0);
2437}
2438
2439static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
2440					 const struct flowi6 *fl6)
2441{
2442	u32 hash_fields = ip6_multipath_hash_fields(net);
2443	struct flow_keys hash_keys;
2444
2445	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2446		return 0;
2447
2448	memset(&hash_keys, 0, sizeof(hash_keys));
2449	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2450	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2451		hash_keys.addrs.v6addrs.src = fl6->saddr;
2452	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2453		hash_keys.addrs.v6addrs.dst = fl6->daddr;
2454	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2455		hash_keys.basic.ip_proto = fl6->flowi6_proto;
2456	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
2457		hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2458	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2459		hash_keys.ports.src = fl6->fl6_sport;
2460	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2461		hash_keys.ports.dst = fl6->fl6_dport;
2462
2463	return flow_hash_from_keys(&hash_keys);
2464}
2465
2466/* if skb is set it will be used and fl6 can be NULL */
2467u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2468		       const struct sk_buff *skb, struct flow_keys *flkeys)
2469{
2470	struct flow_keys hash_keys;
2471	u32 mhash = 0;
2472
2473	switch (ip6_multipath_hash_policy(net)) {
2474	case 0:
2475		memset(&hash_keys, 0, sizeof(hash_keys));
2476		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2477		if (skb) {
2478			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2479		} else {
2480			hash_keys.addrs.v6addrs.src = fl6->saddr;
2481			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2482			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2483			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2484		}
2485		mhash = flow_hash_from_keys(&hash_keys);
2486		break;
2487	case 1:
2488		if (skb) {
2489			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2490			struct flow_keys keys;
2491
2492			/* short-circuit if we already have L4 hash present */
2493			if (skb->l4_hash)
2494				return skb_get_hash_raw(skb) >> 1;
2495
2496			memset(&hash_keys, 0, sizeof(hash_keys));
2497
2498			if (!flkeys) {
2499				skb_flow_dissect_flow_keys(skb, &keys, flag);
2500				flkeys = &keys;
2501			}
2502			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2503			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2504			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2505			hash_keys.ports.src = flkeys->ports.src;
2506			hash_keys.ports.dst = flkeys->ports.dst;
2507			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2508		} else {
2509			memset(&hash_keys, 0, sizeof(hash_keys));
2510			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2511			hash_keys.addrs.v6addrs.src = fl6->saddr;
2512			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2513			hash_keys.ports.src = fl6->fl6_sport;
2514			hash_keys.ports.dst = fl6->fl6_dport;
2515			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2516		}
2517		mhash = flow_hash_from_keys(&hash_keys);
2518		break;
2519	case 2:
2520		memset(&hash_keys, 0, sizeof(hash_keys));
2521		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2522		if (skb) {
2523			struct flow_keys keys;
2524
2525			if (!flkeys) {
2526				skb_flow_dissect_flow_keys(skb, &keys, 0);
2527				flkeys = &keys;
2528			}
2529
2530			/* Inner can be v4 or v6 */
2531			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2532				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2533				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2534				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2535			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2536				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2537				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2538				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2539				hash_keys.tags.flow_label = flkeys->tags.flow_label;
2540				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2541			} else {
2542				/* Same as case 0 */
2543				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2544				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2545			}
2546		} else {
2547			/* Same as case 0 */
2548			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2549			hash_keys.addrs.v6addrs.src = fl6->saddr;
2550			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2551			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2552			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2553		}
2554		mhash = flow_hash_from_keys(&hash_keys);
2555		break;
2556	case 3:
2557		if (skb)
2558			mhash = rt6_multipath_custom_hash_skb(net, skb);
2559		else
2560			mhash = rt6_multipath_custom_hash_fl6(net, fl6);
2561		break;
2562	}
2563
2564	return mhash >> 1;
2565}
2566
2567/* Called with rcu held */
2568void ip6_route_input(struct sk_buff *skb)
2569{
2570	const struct ipv6hdr *iph = ipv6_hdr(skb);
2571	struct net *net = dev_net(skb->dev);
2572	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
2573	struct ip_tunnel_info *tun_info;
2574	struct flowi6 fl6 = {
2575		.flowi6_iif = skb->dev->ifindex,
2576		.daddr = iph->daddr,
2577		.saddr = iph->saddr,
2578		.flowlabel = ip6_flowinfo(iph),
2579		.flowi6_mark = skb->mark,
2580		.flowi6_proto = iph->nexthdr,
2581	};
2582	struct flow_keys *flkeys = NULL, _flkeys;
2583
2584	tun_info = skb_tunnel_info(skb);
2585	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2586		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2587
2588	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2589		flkeys = &_flkeys;
2590
2591	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2592		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2593	skb_dst_drop(skb);
2594	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
2595						      &fl6, skb, flags));
2596}
2597
2598INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
2599					     struct fib6_table *table,
2600					     struct flowi6 *fl6,
2601					     const struct sk_buff *skb,
2602					     int flags)
2603{
2604	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2605}
2606
2607static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
2608						      const struct sock *sk,
2609						      struct flowi6 *fl6,
2610						      int flags)
2611{
2612	bool any_src;
2613
2614	if (ipv6_addr_type(&fl6->daddr) &
2615	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2616		struct dst_entry *dst;
2617
2618		/* This function does not take refcnt on the dst */
2619		dst = l3mdev_link_scope_lookup(net, fl6);
2620		if (dst)
2621			return dst;
2622	}
2623
2624	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2625
2626	flags |= RT6_LOOKUP_F_DST_NOREF;
2627	any_src = ipv6_addr_any(&fl6->saddr);
2628	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2629	    (fl6->flowi6_oif && any_src))
2630		flags |= RT6_LOOKUP_F_IFACE;
2631
2632	if (!any_src)
2633		flags |= RT6_LOOKUP_F_HAS_SADDR;
2634	else if (sk)
2635		flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
2636
2637	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2638}
2639
2640struct dst_entry *ip6_route_output_flags(struct net *net,
2641					 const struct sock *sk,
2642					 struct flowi6 *fl6,
2643					 int flags)
2644{
2645	struct dst_entry *dst;
2646	struct rt6_info *rt6;
2647
2648	rcu_read_lock();
2649	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
2650	rt6 = (struct rt6_info *)dst;
2651	/* For dst cached in uncached_list, refcnt is already taken. */
2652	if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
2653		dst = &net->ipv6.ip6_null_entry->dst;
2654		dst_hold(dst);
2655	}
2656	rcu_read_unlock();
2657
2658	return dst;
2659}
2660EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2661
2662struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2663{
2664	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2665	struct net_device *loopback_dev = net->loopback_dev;
2666	struct dst_entry *new = NULL;
2667
2668	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
2669		       DST_OBSOLETE_DEAD, 0);
2670	if (rt) {
2671		rt6_info_init(rt);
2672		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2673
2674		new = &rt->dst;
2675		new->__use = 1;
2676		new->input = dst_discard;
2677		new->output = dst_discard_out;
2678
2679		dst_copy_metrics(new, &ort->dst);
2680
2681		rt->rt6i_idev = in6_dev_get(loopback_dev);
2682		rt->rt6i_gateway = ort->rt6i_gateway;
2683		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2684
2685		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2686#ifdef CONFIG_IPV6_SUBTREES
2687		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2688#endif
2689	}
2690
2691	dst_release(dst_orig);
2692	return new ? new : ERR_PTR(-ENOMEM);
2693}
2694
2695/*
2696 *	Destination cache support functions
2697 */
2698
2699static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2700{
2701	u32 rt_cookie = 0;
2702
2703	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2704		return false;
2705
2706	if (fib6_check_expired(f6i))
2707		return false;
2708
2709	return true;
2710}
2711
2712static struct dst_entry *rt6_check(struct rt6_info *rt,
2713				   struct fib6_info *from,
2714				   u32 cookie)
2715{
2716	u32 rt_cookie = 0;
2717
2718	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2719	    rt_cookie != cookie)
2720		return NULL;
2721
2722	if (rt6_check_expired(rt))
2723		return NULL;
2724
2725	return &rt->dst;
2726}
2727
2728static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2729					    struct fib6_info *from,
2730					    u32 cookie)
2731{
2732	if (!__rt6_check_expired(rt) &&
2733	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2734	    fib6_check(from, cookie))
2735		return &rt->dst;
2736	else
2737		return NULL;
2738}
2739
2740INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
2741							u32 cookie)
2742{
2743	struct dst_entry *dst_ret;
2744	struct fib6_info *from;
2745	struct rt6_info *rt;
2746
2747	rt = container_of(dst, struct rt6_info, dst);
2748
2749	if (rt->sernum)
2750		return rt6_is_valid(rt) ? dst : NULL;
2751
2752	rcu_read_lock();
2753
2754	/* All IPV6 dsts are created with ->obsolete set to the value
2755	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2756	 * into this function always.
2757	 */
2758
2759	from = rcu_dereference(rt->from);
2760
2761	if (from && (rt->rt6i_flags & RTF_PCPU ||
2762	    unlikely(!list_empty(&rt->dst.rt_uncached))))
2763		dst_ret = rt6_dst_from_check(rt, from, cookie);
2764	else
2765		dst_ret = rt6_check(rt, from, cookie);
2766
2767	rcu_read_unlock();
2768
2769	return dst_ret;
2770}
2771EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
2772
2773static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2774{
2775	struct rt6_info *rt = (struct rt6_info *) dst;
2776
2777	if (rt) {
2778		if (rt->rt6i_flags & RTF_CACHE) {
2779			rcu_read_lock();
2780			if (rt6_check_expired(rt)) {
2781				rt6_remove_exception_rt(rt);
2782				dst = NULL;
2783			}
2784			rcu_read_unlock();
2785		} else {
2786			dst_release(dst);
2787			dst = NULL;
2788		}
2789	}
2790	return dst;
2791}
2792
2793static void ip6_link_failure(struct sk_buff *skb)
2794{
2795	struct rt6_info *rt;
2796
2797	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2798
2799	rt = (struct rt6_info *) skb_dst(skb);
2800	if (rt) {
2801		rcu_read_lock();
2802		if (rt->rt6i_flags & RTF_CACHE) {
2803			rt6_remove_exception_rt(rt);
2804		} else {
2805			struct fib6_info *from;
2806			struct fib6_node *fn;
2807
2808			from = rcu_dereference(rt->from);
2809			if (from) {
2810				fn = rcu_dereference(from->fib6_node);
2811				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2812					WRITE_ONCE(fn->fn_sernum, -1);
2813			}
2814		}
2815		rcu_read_unlock();
2816	}
2817}
2818
2819static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2820{
2821	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2822		struct fib6_info *from;
2823
2824		rcu_read_lock();
2825		from = rcu_dereference(rt0->from);
2826		if (from)
2827			rt0->dst.expires = from->expires;
2828		rcu_read_unlock();
2829	}
2830
2831	dst_set_expires(&rt0->dst, timeout);
2832	rt0->rt6i_flags |= RTF_EXPIRES;
2833}
2834
2835static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2836{
2837	struct net *net = dev_net(rt->dst.dev);
2838
2839	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2840	rt->rt6i_flags |= RTF_MODIFIED;
2841	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2842}
2843
2844static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2845{
2846	return !(rt->rt6i_flags & RTF_CACHE) &&
2847		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2848}
2849
2850static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2851				 const struct ipv6hdr *iph, u32 mtu,
2852				 bool confirm_neigh)
2853{
2854	const struct in6_addr *daddr, *saddr;
2855	struct rt6_info *rt6 = (struct rt6_info *)dst;
2856
2857	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
2858	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
2859	 * [see also comment in rt6_mtu_change_route()]
2860	 */
2861
2862	if (iph) {
2863		daddr = &iph->daddr;
2864		saddr = &iph->saddr;
2865	} else if (sk) {
2866		daddr = &sk->sk_v6_daddr;
2867		saddr = &inet6_sk(sk)->saddr;
2868	} else {
2869		daddr = NULL;
2870		saddr = NULL;
2871	}
2872
2873	if (confirm_neigh)
2874		dst_confirm_neigh(dst, daddr);
2875
2876	if (mtu < IPV6_MIN_MTU)
2877		return;
2878	if (mtu >= dst_mtu(dst))
2879		return;
2880
2881	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2882		rt6_do_update_pmtu(rt6, mtu);
2883		/* update rt6_ex->stamp for cache */
2884		if (rt6->rt6i_flags & RTF_CACHE)
2885			rt6_update_exception_stamp_rt(rt6);
2886	} else if (daddr) {
2887		struct fib6_result res = {};
2888		struct rt6_info *nrt6;
2889
2890		rcu_read_lock();
2891		res.f6i = rcu_dereference(rt6->from);
2892		if (!res.f6i)
2893			goto out_unlock;
2894
2895		res.fib6_flags = res.f6i->fib6_flags;
2896		res.fib6_type = res.f6i->fib6_type;
2897
2898		if (res.f6i->nh) {
2899			struct fib6_nh_match_arg arg = {
2900				.dev = dst->dev,
2901				.gw = &rt6->rt6i_gateway,
2902			};
2903
2904			nexthop_for_each_fib6_nh(res.f6i->nh,
2905						 fib6_nh_find_match, &arg);
2906
2907			/* fib6_info uses a nexthop that does not have fib6_nh
2908			 * using the dst->dev + gw. Should be impossible.
2909			 */
2910			if (!arg.match)
2911				goto out_unlock;
2912
2913			res.nh = arg.match;
2914		} else {
2915			res.nh = res.f6i->fib6_nh;
2916		}
2917
2918		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2919		if (nrt6) {
2920			rt6_do_update_pmtu(nrt6, mtu);
2921			if (rt6_insert_exception(nrt6, &res))
2922				dst_release_immediate(&nrt6->dst);
2923		}
2924out_unlock:
2925		rcu_read_unlock();
2926	}
2927}
2928
2929static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2930			       struct sk_buff *skb, u32 mtu,
2931			       bool confirm_neigh)
2932{
2933	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
2934			     confirm_neigh);
2935}
2936
2937void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2938		     int oif, u32 mark, kuid_t uid)
2939{
2940	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2941	struct dst_entry *dst;
2942	struct flowi6 fl6 = {
2943		.flowi6_oif = oif,
2944		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2945		.daddr = iph->daddr,
2946		.saddr = iph->saddr,
2947		.flowlabel = ip6_flowinfo(iph),
2948		.flowi6_uid = uid,
2949	};
2950
2951	dst = ip6_route_output(net, NULL, &fl6);
2952	if (!dst->error)
2953		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
2954	dst_release(dst);
2955}
2956EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2957
2958void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2959{
2960	int oif = sk->sk_bound_dev_if;
2961	struct dst_entry *dst;
2962
2963	if (!oif && skb->dev)
2964		oif = l3mdev_master_ifindex(skb->dev);
2965
2966	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
2967			sk->sk_uid);
2968
2969	dst = __sk_dst_get(sk);
2970	if (!dst || !dst->obsolete ||
2971	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2972		return;
2973
2974	bh_lock_sock(sk);
2975	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2976		ip6_datagram_dst_update(sk, false);
2977	bh_unlock_sock(sk);
2978}
2979EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2980
2981void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2982			   const struct flowi6 *fl6)
2983{
2984#ifdef CONFIG_IPV6_SUBTREES
2985	struct ipv6_pinfo *np = inet6_sk(sk);
2986#endif
2987
2988	ip6_dst_store(sk, dst,
2989		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2990		      &sk->sk_v6_daddr : NULL,
2991#ifdef CONFIG_IPV6_SUBTREES
2992		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2993		      &np->saddr :
2994#endif
2995		      NULL);
2996}
2997
2998static bool ip6_redirect_nh_match(const struct fib6_result *res,
2999				  struct flowi6 *fl6,
3000				  const struct in6_addr *gw,
3001				  struct rt6_info **ret)
3002{
3003	const struct fib6_nh *nh = res->nh;
3004
3005	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
3006	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
3007		return false;
3008
3009	/* rt_cache's gateway might be different from its 'parent'
3010	 * in the case of an ip redirect.
3011	 * So we keep searching in the exception table if the gateway
3012	 * is different.
3013	 */
3014	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
3015		struct rt6_info *rt_cache;
3016
3017		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
3018		if (rt_cache &&
3019		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
3020			*ret = rt_cache;
3021			return true;
3022		}
3023		return false;
3024	}
3025	return true;
3026}
3027
3028struct fib6_nh_rd_arg {
3029	struct fib6_result	*res;
3030	struct flowi6		*fl6;
3031	const struct in6_addr	*gw;
3032	struct rt6_info		**ret;
3033};
3034
3035static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
3036{
3037	struct fib6_nh_rd_arg *arg = _arg;
3038
3039	arg->res->nh = nh;
3040	return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
3041}
3042
3043/* Handle redirects */
3044struct ip6rd_flowi {
3045	struct flowi6 fl6;
3046	struct in6_addr gateway;
3047};
3048
3049INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
3050					     struct fib6_table *table,
3051					     struct flowi6 *fl6,
3052					     const struct sk_buff *skb,
3053					     int flags)
3054{
3055	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
3056	struct rt6_info *ret = NULL;
3057	struct fib6_result res = {};
3058	struct fib6_nh_rd_arg arg = {
3059		.res = &res,
3060		.fl6 = fl6,
3061		.gw  = &rdfl->gateway,
3062		.ret = &ret
3063	};
3064	struct fib6_info *rt;
3065	struct fib6_node *fn;
3066
3067	/* Get the "current" route for this destination and
3068	 * check if the redirect has come from appropriate router.
3069	 *
3070	 * RFC 4861 specifies that redirects should only be
3071	 * accepted if they come from the nexthop to the target.
3072	 * Due to the way the routes are chosen, this notion
3073	 * is a bit fuzzy and one might need to check all possible
3074	 * routes.
3075	 */
3076
3077	rcu_read_lock();
3078	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
3079restart:
3080	for_each_fib6_node_rt_rcu(fn) {
3081		res.f6i = rt;
3082		if (fib6_check_expired(rt))
3083			continue;
3084		if (rt->fib6_flags & RTF_REJECT)
3085			break;
3086		if (unlikely(rt->nh)) {
3087			if (nexthop_is_blackhole(rt->nh))
3088				continue;
3089			/* on match, res->nh is filled in and potentially ret */
3090			if (nexthop_for_each_fib6_nh(rt->nh,
3091						     fib6_nh_redirect_match,
3092						     &arg))
3093				goto out;
3094		} else {
3095			res.nh = rt->fib6_nh;
3096			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
3097						  &ret))
3098				goto out;
3099		}
3100	}
3101
3102	if (!rt)
3103		rt = net->ipv6.fib6_null_entry;
3104	else if (rt->fib6_flags & RTF_REJECT) {
3105		ret = net->ipv6.ip6_null_entry;
3106		goto out;
3107	}
3108
3109	if (rt == net->ipv6.fib6_null_entry) {
3110		fn = fib6_backtrack(fn, &fl6->saddr);
3111		if (fn)
3112			goto restart;
3113	}
3114
3115	res.f6i = rt;
3116	res.nh = rt->fib6_nh;
3117out:
3118	if (ret) {
3119		ip6_hold_safe(net, &ret);
3120	} else {
3121		res.fib6_flags = res.f6i->fib6_flags;
3122		res.fib6_type = res.f6i->fib6_type;
3123		ret = ip6_create_rt_rcu(&res);
3124	}
3125
3126	rcu_read_unlock();
3127
3128	trace_fib6_table_lookup(net, &res, table, fl6);
3129	return ret;
3130};
3131
3132static struct dst_entry *ip6_route_redirect(struct net *net,
3133					    const struct flowi6 *fl6,
3134					    const struct sk_buff *skb,
3135					    const struct in6_addr *gateway)
3136{
3137	int flags = RT6_LOOKUP_F_HAS_SADDR;
3138	struct ip6rd_flowi rdfl;
3139
3140	rdfl.fl6 = *fl6;
3141	rdfl.gateway = *gateway;
3142
3143	return fib6_rule_lookup(net, &rdfl.fl6, skb,
3144				flags, __ip6_route_redirect);
3145}
3146
3147void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
3148		  kuid_t uid)
3149{
3150	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
3151	struct dst_entry *dst;
3152	struct flowi6 fl6 = {
3153		.flowi6_iif = LOOPBACK_IFINDEX,
3154		.flowi6_oif = oif,
3155		.flowi6_mark = mark,
3156		.daddr = iph->daddr,
3157		.saddr = iph->saddr,
3158		.flowlabel = ip6_flowinfo(iph),
3159		.flowi6_uid = uid,
3160	};
3161
3162	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
3163	rt6_do_redirect(dst, NULL, skb);
3164	dst_release(dst);
3165}
3166EXPORT_SYMBOL_GPL(ip6_redirect);
3167
3168void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
3169{
3170	const struct ipv6hdr *iph = ipv6_hdr(skb);
3171	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
3172	struct dst_entry *dst;
3173	struct flowi6 fl6 = {
3174		.flowi6_iif = LOOPBACK_IFINDEX,
3175		.flowi6_oif = oif,
3176		.daddr = msg->dest,
3177		.saddr = iph->daddr,
3178		.flowi6_uid = sock_net_uid(net, NULL),
3179	};
3180
3181	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
3182	rt6_do_redirect(dst, NULL, skb);
3183	dst_release(dst);
3184}
3185
3186void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
3187{
3188	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
3189		     READ_ONCE(sk->sk_mark), sk->sk_uid);
3190}
3191EXPORT_SYMBOL_GPL(ip6_sk_redirect);
3192
3193static unsigned int ip6_default_advmss(const struct dst_entry *dst)
3194{
3195	struct net_device *dev = dst->dev;
3196	unsigned int mtu = dst_mtu(dst);
3197	struct net *net = dev_net(dev);
3198
3199	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
3200
3201	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
3202		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
3203
3204	/*
3205	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
3206	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
3207	 * IPV6_MAXPLEN is also valid and means: "any MSS,
3208	 * rely only on pmtu discovery"
3209	 */
3210	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
3211		mtu = IPV6_MAXPLEN;
3212	return mtu;
3213}
3214
3215INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
3216{
3217	return ip6_dst_mtu_maybe_forward(dst, false);
3218}
3219EXPORT_INDIRECT_CALLABLE(ip6_mtu);
3220
3221/* MTU selection:
3222 * 1. mtu on route is locked - use it
3223 * 2. mtu from nexthop exception
3224 * 3. mtu from egress device
3225 *
3226 * based on ip6_dst_mtu_forward and exception logic of
3227 * rt6_find_cached_rt; called with rcu_read_lock
3228 */
3229u32 ip6_mtu_from_fib6(const struct fib6_result *res,
3230		      const struct in6_addr *daddr,
3231		      const struct in6_addr *saddr)
3232{
3233	const struct fib6_nh *nh = res->nh;
3234	struct fib6_info *f6i = res->f6i;
3235	struct inet6_dev *idev;
3236	struct rt6_info *rt;
3237	u32 mtu = 0;
3238
3239	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
3240		mtu = f6i->fib6_pmtu;
3241		if (mtu)
3242			goto out;
3243	}
3244
3245	rt = rt6_find_cached_rt(res, daddr, saddr);
3246	if (unlikely(rt)) {
3247		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
3248	} else {
3249		struct net_device *dev = nh->fib_nh_dev;
3250
3251		mtu = IPV6_MIN_MTU;
3252		idev = __in6_dev_get(dev);
3253		if (idev)
3254			mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6));
3255	}
3256
3257	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
3258out:
3259	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
3260}
3261
3262struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
3263				  struct flowi6 *fl6)
3264{
3265	struct dst_entry *dst;
3266	struct rt6_info *rt;
3267	struct inet6_dev *idev = in6_dev_get(dev);
3268	struct net *net = dev_net(dev);
3269
3270	if (unlikely(!idev))
3271		return ERR_PTR(-ENODEV);
3272
3273	rt = ip6_dst_alloc(net, dev, 0);
3274	if (unlikely(!rt)) {
3275		in6_dev_put(idev);
3276		dst = ERR_PTR(-ENOMEM);
3277		goto out;
3278	}
3279
3280	rt->dst.input = ip6_input;
3281	rt->dst.output  = ip6_output;
3282	rt->rt6i_gateway  = fl6->daddr;
3283	rt->rt6i_dst.addr = fl6->daddr;
3284	rt->rt6i_dst.plen = 128;
3285	rt->rt6i_idev     = idev;
3286	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
3287
3288	/* Add this dst into uncached_list so that rt6_disable_ip() can
3289	 * do proper release of the net_device
3290	 */
3291	rt6_uncached_list_add(rt);
3292
3293	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
3294
3295out:
3296	return dst;
3297}
3298
3299static void ip6_dst_gc(struct dst_ops *ops)
3300{
3301	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
3302	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
3303	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
3304	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
3305	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
3306	unsigned int val;
3307	int entries;
3308
3309	if (time_after(rt_last_gc + rt_min_interval, jiffies))
3310		goto out;
3311
3312	fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
3313	entries = dst_entries_get_slow(ops);
3314	if (entries < ops->gc_thresh)
3315		atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
3316out:
3317	val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
3318	atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
3319}
3320
3321static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
3322			       const struct in6_addr *gw_addr, u32 tbid,
3323			       int flags, struct fib6_result *res)
3324{
3325	struct flowi6 fl6 = {
3326		.flowi6_oif = cfg->fc_ifindex,
3327		.daddr = *gw_addr,
3328		.saddr = cfg->fc_prefsrc,
3329	};
3330	struct fib6_table *table;
3331	int err;
3332
3333	table = fib6_get_table(net, tbid);
3334	if (!table)
3335		return -EINVAL;
3336
3337	if (!ipv6_addr_any(&cfg->fc_prefsrc))
3338		flags |= RT6_LOOKUP_F_HAS_SADDR;
3339
3340	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
3341
3342	err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
3343	if (!err && res->f6i != net->ipv6.fib6_null_entry)
3344		fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
3345				 cfg->fc_ifindex != 0, NULL, flags);
3346
3347	return err;
3348}
3349
3350static int ip6_route_check_nh_onlink(struct net *net,
3351				     struct fib6_config *cfg,
3352				     const struct net_device *dev,
3353				     struct netlink_ext_ack *extack)
3354{
3355	u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
3356	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3357	struct fib6_result res = {};
3358	int err;
3359
3360	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
3361	if (!err && !(res.fib6_flags & RTF_REJECT) &&
3362	    /* ignore match if it is the default route */
3363	    !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
3364	    (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
3365		NL_SET_ERR_MSG(extack,
3366			       "Nexthop has invalid gateway or device mismatch");
3367		err = -EINVAL;
3368	}
3369
3370	return err;
3371}
3372
3373static int ip6_route_check_nh(struct net *net,
3374			      struct fib6_config *cfg,
3375			      struct net_device **_dev,
3376			      netdevice_tracker *dev_tracker,
3377			      struct inet6_dev **idev)
3378{
3379	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3380	struct net_device *dev = _dev ? *_dev : NULL;
3381	int flags = RT6_LOOKUP_F_IFACE;
3382	struct fib6_result res = {};
3383	int err = -EHOSTUNREACH;
3384
3385	if (cfg->fc_table) {
3386		err = ip6_nh_lookup_table(net, cfg, gw_addr,
3387					  cfg->fc_table, flags, &res);
3388		/* gw_addr can not require a gateway or resolve to a reject
3389		 * route. If a device is given, it must match the result.
3390		 */
3391		if (err || res.fib6_flags & RTF_REJECT ||
3392		    res.nh->fib_nh_gw_family ||
3393		    (dev && dev != res.nh->fib_nh_dev))
3394			err = -EHOSTUNREACH;
3395	}
3396
3397	if (err < 0) {
3398		struct flowi6 fl6 = {
3399			.flowi6_oif = cfg->fc_ifindex,
3400			.daddr = *gw_addr,
3401		};
3402
3403		err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
3404		if (err || res.fib6_flags & RTF_REJECT ||
3405		    res.nh->fib_nh_gw_family)
3406			err = -EHOSTUNREACH;
3407
3408		if (err)
3409			return err;
3410
3411		fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
3412				 cfg->fc_ifindex != 0, NULL, flags);
3413	}
3414
3415	err = 0;
3416	if (dev) {
3417		if (dev != res.nh->fib_nh_dev)
3418			err = -EHOSTUNREACH;
3419	} else {
3420		*_dev = dev = res.nh->fib_nh_dev;
3421		netdev_hold(dev, dev_tracker, GFP_ATOMIC);
3422		*idev = in6_dev_get(dev);
3423	}
3424
3425	return err;
3426}
3427
3428static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
3429			   struct net_device **_dev,
3430			   netdevice_tracker *dev_tracker,
3431			   struct inet6_dev **idev,
3432			   struct netlink_ext_ack *extack)
3433{
3434	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3435	int gwa_type = ipv6_addr_type(gw_addr);
3436	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
3437	const struct net_device *dev = *_dev;
3438	bool need_addr_check = !dev;
3439	int err = -EINVAL;
3440
3441	/* if gw_addr is local we will fail to detect this in case
3442	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
3443	 * will return already-added prefix route via interface that
3444	 * prefix route was assigned to, which might be non-loopback.
3445	 */
3446	if (dev &&
3447	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3448		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3449		goto out;
3450	}
3451
3452	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
3453		/* IPv6 strictly inhibits using not link-local
3454		 * addresses as nexthop address.
3455		 * Otherwise, router will not able to send redirects.
3456		 * It is very good, but in some (rare!) circumstances
3457		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
3458		 * some exceptions. --ANK
3459		 * We allow IPv4-mapped nexthops to support RFC4798-type
3460		 * addressing
3461		 */
3462		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
3463			NL_SET_ERR_MSG(extack, "Invalid gateway address");
3464			goto out;
3465		}
3466
3467		rcu_read_lock();
3468
3469		if (cfg->fc_flags & RTNH_F_ONLINK)
3470			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
3471		else
3472			err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
3473						 idev);
3474
3475		rcu_read_unlock();
3476
3477		if (err)
3478			goto out;
3479	}
3480
3481	/* reload in case device was changed */
3482	dev = *_dev;
3483
3484	err = -EINVAL;
3485	if (!dev) {
3486		NL_SET_ERR_MSG(extack, "Egress device not specified");
3487		goto out;
3488	} else if (dev->flags & IFF_LOOPBACK) {
3489		NL_SET_ERR_MSG(extack,
3490			       "Egress device can not be loopback device for this route");
3491		goto out;
3492	}
3493
3494	/* if we did not check gw_addr above, do so now that the
3495	 * egress device has been resolved.
3496	 */
3497	if (need_addr_check &&
3498	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3499		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3500		goto out;
3501	}
3502
3503	err = 0;
3504out:
3505	return err;
3506}
3507
3508static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
3509{
3510	if ((flags & RTF_REJECT) ||
3511	    (dev && (dev->flags & IFF_LOOPBACK) &&
3512	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3513	     !(flags & (RTF_ANYCAST | RTF_LOCAL))))
3514		return true;
3515
3516	return false;
3517}
3518
3519int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
3520		 struct fib6_config *cfg, gfp_t gfp_flags,
3521		 struct netlink_ext_ack *extack)
3522{
3523	netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
3524	struct net_device *dev = NULL;
3525	struct inet6_dev *idev = NULL;
3526	int addr_type;
3527	int err;
3528
3529	fib6_nh->fib_nh_family = AF_INET6;
3530#ifdef CONFIG_IPV6_ROUTER_PREF
3531	fib6_nh->last_probe = jiffies;
3532#endif
3533	if (cfg->fc_is_fdb) {
3534		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3535		fib6_nh->fib_nh_gw_family = AF_INET6;
3536		return 0;
3537	}
3538
3539	err = -ENODEV;
3540	if (cfg->fc_ifindex) {
3541		dev = netdev_get_by_index(net, cfg->fc_ifindex,
3542					  dev_tracker, gfp_flags);
3543		if (!dev)
3544			goto out;
3545		idev = in6_dev_get(dev);
3546		if (!idev)
3547			goto out;
3548	}
3549
3550	if (cfg->fc_flags & RTNH_F_ONLINK) {
3551		if (!dev) {
3552			NL_SET_ERR_MSG(extack,
3553				       "Nexthop device required for onlink");
3554			goto out;
3555		}
3556
3557		if (!(dev->flags & IFF_UP)) {
3558			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3559			err = -ENETDOWN;
3560			goto out;
3561		}
3562
3563		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3564	}
3565
3566	fib6_nh->fib_nh_weight = 1;
3567
3568	/* We cannot add true routes via loopback here,
3569	 * they would result in kernel looping; promote them to reject routes
3570	 */
3571	addr_type = ipv6_addr_type(&cfg->fc_dst);
3572	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3573		/* hold loopback dev/idev if we haven't done so. */
3574		if (dev != net->loopback_dev) {
3575			if (dev) {
3576				netdev_put(dev, dev_tracker);
3577				in6_dev_put(idev);
3578			}
3579			dev = net->loopback_dev;
3580			netdev_hold(dev, dev_tracker, gfp_flags);
3581			idev = in6_dev_get(dev);
3582			if (!idev) {
3583				err = -ENODEV;
3584				goto out;
3585			}
3586		}
3587		goto pcpu_alloc;
3588	}
3589
3590	if (cfg->fc_flags & RTF_GATEWAY) {
3591		err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
3592				      &idev, extack);
3593		if (err)
3594			goto out;
3595
3596		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3597		fib6_nh->fib_nh_gw_family = AF_INET6;
3598	}
3599
3600	err = -ENODEV;
3601	if (!dev)
3602		goto out;
3603
3604	if (idev->cnf.disable_ipv6) {
3605		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3606		err = -EACCES;
3607		goto out;
3608	}
3609
3610	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3611		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3612		err = -ENETDOWN;
3613		goto out;
3614	}
3615
3616	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3617	    !netif_carrier_ok(dev))
3618		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3619
3620	err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
3621				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3622	if (err)
3623		goto out;
3624
3625pcpu_alloc:
3626	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3627	if (!fib6_nh->rt6i_pcpu) {
3628		err = -ENOMEM;
3629		goto out;
3630	}
3631
3632	fib6_nh->fib_nh_dev = dev;
3633	fib6_nh->fib_nh_oif = dev->ifindex;
3634	err = 0;
3635out:
3636	if (idev)
3637		in6_dev_put(idev);
3638
3639	if (err) {
3640		lwtstate_put(fib6_nh->fib_nh_lws);
3641		fib6_nh->fib_nh_lws = NULL;
3642		netdev_put(dev, dev_tracker);
3643	}
3644
3645	return err;
3646}
3647
3648void fib6_nh_release(struct fib6_nh *fib6_nh)
3649{
3650	struct rt6_exception_bucket *bucket;
3651
3652	rcu_read_lock();
3653
3654	fib6_nh_flush_exceptions(fib6_nh, NULL);
3655	bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
3656	if (bucket) {
3657		rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
3658		kfree(bucket);
3659	}
3660
3661	rcu_read_unlock();
3662
3663	fib6_nh_release_dsts(fib6_nh);
3664	free_percpu(fib6_nh->rt6i_pcpu);
3665
3666	fib_nh_common_release(&fib6_nh->nh_common);
3667}
3668
3669void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
3670{
3671	int cpu;
3672
3673	if (!fib6_nh->rt6i_pcpu)
3674		return;
3675
3676	for_each_possible_cpu(cpu) {
3677		struct rt6_info *pcpu_rt, **ppcpu_rt;
3678
3679		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3680		pcpu_rt = xchg(ppcpu_rt, NULL);
3681		if (pcpu_rt) {
3682			dst_dev_put(&pcpu_rt->dst);
3683			dst_release(&pcpu_rt->dst);
3684		}
3685	}
3686}
3687
3688static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3689					      gfp_t gfp_flags,
3690					      struct netlink_ext_ack *extack)
3691{
3692	struct net *net = cfg->fc_nlinfo.nl_net;
3693	struct fib6_info *rt = NULL;
3694	struct nexthop *nh = NULL;
3695	struct fib6_table *table;
3696	struct fib6_nh *fib6_nh;
3697	int err = -EINVAL;
3698	int addr_type;
3699
3700	/* RTF_PCPU is an internal flag; can not be set by userspace */
3701	if (cfg->fc_flags & RTF_PCPU) {
3702		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3703		goto out;
3704	}
3705
3706	/* RTF_CACHE is an internal flag; can not be set by userspace */
3707	if (cfg->fc_flags & RTF_CACHE) {
3708		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3709		goto out;
3710	}
3711
3712	if (cfg->fc_type > RTN_MAX) {
3713		NL_SET_ERR_MSG(extack, "Invalid route type");
3714		goto out;
3715	}
3716
3717	if (cfg->fc_dst_len > 128) {
3718		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3719		goto out;
3720	}
3721	if (cfg->fc_src_len > 128) {
3722		NL_SET_ERR_MSG(extack, "Invalid source address length");
3723		goto out;
3724	}
3725#ifndef CONFIG_IPV6_SUBTREES
3726	if (cfg->fc_src_len) {
3727		NL_SET_ERR_MSG(extack,
3728			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3729		goto out;
3730	}
3731#endif
3732	if (cfg->fc_nh_id) {
3733		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
3734		if (!nh) {
3735			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
3736			goto out;
3737		}
3738		err = fib6_check_nexthop(nh, cfg, extack);
3739		if (err)
3740			goto out;
3741	}
3742
3743	err = -ENOBUFS;
3744	if (cfg->fc_nlinfo.nlh &&
3745	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3746		table = fib6_get_table(net, cfg->fc_table);
3747		if (!table) {
3748			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3749			table = fib6_new_table(net, cfg->fc_table);
3750		}
3751	} else {
3752		table = fib6_new_table(net, cfg->fc_table);
3753	}
3754
3755	if (!table)
3756		goto out;
3757
3758	err = -ENOMEM;
3759	rt = fib6_info_alloc(gfp_flags, !nh);
3760	if (!rt)
3761		goto out;
3762
3763	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3764					       extack);
3765	if (IS_ERR(rt->fib6_metrics)) {
3766		err = PTR_ERR(rt->fib6_metrics);
3767		/* Do not leave garbage there. */
3768		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3769		goto out_free;
3770	}
3771
3772	if (cfg->fc_flags & RTF_ADDRCONF)
3773		rt->dst_nocount = true;
3774
3775	if (cfg->fc_flags & RTF_EXPIRES)
3776		fib6_set_expires(rt, jiffies +
3777				clock_t_to_jiffies(cfg->fc_expires));
3778
3779	if (cfg->fc_protocol == RTPROT_UNSPEC)
3780		cfg->fc_protocol = RTPROT_BOOT;
3781	rt->fib6_protocol = cfg->fc_protocol;
3782
3783	rt->fib6_table = table;
3784	rt->fib6_metric = cfg->fc_metric;
3785	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3786	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3787
3788	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3789	rt->fib6_dst.plen = cfg->fc_dst_len;
3790
3791#ifdef CONFIG_IPV6_SUBTREES
3792	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3793	rt->fib6_src.plen = cfg->fc_src_len;
3794#endif
3795	if (nh) {
3796		if (rt->fib6_src.plen) {
3797			NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
3798			goto out_free;
3799		}
3800		if (!nexthop_get(nh)) {
3801			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
3802			goto out_free;
3803		}
3804		rt->nh = nh;
3805		fib6_nh = nexthop_fib6_nh(rt->nh);
3806	} else {
3807		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
3808		if (err)
3809			goto out;
3810
3811		fib6_nh = rt->fib6_nh;
3812
3813		/* We cannot add true routes via loopback here, they would
3814		 * result in kernel looping; promote them to reject routes
3815		 */
3816		addr_type = ipv6_addr_type(&cfg->fc_dst);
3817		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
3818				   addr_type))
3819			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3820	}
3821
3822	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3823		struct net_device *dev = fib6_nh->fib_nh_dev;
3824
3825		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3826			NL_SET_ERR_MSG(extack, "Invalid source address");
3827			err = -EINVAL;
3828			goto out;
3829		}
3830		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3831		rt->fib6_prefsrc.plen = 128;
3832	} else
3833		rt->fib6_prefsrc.plen = 0;
3834
3835	return rt;
3836out:
3837	fib6_info_release(rt);
3838	return ERR_PTR(err);
3839out_free:
3840	ip_fib_metrics_put(rt->fib6_metrics);
3841	kfree(rt);
3842	return ERR_PTR(err);
3843}
3844
3845int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3846		  struct netlink_ext_ack *extack)
3847{
3848	struct fib6_info *rt;
3849	int err;
3850
3851	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3852	if (IS_ERR(rt))
3853		return PTR_ERR(rt);
3854
3855	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3856	fib6_info_release(rt);
3857
3858	return err;
3859}
3860
3861static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3862{
3863	struct net *net = info->nl_net;
3864	struct fib6_table *table;
3865	int err;
3866
3867	if (rt == net->ipv6.fib6_null_entry) {
3868		err = -ENOENT;
3869		goto out;
3870	}
3871
3872	table = rt->fib6_table;
3873	spin_lock_bh(&table->tb6_lock);
3874	err = fib6_del(rt, info);
3875	spin_unlock_bh(&table->tb6_lock);
3876
3877out:
3878	fib6_info_release(rt);
3879	return err;
3880}
3881
3882int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
3883{
3884	struct nl_info info = {
3885		.nl_net = net,
3886		.skip_notify = skip_notify
3887	};
3888
3889	return __ip6_del_rt(rt, &info);
3890}
3891
3892static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3893{
3894	struct nl_info *info = &cfg->fc_nlinfo;
3895	struct net *net = info->nl_net;
3896	struct sk_buff *skb = NULL;
3897	struct fib6_table *table;
3898	int err = -ENOENT;
3899
3900	if (rt == net->ipv6.fib6_null_entry)
3901		goto out_put;
3902	table = rt->fib6_table;
3903	spin_lock_bh(&table->tb6_lock);
3904
3905	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3906		struct fib6_info *sibling, *next_sibling;
3907		struct fib6_node *fn;
3908
3909		/* prefer to send a single notification with all hops */
3910		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3911		if (skb) {
3912			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3913
3914			if (rt6_fill_node(net, skb, rt, NULL,
3915					  NULL, NULL, 0, RTM_DELROUTE,
3916					  info->portid, seq, 0) < 0) {
3917				kfree_skb(skb);
3918				skb = NULL;
3919			} else
3920				info->skip_notify = 1;
3921		}
3922
3923		/* 'rt' points to the first sibling route. If it is not the
3924		 * leaf, then we do not need to send a notification. Otherwise,
3925		 * we need to check if the last sibling has a next route or not
3926		 * and emit a replace or delete notification, respectively.
3927		 */
3928		info->skip_notify_kernel = 1;
3929		fn = rcu_dereference_protected(rt->fib6_node,
3930					    lockdep_is_held(&table->tb6_lock));
3931		if (rcu_access_pointer(fn->leaf) == rt) {
3932			struct fib6_info *last_sibling, *replace_rt;
3933
3934			last_sibling = list_last_entry(&rt->fib6_siblings,
3935						       struct fib6_info,
3936						       fib6_siblings);
3937			replace_rt = rcu_dereference_protected(
3938					    last_sibling->fib6_next,
3939					    lockdep_is_held(&table->tb6_lock));
3940			if (replace_rt)
3941				call_fib6_entry_notifiers_replace(net,
3942								  replace_rt);
3943			else
3944				call_fib6_multipath_entry_notifiers(net,
3945						       FIB_EVENT_ENTRY_DEL,
3946						       rt, rt->fib6_nsiblings,
3947						       NULL);
3948		}
3949		list_for_each_entry_safe(sibling, next_sibling,
3950					 &rt->fib6_siblings,
3951					 fib6_siblings) {
3952			err = fib6_del(sibling, info);
3953			if (err)
3954				goto out_unlock;
3955		}
3956	}
3957
3958	err = fib6_del(rt, info);
3959out_unlock:
3960	spin_unlock_bh(&table->tb6_lock);
3961out_put:
3962	fib6_info_release(rt);
3963
3964	if (skb) {
3965		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3966			    info->nlh, gfp_any());
3967	}
3968	return err;
3969}
3970
3971static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3972{
3973	int rc = -ESRCH;
3974
3975	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3976		goto out;
3977
3978	if (cfg->fc_flags & RTF_GATEWAY &&
3979	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3980		goto out;
3981
3982	rc = rt6_remove_exception_rt(rt);
3983out:
3984	return rc;
3985}
3986
3987static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
3988			     struct fib6_nh *nh)
3989{
3990	struct fib6_result res = {
3991		.f6i = rt,
3992		.nh = nh,
3993	};
3994	struct rt6_info *rt_cache;
3995
3996	rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
3997	if (rt_cache)
3998		return __ip6_del_cached_rt(rt_cache, cfg);
3999
4000	return 0;
4001}
4002
4003struct fib6_nh_del_cached_rt_arg {
4004	struct fib6_config *cfg;
4005	struct fib6_info *f6i;
4006};
4007
4008static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
4009{
4010	struct fib6_nh_del_cached_rt_arg *arg = _arg;
4011	int rc;
4012
4013	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
4014	return rc != -ESRCH ? rc : 0;
4015}
4016
4017static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
4018{
4019	struct fib6_nh_del_cached_rt_arg arg = {
4020		.cfg = cfg,
4021		.f6i = f6i
4022	};
4023
4024	return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
4025}
4026
4027static int ip6_route_del(struct fib6_config *cfg,
4028			 struct netlink_ext_ack *extack)
4029{
4030	struct fib6_table *table;
4031	struct fib6_info *rt;
4032	struct fib6_node *fn;
4033	int err = -ESRCH;
4034
4035	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
4036	if (!table) {
4037		NL_SET_ERR_MSG(extack, "FIB table does not exist");
4038		return err;
4039	}
4040
4041	rcu_read_lock();
4042
4043	fn = fib6_locate(&table->tb6_root,
4044			 &cfg->fc_dst, cfg->fc_dst_len,
4045			 &cfg->fc_src, cfg->fc_src_len,
4046			 !(cfg->fc_flags & RTF_CACHE));
4047
4048	if (fn) {
4049		for_each_fib6_node_rt_rcu(fn) {
4050			struct fib6_nh *nh;
4051
4052			if (rt->nh && cfg->fc_nh_id &&
4053			    rt->nh->id != cfg->fc_nh_id)
4054				continue;
4055
4056			if (cfg->fc_flags & RTF_CACHE) {
4057				int rc = 0;
4058
4059				if (rt->nh) {
4060					rc = ip6_del_cached_rt_nh(cfg, rt);
4061				} else if (cfg->fc_nh_id) {
4062					continue;
4063				} else {
4064					nh = rt->fib6_nh;
4065					rc = ip6_del_cached_rt(cfg, rt, nh);
4066				}
4067				if (rc != -ESRCH) {
4068					rcu_read_unlock();
4069					return rc;
4070				}
4071				continue;
4072			}
4073
4074			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
4075				continue;
4076			if (cfg->fc_protocol &&
4077			    cfg->fc_protocol != rt->fib6_protocol)
4078				continue;
4079
4080			if (rt->nh) {
4081				if (!fib6_info_hold_safe(rt))
4082					continue;
4083				rcu_read_unlock();
4084
4085				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
4086			}
4087			if (cfg->fc_nh_id)
4088				continue;
4089
4090			nh = rt->fib6_nh;
4091			if (cfg->fc_ifindex &&
4092			    (!nh->fib_nh_dev ||
4093			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
4094				continue;
4095			if (cfg->fc_flags & RTF_GATEWAY &&
4096			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
4097				continue;
4098			if (!fib6_info_hold_safe(rt))
4099				continue;
4100			rcu_read_unlock();
4101
4102			/* if gateway was specified only delete the one hop */
4103			if (cfg->fc_flags & RTF_GATEWAY)
4104				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
4105
4106			return __ip6_del_rt_siblings(rt, cfg);
4107		}
4108	}
4109	rcu_read_unlock();
4110
4111	return err;
4112}
4113
4114static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
4115{
4116	struct netevent_redirect netevent;
4117	struct rt6_info *rt, *nrt = NULL;
4118	struct fib6_result res = {};
4119	struct ndisc_options ndopts;
4120	struct inet6_dev *in6_dev;
4121	struct neighbour *neigh;
4122	struct rd_msg *msg;
4123	int optlen, on_link;
4124	u8 *lladdr;
4125
4126	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
4127	optlen -= sizeof(*msg);
4128
4129	if (optlen < 0) {
4130		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
4131		return;
4132	}
4133
4134	msg = (struct rd_msg *)icmp6_hdr(skb);
4135
4136	if (ipv6_addr_is_multicast(&msg->dest)) {
4137		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
4138		return;
4139	}
4140
4141	on_link = 0;
4142	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
4143		on_link = 1;
4144	} else if (ipv6_addr_type(&msg->target) !=
4145		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
4146		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
4147		return;
4148	}
4149
4150	in6_dev = __in6_dev_get(skb->dev);
4151	if (!in6_dev)
4152		return;
4153	if (READ_ONCE(in6_dev->cnf.forwarding) ||
4154	    !READ_ONCE(in6_dev->cnf.accept_redirects))
4155		return;
4156
4157	/* RFC2461 8.1:
4158	 *	The IP source address of the Redirect MUST be the same as the current
4159	 *	first-hop router for the specified ICMP Destination Address.
4160	 */
4161
4162	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
4163		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
4164		return;
4165	}
4166
4167	lladdr = NULL;
4168	if (ndopts.nd_opts_tgt_lladdr) {
4169		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
4170					     skb->dev);
4171		if (!lladdr) {
4172			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
4173			return;
4174		}
4175	}
4176
4177	rt = (struct rt6_info *) dst;
4178	if (rt->rt6i_flags & RTF_REJECT) {
4179		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
4180		return;
4181	}
4182
4183	/* Redirect received -> path was valid.
4184	 * Look, redirects are sent only in response to data packets,
4185	 * so that this nexthop apparently is reachable. --ANK
4186	 */
4187	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
4188
4189	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
4190	if (!neigh)
4191		return;
4192
4193	/*
4194	 *	We have finally decided to accept it.
4195	 */
4196
4197	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
4198		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
4199		     NEIGH_UPDATE_F_OVERRIDE|
4200		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
4201				     NEIGH_UPDATE_F_ISROUTER)),
4202		     NDISC_REDIRECT, &ndopts);
4203
4204	rcu_read_lock();
4205	res.f6i = rcu_dereference(rt->from);
4206	if (!res.f6i)
4207		goto out;
4208
4209	if (res.f6i->nh) {
4210		struct fib6_nh_match_arg arg = {
4211			.dev = dst->dev,
4212			.gw = &rt->rt6i_gateway,
4213		};
4214
4215		nexthop_for_each_fib6_nh(res.f6i->nh,
4216					 fib6_nh_find_match, &arg);
4217
4218		/* fib6_info uses a nexthop that does not have fib6_nh
4219		 * using the dst->dev. Should be impossible
4220		 */
4221		if (!arg.match)
4222			goto out;
4223		res.nh = arg.match;
4224	} else {
4225		res.nh = res.f6i->fib6_nh;
4226	}
4227
4228	res.fib6_flags = res.f6i->fib6_flags;
4229	res.fib6_type = res.f6i->fib6_type;
4230	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
4231	if (!nrt)
4232		goto out;
4233
4234	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
4235	if (on_link)
4236		nrt->rt6i_flags &= ~RTF_GATEWAY;
4237
4238	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
4239
4240	/* rt6_insert_exception() will take care of duplicated exceptions */
4241	if (rt6_insert_exception(nrt, &res)) {
4242		dst_release_immediate(&nrt->dst);
4243		goto out;
4244	}
4245
4246	netevent.old = &rt->dst;
4247	netevent.new = &nrt->dst;
4248	netevent.daddr = &msg->dest;
4249	netevent.neigh = neigh;
4250	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
4251
4252out:
4253	rcu_read_unlock();
4254	neigh_release(neigh);
4255}
4256
4257#ifdef CONFIG_IPV6_ROUTE_INFO
4258static struct fib6_info *rt6_get_route_info(struct net *net,
4259					   const struct in6_addr *prefix, int prefixlen,
4260					   const struct in6_addr *gwaddr,
4261					   struct net_device *dev)
4262{
4263	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4264	int ifindex = dev->ifindex;
4265	struct fib6_node *fn;
4266	struct fib6_info *rt = NULL;
4267	struct fib6_table *table;
4268
4269	table = fib6_get_table(net, tb_id);
4270	if (!table)
4271		return NULL;
4272
4273	rcu_read_lock();
4274	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
4275	if (!fn)
4276		goto out;
4277
4278	for_each_fib6_node_rt_rcu(fn) {
4279		/* these routes do not use nexthops */
4280		if (rt->nh)
4281			continue;
4282		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
4283			continue;
4284		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
4285		    !rt->fib6_nh->fib_nh_gw_family)
4286			continue;
4287		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
4288			continue;
4289		if (!fib6_info_hold_safe(rt))
4290			continue;
4291		break;
4292	}
4293out:
4294	rcu_read_unlock();
4295	return rt;
4296}
4297
4298static struct fib6_info *rt6_add_route_info(struct net *net,
4299					   const struct in6_addr *prefix, int prefixlen,
4300					   const struct in6_addr *gwaddr,
4301					   struct net_device *dev,
4302					   unsigned int pref)
4303{
4304	struct fib6_config cfg = {
4305		.fc_metric	= IP6_RT_PRIO_USER,
4306		.fc_ifindex	= dev->ifindex,
4307		.fc_dst_len	= prefixlen,
4308		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
4309				  RTF_UP | RTF_PREF(pref),
4310		.fc_protocol = RTPROT_RA,
4311		.fc_type = RTN_UNICAST,
4312		.fc_nlinfo.portid = 0,
4313		.fc_nlinfo.nlh = NULL,
4314		.fc_nlinfo.nl_net = net,
4315	};
4316
4317	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4318	cfg.fc_dst = *prefix;
4319	cfg.fc_gateway = *gwaddr;
4320
4321	/* We should treat it as a default route if prefix length is 0. */
4322	if (!prefixlen)
4323		cfg.fc_flags |= RTF_DEFAULT;
4324
4325	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
4326
4327	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
4328}
4329#endif
4330
4331struct fib6_info *rt6_get_dflt_router(struct net *net,
4332				     const struct in6_addr *addr,
4333				     struct net_device *dev)
4334{
4335	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
4336	struct fib6_info *rt;
4337	struct fib6_table *table;
4338
4339	table = fib6_get_table(net, tb_id);
4340	if (!table)
4341		return NULL;
4342
4343	rcu_read_lock();
4344	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4345		struct fib6_nh *nh;
4346
4347		/* RA routes do not use nexthops */
4348		if (rt->nh)
4349			continue;
4350
4351		nh = rt->fib6_nh;
4352		if (dev == nh->fib_nh_dev &&
4353		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
4354		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
4355			break;
4356	}
4357	if (rt && !fib6_info_hold_safe(rt))
4358		rt = NULL;
4359	rcu_read_unlock();
4360	return rt;
4361}
4362
4363struct fib6_info *rt6_add_dflt_router(struct net *net,
4364				     const struct in6_addr *gwaddr,
4365				     struct net_device *dev,
4366				     unsigned int pref,
4367				     u32 defrtr_usr_metric,
4368				     int lifetime)
4369{
4370	struct fib6_config cfg = {
4371		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
4372		.fc_metric	= defrtr_usr_metric,
4373		.fc_ifindex	= dev->ifindex,
4374		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
4375				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
4376		.fc_protocol = RTPROT_RA,
4377		.fc_type = RTN_UNICAST,
4378		.fc_nlinfo.portid = 0,
4379		.fc_nlinfo.nlh = NULL,
4380		.fc_nlinfo.nl_net = net,
4381		.fc_expires = jiffies_to_clock_t(lifetime * HZ),
4382	};
4383
4384	cfg.fc_gateway = *gwaddr;
4385
4386	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
4387		struct fib6_table *table;
4388
4389		table = fib6_get_table(dev_net(dev), cfg.fc_table);
4390		if (table)
4391			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
4392	}
4393
4394	return rt6_get_dflt_router(net, gwaddr, dev);
4395}
4396
4397static void __rt6_purge_dflt_routers(struct net *net,
4398				     struct fib6_table *table)
4399{
4400	struct fib6_info *rt;
4401
4402restart:
4403	rcu_read_lock();
4404	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4405		struct net_device *dev = fib6_info_nh_dev(rt);
4406		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
4407
4408		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
4409		    (!idev || idev->cnf.accept_ra != 2) &&
4410		    fib6_info_hold_safe(rt)) {
4411			rcu_read_unlock();
4412			ip6_del_rt(net, rt, false);
4413			goto restart;
4414		}
4415	}
4416	rcu_read_unlock();
4417
4418	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
4419}
4420
4421void rt6_purge_dflt_routers(struct net *net)
4422{
4423	struct fib6_table *table;
4424	struct hlist_head *head;
4425	unsigned int h;
4426
4427	rcu_read_lock();
4428
4429	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
4430		head = &net->ipv6.fib_table_hash[h];
4431		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
4432			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
4433				__rt6_purge_dflt_routers(net, table);
4434		}
4435	}
4436
4437	rcu_read_unlock();
4438}
4439
4440static void rtmsg_to_fib6_config(struct net *net,
4441				 struct in6_rtmsg *rtmsg,
4442				 struct fib6_config *cfg)
4443{
4444	*cfg = (struct fib6_config){
4445		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
4446			 : RT6_TABLE_MAIN,
4447		.fc_ifindex = rtmsg->rtmsg_ifindex,
4448		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
4449		.fc_expires = rtmsg->rtmsg_info,
4450		.fc_dst_len = rtmsg->rtmsg_dst_len,
4451		.fc_src_len = rtmsg->rtmsg_src_len,
4452		.fc_flags = rtmsg->rtmsg_flags,
4453		.fc_type = rtmsg->rtmsg_type,
4454
4455		.fc_nlinfo.nl_net = net,
4456
4457		.fc_dst = rtmsg->rtmsg_dst,
4458		.fc_src = rtmsg->rtmsg_src,
4459		.fc_gateway = rtmsg->rtmsg_gateway,
4460	};
4461}
4462
4463int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
4464{
4465	struct fib6_config cfg;
4466	int err;
4467
4468	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
4469		return -EINVAL;
4470	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
4471		return -EPERM;
4472
4473	rtmsg_to_fib6_config(net, rtmsg, &cfg);
4474
4475	rtnl_lock();
4476	switch (cmd) {
4477	case SIOCADDRT:
4478		err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
4479		break;
4480	case SIOCDELRT:
4481		err = ip6_route_del(&cfg, NULL);
4482		break;
4483	}
4484	rtnl_unlock();
4485	return err;
4486}
4487
4488/*
4489 *	Drop the packet on the floor
4490 */
4491
4492static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
4493{
4494	struct dst_entry *dst = skb_dst(skb);
4495	struct net *net = dev_net(dst->dev);
4496	struct inet6_dev *idev;
4497	SKB_DR(reason);
4498	int type;
4499
4500	if (netif_is_l3_master(skb->dev) ||
4501	    dst->dev == net->loopback_dev)
4502		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
4503	else
4504		idev = ip6_dst_idev(dst);
4505
4506	switch (ipstats_mib_noroutes) {
4507	case IPSTATS_MIB_INNOROUTES:
4508		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
4509		if (type == IPV6_ADDR_ANY) {
4510			SKB_DR_SET(reason, IP_INADDRERRORS);
4511			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
4512			break;
4513		}
4514		SKB_DR_SET(reason, IP_INNOROUTES);
4515		fallthrough;
4516	case IPSTATS_MIB_OUTNOROUTES:
4517		SKB_DR_OR(reason, IP_OUTNOROUTES);
4518		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
4519		break;
4520	}
4521
4522	/* Start over by dropping the dst for l3mdev case */
4523	if (netif_is_l3_master(skb->dev))
4524		skb_dst_drop(skb);
4525
4526	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
4527	kfree_skb_reason(skb, reason);
4528	return 0;
4529}
4530
4531static int ip6_pkt_discard(struct sk_buff *skb)
4532{
4533	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
4534}
4535
4536static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4537{
4538	skb->dev = skb_dst(skb)->dev;
4539	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
4540}
4541
4542static int ip6_pkt_prohibit(struct sk_buff *skb)
4543{
4544	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
4545}
4546
4547static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4548{
4549	skb->dev = skb_dst(skb)->dev;
4550	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
4551}
4552
4553/*
4554 *	Allocate a dst for local (unicast / anycast) address.
4555 */
4556
4557struct fib6_info *addrconf_f6i_alloc(struct net *net,
4558				     struct inet6_dev *idev,
4559				     const struct in6_addr *addr,
4560				     bool anycast, gfp_t gfp_flags,
4561				     struct netlink_ext_ack *extack)
4562{
4563	struct fib6_config cfg = {
4564		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
4565		.fc_ifindex = idev->dev->ifindex,
4566		.fc_flags = RTF_UP | RTF_NONEXTHOP,
4567		.fc_dst = *addr,
4568		.fc_dst_len = 128,
4569		.fc_protocol = RTPROT_KERNEL,
4570		.fc_nlinfo.nl_net = net,
4571		.fc_ignore_dev_down = true,
4572	};
4573	struct fib6_info *f6i;
4574
4575	if (anycast) {
4576		cfg.fc_type = RTN_ANYCAST;
4577		cfg.fc_flags |= RTF_ANYCAST;
4578	} else {
4579		cfg.fc_type = RTN_LOCAL;
4580		cfg.fc_flags |= RTF_LOCAL;
4581	}
4582
4583	f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
4584	if (!IS_ERR(f6i)) {
4585		f6i->dst_nocount = true;
4586
4587		if (!anycast &&
4588		    (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
4589		     READ_ONCE(idev->cnf.disable_policy)))
4590			f6i->dst_nopolicy = true;
4591	}
4592
4593	return f6i;
4594}
4595
4596/* remove deleted ip from prefsrc entries */
4597struct arg_dev_net_ip {
4598	struct net *net;
4599	struct in6_addr *addr;
4600};
4601
4602static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
4603{
4604	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
4605	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
4606
4607	if (!rt->nh &&
4608	    rt != net->ipv6.fib6_null_entry &&
4609	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) &&
4610	    !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) {
4611		spin_lock_bh(&rt6_exception_lock);
4612		/* remove prefsrc entry */
4613		rt->fib6_prefsrc.plen = 0;
4614		spin_unlock_bh(&rt6_exception_lock);
4615	}
4616	return 0;
4617}
4618
4619void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
4620{
4621	struct net *net = dev_net(ifp->idev->dev);
4622	struct arg_dev_net_ip adni = {
4623		.net = net,
4624		.addr = &ifp->addr,
4625	};
4626	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
4627}
4628
4629#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
4630
4631/* Remove routers and update dst entries when gateway turn into host. */
4632static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
4633{
4634	struct in6_addr *gateway = (struct in6_addr *)arg;
4635	struct fib6_nh *nh;
4636
4637	/* RA routes do not use nexthops */
4638	if (rt->nh)
4639		return 0;
4640
4641	nh = rt->fib6_nh;
4642	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
4643	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
4644		return -1;
4645
4646	/* Further clean up cached routes in exception table.
4647	 * This is needed because cached route may have a different
4648	 * gateway than its 'parent' in the case of an ip redirect.
4649	 */
4650	fib6_nh_exceptions_clean_tohost(nh, gateway);
4651
4652	return 0;
4653}
4654
4655void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
4656{
4657	fib6_clean_all(net, fib6_clean_tohost, gateway);
4658}
4659
4660struct arg_netdev_event {
4661	const struct net_device *dev;
4662	union {
4663		unsigned char nh_flags;
4664		unsigned long event;
4665	};
4666};
4667
4668static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
4669{
4670	struct fib6_info *iter;
4671	struct fib6_node *fn;
4672
4673	fn = rcu_dereference_protected(rt->fib6_node,
4674			lockdep_is_held(&rt->fib6_table->tb6_lock));
4675	iter = rcu_dereference_protected(fn->leaf,
4676			lockdep_is_held(&rt->fib6_table->tb6_lock));
4677	while (iter) {
4678		if (iter->fib6_metric == rt->fib6_metric &&
4679		    rt6_qualify_for_ecmp(iter))
4680			return iter;
4681		iter = rcu_dereference_protected(iter->fib6_next,
4682				lockdep_is_held(&rt->fib6_table->tb6_lock));
4683	}
4684
4685	return NULL;
4686}
4687
4688/* only called for fib entries with builtin fib6_nh */
4689static bool rt6_is_dead(const struct fib6_info *rt)
4690{
4691	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
4692	    (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
4693	     ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
4694		return true;
4695
4696	return false;
4697}
4698
4699static int rt6_multipath_total_weight(const struct fib6_info *rt)
4700{
4701	struct fib6_info *iter;
4702	int total = 0;
4703
4704	if (!rt6_is_dead(rt))
4705		total += rt->fib6_nh->fib_nh_weight;
4706
4707	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
4708		if (!rt6_is_dead(iter))
4709			total += iter->fib6_nh->fib_nh_weight;
4710	}
4711
4712	return total;
4713}
4714
4715static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
4716{
4717	int upper_bound = -1;
4718
4719	if (!rt6_is_dead(rt)) {
4720		*weight += rt->fib6_nh->fib_nh_weight;
4721		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
4722						    total) - 1;
4723	}
4724	atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
4725}
4726
4727static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
4728{
4729	struct fib6_info *iter;
4730	int weight = 0;
4731
4732	rt6_upper_bound_set(rt, &weight, total);
4733
4734	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4735		rt6_upper_bound_set(iter, &weight, total);
4736}
4737
4738void rt6_multipath_rebalance(struct fib6_info *rt)
4739{
4740	struct fib6_info *first;
4741	int total;
4742
4743	/* In case the entire multipath route was marked for flushing,
4744	 * then there is no need to rebalance upon the removal of every
4745	 * sibling route.
4746	 */
4747	if (!rt->fib6_nsiblings || rt->should_flush)
4748		return;
4749
4750	/* During lookup routes are evaluated in order, so we need to
4751	 * make sure upper bounds are assigned from the first sibling
4752	 * onwards.
4753	 */
4754	first = rt6_multipath_first_sibling(rt);
4755	if (WARN_ON_ONCE(!first))
4756		return;
4757
4758	total = rt6_multipath_total_weight(first);
4759	rt6_multipath_upper_bound_set(first, total);
4760}
4761
4762static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4763{
4764	const struct arg_netdev_event *arg = p_arg;
4765	struct net *net = dev_net(arg->dev);
4766
4767	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
4768	    rt->fib6_nh->fib_nh_dev == arg->dev) {
4769		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
4770		fib6_update_sernum_upto_root(net, rt);
4771		rt6_multipath_rebalance(rt);
4772	}
4773
4774	return 0;
4775}
4776
4777void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4778{
4779	struct arg_netdev_event arg = {
4780		.dev = dev,
4781		{
4782			.nh_flags = nh_flags,
4783		},
4784	};
4785
4786	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4787		arg.nh_flags |= RTNH_F_LINKDOWN;
4788
4789	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4790}
4791
4792/* only called for fib entries with inline fib6_nh */
4793static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4794				   const struct net_device *dev)
4795{
4796	struct fib6_info *iter;
4797
4798	if (rt->fib6_nh->fib_nh_dev == dev)
4799		return true;
4800	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4801		if (iter->fib6_nh->fib_nh_dev == dev)
4802			return true;
4803
4804	return false;
4805}
4806
4807static void rt6_multipath_flush(struct fib6_info *rt)
4808{
4809	struct fib6_info *iter;
4810
4811	rt->should_flush = 1;
4812	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4813		iter->should_flush = 1;
4814}
4815
4816static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4817					     const struct net_device *down_dev)
4818{
4819	struct fib6_info *iter;
4820	unsigned int dead = 0;
4821
4822	if (rt->fib6_nh->fib_nh_dev == down_dev ||
4823	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4824		dead++;
4825	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4826		if (iter->fib6_nh->fib_nh_dev == down_dev ||
4827		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4828			dead++;
4829
4830	return dead;
4831}
4832
4833static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4834				       const struct net_device *dev,
4835				       unsigned char nh_flags)
4836{
4837	struct fib6_info *iter;
4838
4839	if (rt->fib6_nh->fib_nh_dev == dev)
4840		rt->fib6_nh->fib_nh_flags |= nh_flags;
4841	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4842		if (iter->fib6_nh->fib_nh_dev == dev)
4843			iter->fib6_nh->fib_nh_flags |= nh_flags;
4844}
4845
4846/* called with write lock held for table with rt */
4847static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4848{
4849	const struct arg_netdev_event *arg = p_arg;
4850	const struct net_device *dev = arg->dev;
4851	struct net *net = dev_net(dev);
4852
4853	if (rt == net->ipv6.fib6_null_entry || rt->nh)
4854		return 0;
4855
4856	switch (arg->event) {
4857	case NETDEV_UNREGISTER:
4858		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4859	case NETDEV_DOWN:
4860		if (rt->should_flush)
4861			return -1;
4862		if (!rt->fib6_nsiblings)
4863			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4864		if (rt6_multipath_uses_dev(rt, dev)) {
4865			unsigned int count;
4866
4867			count = rt6_multipath_dead_count(rt, dev);
4868			if (rt->fib6_nsiblings + 1 == count) {
4869				rt6_multipath_flush(rt);
4870				return -1;
4871			}
4872			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4873						   RTNH_F_LINKDOWN);
4874			fib6_update_sernum(net, rt);
4875			rt6_multipath_rebalance(rt);
4876		}
4877		return -2;
4878	case NETDEV_CHANGE:
4879		if (rt->fib6_nh->fib_nh_dev != dev ||
4880		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4881			break;
4882		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4883		rt6_multipath_rebalance(rt);
4884		break;
4885	}
4886
4887	return 0;
4888}
4889
4890void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4891{
4892	struct arg_netdev_event arg = {
4893		.dev = dev,
4894		{
4895			.event = event,
4896		},
4897	};
4898	struct net *net = dev_net(dev);
4899
4900	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4901		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4902	else
4903		fib6_clean_all(net, fib6_ifdown, &arg);
4904}
4905
4906void rt6_disable_ip(struct net_device *dev, unsigned long event)
4907{
4908	rt6_sync_down_dev(dev, event);
4909	rt6_uncached_list_flush_dev(dev);
4910	neigh_ifdown(&nd_tbl, dev);
4911}
4912
4913struct rt6_mtu_change_arg {
4914	struct net_device *dev;
4915	unsigned int mtu;
4916	struct fib6_info *f6i;
4917};
4918
4919static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4920{
4921	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4922	struct fib6_info *f6i = arg->f6i;
4923
4924	/* For administrative MTU increase, there is no way to discover
4925	 * IPv6 PMTU increase, so PMTU increase should be updated here.
4926	 * Since RFC 1981 doesn't include administrative MTU increase
4927	 * update PMTU increase is a MUST. (i.e. jumbo frame)
4928	 */
4929	if (nh->fib_nh_dev == arg->dev) {
4930		struct inet6_dev *idev = __in6_dev_get(arg->dev);
4931		u32 mtu = f6i->fib6_pmtu;
4932
4933		if (mtu >= arg->mtu ||
4934		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4935			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4936
4937		spin_lock_bh(&rt6_exception_lock);
4938		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4939		spin_unlock_bh(&rt6_exception_lock);
4940	}
4941
4942	return 0;
4943}
4944
4945static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4946{
4947	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4948	struct inet6_dev *idev;
4949
4950	/* In IPv6 pmtu discovery is not optional,
4951	   so that RTAX_MTU lock cannot disable it.
4952	   We still use this lock to block changes
4953	   caused by addrconf/ndisc.
4954	*/
4955
4956	idev = __in6_dev_get(arg->dev);
4957	if (!idev)
4958		return 0;
4959
4960	if (fib6_metric_locked(f6i, RTAX_MTU))
4961		return 0;
4962
4963	arg->f6i = f6i;
4964	if (f6i->nh) {
4965		/* fib6_nh_mtu_change only returns 0, so this is safe */
4966		return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
4967						arg);
4968	}
4969
4970	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
4971}
4972
4973void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4974{
4975	struct rt6_mtu_change_arg arg = {
4976		.dev = dev,
4977		.mtu = mtu,
4978	};
4979
4980	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4981}
4982
4983static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4984	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
4985	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4986	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4987	[RTA_OIF]               = { .type = NLA_U32 },
4988	[RTA_IIF]		= { .type = NLA_U32 },
4989	[RTA_PRIORITY]          = { .type = NLA_U32 },
4990	[RTA_METRICS]           = { .type = NLA_NESTED },
4991	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4992	[RTA_PREF]              = { .type = NLA_U8 },
4993	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4994	[RTA_ENCAP]		= { .type = NLA_NESTED },
4995	[RTA_EXPIRES]		= { .type = NLA_U32 },
4996	[RTA_UID]		= { .type = NLA_U32 },
4997	[RTA_MARK]		= { .type = NLA_U32 },
4998	[RTA_TABLE]		= { .type = NLA_U32 },
4999	[RTA_IP_PROTO]		= { .type = NLA_U8 },
5000	[RTA_SPORT]		= { .type = NLA_U16 },
5001	[RTA_DPORT]		= { .type = NLA_U16 },
5002	[RTA_NH_ID]		= { .type = NLA_U32 },
5003};
5004
5005static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
5006			      struct fib6_config *cfg,
5007			      struct netlink_ext_ack *extack)
5008{
5009	struct rtmsg *rtm;
5010	struct nlattr *tb[RTA_MAX+1];
5011	unsigned int pref;
5012	int err;
5013
5014	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5015				     rtm_ipv6_policy, extack);
5016	if (err < 0)
5017		goto errout;
5018
5019	err = -EINVAL;
5020	rtm = nlmsg_data(nlh);
5021
5022	if (rtm->rtm_tos) {
5023		NL_SET_ERR_MSG(extack,
5024			       "Invalid dsfield (tos): option not available for IPv6");
5025		goto errout;
5026	}
5027
5028	*cfg = (struct fib6_config){
5029		.fc_table = rtm->rtm_table,
5030		.fc_dst_len = rtm->rtm_dst_len,
5031		.fc_src_len = rtm->rtm_src_len,
5032		.fc_flags = RTF_UP,
5033		.fc_protocol = rtm->rtm_protocol,
5034		.fc_type = rtm->rtm_type,
5035
5036		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
5037		.fc_nlinfo.nlh = nlh,
5038		.fc_nlinfo.nl_net = sock_net(skb->sk),
5039	};
5040
5041	if (rtm->rtm_type == RTN_UNREACHABLE ||
5042	    rtm->rtm_type == RTN_BLACKHOLE ||
5043	    rtm->rtm_type == RTN_PROHIBIT ||
5044	    rtm->rtm_type == RTN_THROW)
5045		cfg->fc_flags |= RTF_REJECT;
5046
5047	if (rtm->rtm_type == RTN_LOCAL)
5048		cfg->fc_flags |= RTF_LOCAL;
5049
5050	if (rtm->rtm_flags & RTM_F_CLONED)
5051		cfg->fc_flags |= RTF_CACHE;
5052
5053	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
5054
5055	if (tb[RTA_NH_ID]) {
5056		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
5057		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
5058			NL_SET_ERR_MSG(extack,
5059				       "Nexthop specification and nexthop id are mutually exclusive");
5060			goto errout;
5061		}
5062		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
5063	}
5064
5065	if (tb[RTA_GATEWAY]) {
5066		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
5067		cfg->fc_flags |= RTF_GATEWAY;
5068	}
5069	if (tb[RTA_VIA]) {
5070		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
5071		goto errout;
5072	}
5073
5074	if (tb[RTA_DST]) {
5075		int plen = (rtm->rtm_dst_len + 7) >> 3;
5076
5077		if (nla_len(tb[RTA_DST]) < plen)
5078			goto errout;
5079
5080		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
5081	}
5082
5083	if (tb[RTA_SRC]) {
5084		int plen = (rtm->rtm_src_len + 7) >> 3;
5085
5086		if (nla_len(tb[RTA_SRC]) < plen)
5087			goto errout;
5088
5089		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
5090	}
5091
5092	if (tb[RTA_PREFSRC])
5093		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
5094
5095	if (tb[RTA_OIF])
5096		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
5097
5098	if (tb[RTA_PRIORITY])
5099		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
5100
5101	if (tb[RTA_METRICS]) {
5102		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
5103		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
5104	}
5105
5106	if (tb[RTA_TABLE])
5107		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
5108
5109	if (tb[RTA_MULTIPATH]) {
5110		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
5111		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
5112
5113		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
5114						     cfg->fc_mp_len, extack);
5115		if (err < 0)
5116			goto errout;
5117	}
5118
5119	if (tb[RTA_PREF]) {
5120		pref = nla_get_u8(tb[RTA_PREF]);
5121		if (pref != ICMPV6_ROUTER_PREF_LOW &&
5122		    pref != ICMPV6_ROUTER_PREF_HIGH)
5123			pref = ICMPV6_ROUTER_PREF_MEDIUM;
5124		cfg->fc_flags |= RTF_PREF(pref);
5125	}
5126
5127	if (tb[RTA_ENCAP])
5128		cfg->fc_encap = tb[RTA_ENCAP];
5129
5130	if (tb[RTA_ENCAP_TYPE]) {
5131		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
5132
5133		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
5134		if (err < 0)
5135			goto errout;
5136	}
5137
5138	if (tb[RTA_EXPIRES]) {
5139		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
5140
5141		if (addrconf_finite_timeout(timeout)) {
5142			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
5143			cfg->fc_flags |= RTF_EXPIRES;
5144		}
5145	}
5146
5147	err = 0;
5148errout:
5149	return err;
5150}
5151
5152struct rt6_nh {
5153	struct fib6_info *fib6_info;
5154	struct fib6_config r_cfg;
5155	struct list_head next;
5156};
5157
5158static int ip6_route_info_append(struct net *net,
5159				 struct list_head *rt6_nh_list,
5160				 struct fib6_info *rt,
5161				 struct fib6_config *r_cfg)
5162{
5163	struct rt6_nh *nh;
5164	int err = -EEXIST;
5165
5166	list_for_each_entry(nh, rt6_nh_list, next) {
5167		/* check if fib6_info already exists */
5168		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
5169			return err;
5170	}
5171
5172	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
5173	if (!nh)
5174		return -ENOMEM;
5175	nh->fib6_info = rt;
5176	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
5177	list_add_tail(&nh->next, rt6_nh_list);
5178
5179	return 0;
5180}
5181
5182static void ip6_route_mpath_notify(struct fib6_info *rt,
5183				   struct fib6_info *rt_last,
5184				   struct nl_info *info,
5185				   __u16 nlflags)
5186{
5187	/* if this is an APPEND route, then rt points to the first route
5188	 * inserted and rt_last points to last route inserted. Userspace
5189	 * wants a consistent dump of the route which starts at the first
5190	 * nexthop. Since sibling routes are always added at the end of
5191	 * the list, find the first sibling of the last route appended
5192	 */
5193	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
5194		rt = list_first_entry(&rt_last->fib6_siblings,
5195				      struct fib6_info,
5196				      fib6_siblings);
5197	}
5198
5199	if (rt)
5200		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
5201}
5202
5203static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
5204{
5205	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
5206	bool should_notify = false;
5207	struct fib6_info *leaf;
5208	struct fib6_node *fn;
5209
5210	rcu_read_lock();
5211	fn = rcu_dereference(rt->fib6_node);
5212	if (!fn)
5213		goto out;
5214
5215	leaf = rcu_dereference(fn->leaf);
5216	if (!leaf)
5217		goto out;
5218
5219	if (rt == leaf ||
5220	    (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
5221	     rt6_qualify_for_ecmp(leaf)))
5222		should_notify = true;
5223out:
5224	rcu_read_unlock();
5225
5226	return should_notify;
5227}
5228
5229static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
5230			     struct netlink_ext_ack *extack)
5231{
5232	if (nla_len(nla) < sizeof(*gw)) {
5233		NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
5234		return -EINVAL;
5235	}
5236
5237	*gw = nla_get_in6_addr(nla);
5238
5239	return 0;
5240}
5241
5242static int ip6_route_multipath_add(struct fib6_config *cfg,
5243				   struct netlink_ext_ack *extack)
5244{
5245	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
5246	struct nl_info *info = &cfg->fc_nlinfo;
5247	struct fib6_config r_cfg;
5248	struct rtnexthop *rtnh;
5249	struct fib6_info *rt;
5250	struct rt6_nh *err_nh;
5251	struct rt6_nh *nh, *nh_safe;
5252	__u16 nlflags;
5253	int remaining;
5254	int attrlen;
5255	int err = 1;
5256	int nhn = 0;
5257	int replace = (cfg->fc_nlinfo.nlh &&
5258		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
5259	LIST_HEAD(rt6_nh_list);
5260
5261	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
5262	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
5263		nlflags |= NLM_F_APPEND;
5264
5265	remaining = cfg->fc_mp_len;
5266	rtnh = (struct rtnexthop *)cfg->fc_mp;
5267
5268	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
5269	 * fib6_info structs per nexthop
5270	 */
5271	while (rtnh_ok(rtnh, remaining)) {
5272		memcpy(&r_cfg, cfg, sizeof(*cfg));
5273		if (rtnh->rtnh_ifindex)
5274			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5275
5276		attrlen = rtnh_attrlen(rtnh);
5277		if (attrlen > 0) {
5278			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5279
5280			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5281			if (nla) {
5282				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
5283							extack);
5284				if (err)
5285					goto cleanup;
5286
5287				r_cfg.fc_flags |= RTF_GATEWAY;
5288			}
5289			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
5290
5291			/* RTA_ENCAP_TYPE length checked in
5292			 * lwtunnel_valid_encap_type_attr
5293			 */
5294			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
5295			if (nla)
5296				r_cfg.fc_encap_type = nla_get_u16(nla);
5297		}
5298
5299		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
5300		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
5301		if (IS_ERR(rt)) {
5302			err = PTR_ERR(rt);
5303			rt = NULL;
5304			goto cleanup;
5305		}
5306		if (!rt6_qualify_for_ecmp(rt)) {
5307			err = -EINVAL;
5308			NL_SET_ERR_MSG(extack,
5309				       "Device only routes can not be added for IPv6 using the multipath API.");
5310			fib6_info_release(rt);
5311			goto cleanup;
5312		}
5313
5314		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
5315
5316		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
5317					    rt, &r_cfg);
5318		if (err) {
5319			fib6_info_release(rt);
5320			goto cleanup;
5321		}
5322
5323		rtnh = rtnh_next(rtnh, &remaining);
5324	}
5325
5326	if (list_empty(&rt6_nh_list)) {
5327		NL_SET_ERR_MSG(extack,
5328			       "Invalid nexthop configuration - no valid nexthops");
5329		return -EINVAL;
5330	}
5331
5332	/* for add and replace send one notification with all nexthops.
5333	 * Skip the notification in fib6_add_rt2node and send one with
5334	 * the full route when done
5335	 */
5336	info->skip_notify = 1;
5337
5338	/* For add and replace, send one notification with all nexthops. For
5339	 * append, send one notification with all appended nexthops.
5340	 */
5341	info->skip_notify_kernel = 1;
5342
5343	err_nh = NULL;
5344	list_for_each_entry(nh, &rt6_nh_list, next) {
5345		err = __ip6_ins_rt(nh->fib6_info, info, extack);
5346
5347		if (err) {
5348			if (replace && nhn)
5349				NL_SET_ERR_MSG_MOD(extack,
5350						   "multipath route replace failed (check consistency of installed routes)");
5351			err_nh = nh;
5352			goto add_errout;
5353		}
5354		/* save reference to last route successfully inserted */
5355		rt_last = nh->fib6_info;
5356
5357		/* save reference to first route for notification */
5358		if (!rt_notif)
5359			rt_notif = nh->fib6_info;
5360
5361		/* Because each route is added like a single route we remove
5362		 * these flags after the first nexthop: if there is a collision,
5363		 * we have already failed to add the first nexthop:
5364		 * fib6_add_rt2node() has rejected it; when replacing, old
5365		 * nexthops have been replaced by first new, the rest should
5366		 * be added to it.
5367		 */
5368		if (cfg->fc_nlinfo.nlh) {
5369			cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
5370							     NLM_F_REPLACE);
5371			cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
5372		}
5373		nhn++;
5374	}
5375
5376	/* An in-kernel notification should only be sent in case the new
5377	 * multipath route is added as the first route in the node, or if
5378	 * it was appended to it. We pass 'rt_notif' since it is the first
5379	 * sibling and might allow us to skip some checks in the replace case.
5380	 */
5381	if (ip6_route_mpath_should_notify(rt_notif)) {
5382		enum fib_event_type fib_event;
5383
5384		if (rt_notif->fib6_nsiblings != nhn - 1)
5385			fib_event = FIB_EVENT_ENTRY_APPEND;
5386		else
5387			fib_event = FIB_EVENT_ENTRY_REPLACE;
5388
5389		err = call_fib6_multipath_entry_notifiers(info->nl_net,
5390							  fib_event, rt_notif,
5391							  nhn - 1, extack);
5392		if (err) {
5393			/* Delete all the siblings that were just added */
5394			err_nh = NULL;
5395			goto add_errout;
5396		}
5397	}
5398
5399	/* success ... tell user about new route */
5400	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5401	goto cleanup;
5402
5403add_errout:
5404	/* send notification for routes that were added so that
5405	 * the delete notifications sent by ip6_route_del are
5406	 * coherent
5407	 */
5408	if (rt_notif)
5409		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5410
5411	/* Delete routes that were already added */
5412	list_for_each_entry(nh, &rt6_nh_list, next) {
5413		if (err_nh == nh)
5414			break;
5415		ip6_route_del(&nh->r_cfg, extack);
5416	}
5417
5418cleanup:
5419	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
5420		fib6_info_release(nh->fib6_info);
5421		list_del(&nh->next);
5422		kfree(nh);
5423	}
5424
5425	return err;
5426}
5427
5428static int ip6_route_multipath_del(struct fib6_config *cfg,
5429				   struct netlink_ext_ack *extack)
5430{
5431	struct fib6_config r_cfg;
5432	struct rtnexthop *rtnh;
5433	int last_err = 0;
5434	int remaining;
5435	int attrlen;
5436	int err;
5437
5438	remaining = cfg->fc_mp_len;
5439	rtnh = (struct rtnexthop *)cfg->fc_mp;
5440
5441	/* Parse a Multipath Entry */
5442	while (rtnh_ok(rtnh, remaining)) {
5443		memcpy(&r_cfg, cfg, sizeof(*cfg));
5444		if (rtnh->rtnh_ifindex)
5445			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5446
5447		attrlen = rtnh_attrlen(rtnh);
5448		if (attrlen > 0) {
5449			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5450
5451			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5452			if (nla) {
5453				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
5454							extack);
5455				if (err) {
5456					last_err = err;
5457					goto next_rtnh;
5458				}
5459
5460				r_cfg.fc_flags |= RTF_GATEWAY;
5461			}
5462		}
5463		err = ip6_route_del(&r_cfg, extack);
5464		if (err)
5465			last_err = err;
5466
5467next_rtnh:
5468		rtnh = rtnh_next(rtnh, &remaining);
5469	}
5470
5471	return last_err;
5472}
5473
5474static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5475			      struct netlink_ext_ack *extack)
5476{
5477	struct fib6_config cfg;
5478	int err;
5479
5480	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5481	if (err < 0)
5482		return err;
5483
5484	if (cfg.fc_nh_id &&
5485	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
5486		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
5487		return -EINVAL;
5488	}
5489
5490	if (cfg.fc_mp)
5491		return ip6_route_multipath_del(&cfg, extack);
5492	else {
5493		cfg.fc_delete_all_nh = 1;
5494		return ip6_route_del(&cfg, extack);
5495	}
5496}
5497
5498static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5499			      struct netlink_ext_ack *extack)
5500{
5501	struct fib6_config cfg;
5502	int err;
5503
5504	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5505	if (err < 0)
5506		return err;
5507
5508	if (cfg.fc_metric == 0)
5509		cfg.fc_metric = IP6_RT_PRIO_USER;
5510
5511	if (cfg.fc_mp)
5512		return ip6_route_multipath_add(&cfg, extack);
5513	else
5514		return ip6_route_add(&cfg, GFP_KERNEL, extack);
5515}
5516
5517/* add the overhead of this fib6_nh to nexthop_len */
5518static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
5519{
5520	int *nexthop_len = arg;
5521
5522	*nexthop_len += nla_total_size(0)	 /* RTA_MULTIPATH */
5523		     + NLA_ALIGN(sizeof(struct rtnexthop))
5524		     + nla_total_size(16); /* RTA_GATEWAY */
5525
5526	if (nh->fib_nh_lws) {
5527		/* RTA_ENCAP_TYPE */
5528		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5529		/* RTA_ENCAP */
5530		*nexthop_len += nla_total_size(2);
5531	}
5532
5533	return 0;
5534}
5535
5536static size_t rt6_nlmsg_size(struct fib6_info *f6i)
5537{
5538	int nexthop_len;
5539
5540	if (f6i->nh) {
5541		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
5542		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
5543					 &nexthop_len);
5544	} else {
5545		struct fib6_info *sibling, *next_sibling;
5546		struct fib6_nh *nh = f6i->fib6_nh;
5547
5548		nexthop_len = 0;
5549		if (f6i->fib6_nsiblings) {
5550			rt6_nh_nlmsg_size(nh, &nexthop_len);
5551
5552			list_for_each_entry_safe(sibling, next_sibling,
5553						 &f6i->fib6_siblings, fib6_siblings) {
5554				rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
5555			}
5556		}
5557		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5558	}
5559
5560	return NLMSG_ALIGN(sizeof(struct rtmsg))
5561	       + nla_total_size(16) /* RTA_SRC */
5562	       + nla_total_size(16) /* RTA_DST */
5563	       + nla_total_size(16) /* RTA_GATEWAY */
5564	       + nla_total_size(16) /* RTA_PREFSRC */
5565	       + nla_total_size(4) /* RTA_TABLE */
5566	       + nla_total_size(4) /* RTA_IIF */
5567	       + nla_total_size(4) /* RTA_OIF */
5568	       + nla_total_size(4) /* RTA_PRIORITY */
5569	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
5570	       + nla_total_size(sizeof(struct rta_cacheinfo))
5571	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
5572	       + nla_total_size(1) /* RTA_PREF */
5573	       + nexthop_len;
5574}
5575
5576static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
5577				 unsigned char *flags)
5578{
5579	if (nexthop_is_multipath(nh)) {
5580		struct nlattr *mp;
5581
5582		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5583		if (!mp)
5584			goto nla_put_failure;
5585
5586		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
5587			goto nla_put_failure;
5588
5589		nla_nest_end(skb, mp);
5590	} else {
5591		struct fib6_nh *fib6_nh;
5592
5593		fib6_nh = nexthop_fib6_nh(nh);
5594		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
5595				     flags, false) < 0)
5596			goto nla_put_failure;
5597	}
5598
5599	return 0;
5600
5601nla_put_failure:
5602	return -EMSGSIZE;
5603}
5604
5605static int rt6_fill_node(struct net *net, struct sk_buff *skb,
5606			 struct fib6_info *rt, struct dst_entry *dst,
5607			 struct in6_addr *dest, struct in6_addr *src,
5608			 int iif, int type, u32 portid, u32 seq,
5609			 unsigned int flags)
5610{
5611	struct rt6_info *rt6 = (struct rt6_info *)dst;
5612	struct rt6key *rt6_dst, *rt6_src;
5613	u32 *pmetrics, table, rt6_flags;
5614	unsigned char nh_flags = 0;
5615	struct nlmsghdr *nlh;
5616	struct rtmsg *rtm;
5617	long expires = 0;
5618
5619	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
5620	if (!nlh)
5621		return -EMSGSIZE;
5622
5623	if (rt6) {
5624		rt6_dst = &rt6->rt6i_dst;
5625		rt6_src = &rt6->rt6i_src;
5626		rt6_flags = rt6->rt6i_flags;
5627	} else {
5628		rt6_dst = &rt->fib6_dst;
5629		rt6_src = &rt->fib6_src;
5630		rt6_flags = rt->fib6_flags;
5631	}
5632
5633	rtm = nlmsg_data(nlh);
5634	rtm->rtm_family = AF_INET6;
5635	rtm->rtm_dst_len = rt6_dst->plen;
5636	rtm->rtm_src_len = rt6_src->plen;
5637	rtm->rtm_tos = 0;
5638	if (rt->fib6_table)
5639		table = rt->fib6_table->tb6_id;
5640	else
5641		table = RT6_TABLE_UNSPEC;
5642	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
5643	if (nla_put_u32(skb, RTA_TABLE, table))
5644		goto nla_put_failure;
5645
5646	rtm->rtm_type = rt->fib6_type;
5647	rtm->rtm_flags = 0;
5648	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
5649	rtm->rtm_protocol = rt->fib6_protocol;
5650
5651	if (rt6_flags & RTF_CACHE)
5652		rtm->rtm_flags |= RTM_F_CLONED;
5653
5654	if (dest) {
5655		if (nla_put_in6_addr(skb, RTA_DST, dest))
5656			goto nla_put_failure;
5657		rtm->rtm_dst_len = 128;
5658	} else if (rtm->rtm_dst_len)
5659		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
5660			goto nla_put_failure;
5661#ifdef CONFIG_IPV6_SUBTREES
5662	if (src) {
5663		if (nla_put_in6_addr(skb, RTA_SRC, src))
5664			goto nla_put_failure;
5665		rtm->rtm_src_len = 128;
5666	} else if (rtm->rtm_src_len &&
5667		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
5668		goto nla_put_failure;
5669#endif
5670	if (iif) {
5671#ifdef CONFIG_IPV6_MROUTE
5672		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
5673			int err = ip6mr_get_route(net, skb, rtm, portid);
5674
5675			if (err == 0)
5676				return 0;
5677			if (err < 0)
5678				goto nla_put_failure;
5679		} else
5680#endif
5681			if (nla_put_u32(skb, RTA_IIF, iif))
5682				goto nla_put_failure;
5683	} else if (dest) {
5684		struct in6_addr saddr_buf;
5685		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
5686		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
5687			goto nla_put_failure;
5688	}
5689
5690	if (rt->fib6_prefsrc.plen) {
5691		struct in6_addr saddr_buf;
5692		saddr_buf = rt->fib6_prefsrc.addr;
5693		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
5694			goto nla_put_failure;
5695	}
5696
5697	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
5698	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
5699		goto nla_put_failure;
5700
5701	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
5702		goto nla_put_failure;
5703
5704	/* For multipath routes, walk the siblings list and add
5705	 * each as a nexthop within RTA_MULTIPATH.
5706	 */
5707	if (rt6) {
5708		if (rt6_flags & RTF_GATEWAY &&
5709		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
5710			goto nla_put_failure;
5711
5712		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
5713			goto nla_put_failure;
5714
5715		if (dst->lwtstate &&
5716		    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
5717			goto nla_put_failure;
5718	} else if (rt->fib6_nsiblings) {
5719		struct fib6_info *sibling, *next_sibling;
5720		struct nlattr *mp;
5721
5722		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5723		if (!mp)
5724			goto nla_put_failure;
5725
5726		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
5727				    rt->fib6_nh->fib_nh_weight, AF_INET6,
5728				    0) < 0)
5729			goto nla_put_failure;
5730
5731		list_for_each_entry_safe(sibling, next_sibling,
5732					 &rt->fib6_siblings, fib6_siblings) {
5733			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
5734					    sibling->fib6_nh->fib_nh_weight,
5735					    AF_INET6, 0) < 0)
5736				goto nla_put_failure;
5737		}
5738
5739		nla_nest_end(skb, mp);
5740	} else if (rt->nh) {
5741		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
5742			goto nla_put_failure;
5743
5744		if (nexthop_is_blackhole(rt->nh))
5745			rtm->rtm_type = RTN_BLACKHOLE;
5746
5747		if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
5748		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
5749			goto nla_put_failure;
5750
5751		rtm->rtm_flags |= nh_flags;
5752	} else {
5753		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
5754				     &nh_flags, false) < 0)
5755			goto nla_put_failure;
5756
5757		rtm->rtm_flags |= nh_flags;
5758	}
5759
5760	if (rt6_flags & RTF_EXPIRES) {
5761		expires = dst ? dst->expires : rt->expires;
5762		expires -= jiffies;
5763	}
5764
5765	if (!dst) {
5766		if (READ_ONCE(rt->offload))
5767			rtm->rtm_flags |= RTM_F_OFFLOAD;
5768		if (READ_ONCE(rt->trap))
5769			rtm->rtm_flags |= RTM_F_TRAP;
5770		if (READ_ONCE(rt->offload_failed))
5771			rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
5772	}
5773
5774	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
5775		goto nla_put_failure;
5776
5777	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
5778		goto nla_put_failure;
5779
5780
5781	nlmsg_end(skb, nlh);
5782	return 0;
5783
5784nla_put_failure:
5785	nlmsg_cancel(skb, nlh);
5786	return -EMSGSIZE;
5787}
5788
5789static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
5790{
5791	const struct net_device *dev = arg;
5792
5793	if (nh->fib_nh_dev == dev)
5794		return 1;
5795
5796	return 0;
5797}
5798
5799static bool fib6_info_uses_dev(const struct fib6_info *f6i,
5800			       const struct net_device *dev)
5801{
5802	if (f6i->nh) {
5803		struct net_device *_dev = (struct net_device *)dev;
5804
5805		return !!nexthop_for_each_fib6_nh(f6i->nh,
5806						  fib6_info_nh_uses_dev,
5807						  _dev);
5808	}
5809
5810	if (f6i->fib6_nh->fib_nh_dev == dev)
5811		return true;
5812
5813	if (f6i->fib6_nsiblings) {
5814		struct fib6_info *sibling, *next_sibling;
5815
5816		list_for_each_entry_safe(sibling, next_sibling,
5817					 &f6i->fib6_siblings, fib6_siblings) {
5818			if (sibling->fib6_nh->fib_nh_dev == dev)
5819				return true;
5820		}
5821	}
5822
5823	return false;
5824}
5825
5826struct fib6_nh_exception_dump_walker {
5827	struct rt6_rtnl_dump_arg *dump;
5828	struct fib6_info *rt;
5829	unsigned int flags;
5830	unsigned int skip;
5831	unsigned int count;
5832};
5833
5834static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
5835{
5836	struct fib6_nh_exception_dump_walker *w = arg;
5837	struct rt6_rtnl_dump_arg *dump = w->dump;
5838	struct rt6_exception_bucket *bucket;
5839	struct rt6_exception *rt6_ex;
5840	int i, err;
5841
5842	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
5843	if (!bucket)
5844		return 0;
5845
5846	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
5847		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
5848			if (w->skip) {
5849				w->skip--;
5850				continue;
5851			}
5852
5853			/* Expiration of entries doesn't bump sernum, insertion
5854			 * does. Removal is triggered by insertion, so we can
5855			 * rely on the fact that if entries change between two
5856			 * partial dumps, this node is scanned again completely,
5857			 * see rt6_insert_exception() and fib6_dump_table().
5858			 *
5859			 * Count expired entries we go through as handled
5860			 * entries that we'll skip next time, in case of partial
5861			 * node dump. Otherwise, if entries expire meanwhile,
5862			 * we'll skip the wrong amount.
5863			 */
5864			if (rt6_check_expired(rt6_ex->rt6i)) {
5865				w->count++;
5866				continue;
5867			}
5868
5869			err = rt6_fill_node(dump->net, dump->skb, w->rt,
5870					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
5871					    RTM_NEWROUTE,
5872					    NETLINK_CB(dump->cb->skb).portid,
5873					    dump->cb->nlh->nlmsg_seq, w->flags);
5874			if (err)
5875				return err;
5876
5877			w->count++;
5878		}
5879		bucket++;
5880	}
5881
5882	return 0;
5883}
5884
5885/* Return -1 if done with node, number of handled routes on partial dump */
5886int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
5887{
5888	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
5889	struct fib_dump_filter *filter = &arg->filter;
5890	unsigned int flags = NLM_F_MULTI;
5891	struct net *net = arg->net;
5892	int count = 0;
5893
5894	if (rt == net->ipv6.fib6_null_entry)
5895		return -1;
5896
5897	if ((filter->flags & RTM_F_PREFIX) &&
5898	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
5899		/* success since this is not a prefix route */
5900		return -1;
5901	}
5902	if (filter->filter_set &&
5903	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
5904	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
5905	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
5906		return -1;
5907	}
5908
5909	if (filter->filter_set ||
5910	    !filter->dump_routes || !filter->dump_exceptions) {
5911		flags |= NLM_F_DUMP_FILTERED;
5912	}
5913
5914	if (filter->dump_routes) {
5915		if (skip) {
5916			skip--;
5917		} else {
5918			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
5919					  0, RTM_NEWROUTE,
5920					  NETLINK_CB(arg->cb->skb).portid,
5921					  arg->cb->nlh->nlmsg_seq, flags)) {
5922				return 0;
5923			}
5924			count++;
5925		}
5926	}
5927
5928	if (filter->dump_exceptions) {
5929		struct fib6_nh_exception_dump_walker w = { .dump = arg,
5930							   .rt = rt,
5931							   .flags = flags,
5932							   .skip = skip,
5933							   .count = 0 };
5934		int err;
5935
5936		rcu_read_lock();
5937		if (rt->nh) {
5938			err = nexthop_for_each_fib6_nh(rt->nh,
5939						       rt6_nh_dump_exceptions,
5940						       &w);
5941		} else {
5942			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
5943		}
5944		rcu_read_unlock();
5945
5946		if (err)
5947			return count + w.count;
5948	}
5949
5950	return -1;
5951}
5952
5953static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
5954					const struct nlmsghdr *nlh,
5955					struct nlattr **tb,
5956					struct netlink_ext_ack *extack)
5957{
5958	struct rtmsg *rtm;
5959	int i, err;
5960
5961	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
5962		NL_SET_ERR_MSG_MOD(extack,
5963				   "Invalid header for get route request");
5964		return -EINVAL;
5965	}
5966
5967	if (!netlink_strict_get_check(skb))
5968		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5969					      rtm_ipv6_policy, extack);
5970
5971	rtm = nlmsg_data(nlh);
5972	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
5973	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
5974	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
5975	    rtm->rtm_type) {
5976		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
5977		return -EINVAL;
5978	}
5979	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
5980		NL_SET_ERR_MSG_MOD(extack,
5981				   "Invalid flags for get route request");
5982		return -EINVAL;
5983	}
5984
5985	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
5986					    rtm_ipv6_policy, extack);
5987	if (err)
5988		return err;
5989
5990	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
5991	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
5992		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
5993		return -EINVAL;
5994	}
5995
5996	for (i = 0; i <= RTA_MAX; i++) {
5997		if (!tb[i])
5998			continue;
5999
6000		switch (i) {
6001		case RTA_SRC:
6002		case RTA_DST:
6003		case RTA_IIF:
6004		case RTA_OIF:
6005		case RTA_MARK:
6006		case RTA_UID:
6007		case RTA_SPORT:
6008		case RTA_DPORT:
6009		case RTA_IP_PROTO:
6010			break;
6011		default:
6012			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
6013			return -EINVAL;
6014		}
6015	}
6016
6017	return 0;
6018}
6019
6020static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6021			      struct netlink_ext_ack *extack)
6022{
6023	struct net *net = sock_net(in_skb->sk);
6024	struct nlattr *tb[RTA_MAX+1];
6025	int err, iif = 0, oif = 0;
6026	struct fib6_info *from;
6027	struct dst_entry *dst;
6028	struct rt6_info *rt;
6029	struct sk_buff *skb;
6030	struct rtmsg *rtm;
6031	struct flowi6 fl6 = {};
6032	bool fibmatch;
6033
6034	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
6035	if (err < 0)
6036		goto errout;
6037
6038	err = -EINVAL;
6039	rtm = nlmsg_data(nlh);
6040	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
6041	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
6042
6043	if (tb[RTA_SRC]) {
6044		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
6045			goto errout;
6046
6047		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
6048	}
6049
6050	if (tb[RTA_DST]) {
6051		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
6052			goto errout;
6053
6054		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
6055	}
6056
6057	if (tb[RTA_IIF])
6058		iif = nla_get_u32(tb[RTA_IIF]);
6059
6060	if (tb[RTA_OIF])
6061		oif = nla_get_u32(tb[RTA_OIF]);
6062
6063	if (tb[RTA_MARK])
6064		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
6065
6066	if (tb[RTA_UID])
6067		fl6.flowi6_uid = make_kuid(current_user_ns(),
6068					   nla_get_u32(tb[RTA_UID]));
6069	else
6070		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
6071
6072	if (tb[RTA_SPORT])
6073		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
6074
6075	if (tb[RTA_DPORT])
6076		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
6077
6078	if (tb[RTA_IP_PROTO]) {
6079		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
6080						  &fl6.flowi6_proto, AF_INET6,
6081						  extack);
6082		if (err)
6083			goto errout;
6084	}
6085
6086	if (iif) {
6087		struct net_device *dev;
6088		int flags = 0;
6089
6090		rcu_read_lock();
6091
6092		dev = dev_get_by_index_rcu(net, iif);
6093		if (!dev) {
6094			rcu_read_unlock();
6095			err = -ENODEV;
6096			goto errout;
6097		}
6098
6099		fl6.flowi6_iif = iif;
6100
6101		if (!ipv6_addr_any(&fl6.saddr))
6102			flags |= RT6_LOOKUP_F_HAS_SADDR;
6103
6104		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
6105
6106		rcu_read_unlock();
6107	} else {
6108		fl6.flowi6_oif = oif;
6109
6110		dst = ip6_route_output(net, NULL, &fl6);
6111	}
6112
6113
6114	rt = container_of(dst, struct rt6_info, dst);
6115	if (rt->dst.error) {
6116		err = rt->dst.error;
6117		ip6_rt_put(rt);
6118		goto errout;
6119	}
6120
6121	if (rt == net->ipv6.ip6_null_entry) {
6122		err = rt->dst.error;
6123		ip6_rt_put(rt);
6124		goto errout;
6125	}
6126
6127	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
6128	if (!skb) {
6129		ip6_rt_put(rt);
6130		err = -ENOBUFS;
6131		goto errout;
6132	}
6133
6134	skb_dst_set(skb, &rt->dst);
6135
6136	rcu_read_lock();
6137	from = rcu_dereference(rt->from);
6138	if (from) {
6139		if (fibmatch)
6140			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
6141					    iif, RTM_NEWROUTE,
6142					    NETLINK_CB(in_skb).portid,
6143					    nlh->nlmsg_seq, 0);
6144		else
6145			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
6146					    &fl6.saddr, iif, RTM_NEWROUTE,
6147					    NETLINK_CB(in_skb).portid,
6148					    nlh->nlmsg_seq, 0);
6149	} else {
6150		err = -ENETUNREACH;
6151	}
6152	rcu_read_unlock();
6153
6154	if (err < 0) {
6155		kfree_skb(skb);
6156		goto errout;
6157	}
6158
6159	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
6160errout:
6161	return err;
6162}
6163
6164void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
6165		     unsigned int nlm_flags)
6166{
6167	struct sk_buff *skb;
6168	struct net *net = info->nl_net;
6169	u32 seq;
6170	int err;
6171
6172	err = -ENOBUFS;
6173	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
6174
6175	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6176	if (!skb)
6177		goto errout;
6178
6179	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6180			    event, info->portid, seq, nlm_flags);
6181	if (err < 0) {
6182		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6183		WARN_ON(err == -EMSGSIZE);
6184		kfree_skb(skb);
6185		goto errout;
6186	}
6187	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6188		    info->nlh, gfp_any());
6189	return;
6190errout:
6191	if (err < 0)
6192		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6193}
6194
6195void fib6_rt_update(struct net *net, struct fib6_info *rt,
6196		    struct nl_info *info)
6197{
6198	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
6199	struct sk_buff *skb;
6200	int err = -ENOBUFS;
6201
6202	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6203	if (!skb)
6204		goto errout;
6205
6206	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6207			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
6208	if (err < 0) {
6209		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6210		WARN_ON(err == -EMSGSIZE);
6211		kfree_skb(skb);
6212		goto errout;
6213	}
6214	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6215		    info->nlh, gfp_any());
6216	return;
6217errout:
6218	if (err < 0)
6219		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6220}
6221
6222void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
6223			    bool offload, bool trap, bool offload_failed)
6224{
6225	struct sk_buff *skb;
6226	int err;
6227
6228	if (READ_ONCE(f6i->offload) == offload &&
6229	    READ_ONCE(f6i->trap) == trap &&
6230	    READ_ONCE(f6i->offload_failed) == offload_failed)
6231		return;
6232
6233	WRITE_ONCE(f6i->offload, offload);
6234	WRITE_ONCE(f6i->trap, trap);
6235
6236	/* 2 means send notifications only if offload_failed was changed. */
6237	if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
6238	    READ_ONCE(f6i->offload_failed) == offload_failed)
6239		return;
6240
6241	WRITE_ONCE(f6i->offload_failed, offload_failed);
6242
6243	if (!rcu_access_pointer(f6i->fib6_node))
6244		/* The route was removed from the tree, do not send
6245		 * notification.
6246		 */
6247		return;
6248
6249	if (!net->ipv6.sysctl.fib_notify_on_flag_change)
6250		return;
6251
6252	skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
6253	if (!skb) {
6254		err = -ENOBUFS;
6255		goto errout;
6256	}
6257
6258	err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
6259			    0, 0);
6260	if (err < 0) {
6261		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6262		WARN_ON(err == -EMSGSIZE);
6263		kfree_skb(skb);
6264		goto errout;
6265	}
6266
6267	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
6268	return;
6269
6270errout:
6271	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6272}
6273EXPORT_SYMBOL(fib6_info_hw_flags_set);
6274
6275static int ip6_route_dev_notify(struct notifier_block *this,
6276				unsigned long event, void *ptr)
6277{
6278	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
6279	struct net *net = dev_net(dev);
6280
6281	if (!(dev->flags & IFF_LOOPBACK))
6282		return NOTIFY_OK;
6283
6284	if (event == NETDEV_REGISTER) {
6285		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
6286		net->ipv6.ip6_null_entry->dst.dev = dev;
6287		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
6288#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6289		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
6290		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
6291		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
6292		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
6293#endif
6294	 } else if (event == NETDEV_UNREGISTER &&
6295		    dev->reg_state != NETREG_UNREGISTERED) {
6296		/* NETDEV_UNREGISTER could be fired for multiple times by
6297		 * netdev_wait_allrefs(). Make sure we only call this once.
6298		 */
6299		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
6300#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6301		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
6302		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
6303#endif
6304	}
6305
6306	return NOTIFY_OK;
6307}
6308
6309/*
6310 *	/proc
6311 */
6312
6313#ifdef CONFIG_PROC_FS
6314static int rt6_stats_seq_show(struct seq_file *seq, void *v)
6315{
6316	struct net *net = (struct net *)seq->private;
6317	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
6318		   net->ipv6.rt6_stats->fib_nodes,
6319		   net->ipv6.rt6_stats->fib_route_nodes,
6320		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
6321		   net->ipv6.rt6_stats->fib_rt_entries,
6322		   net->ipv6.rt6_stats->fib_rt_cache,
6323		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
6324		   net->ipv6.rt6_stats->fib_discarded_routes);
6325
6326	return 0;
6327}
6328#endif	/* CONFIG_PROC_FS */
6329
6330#ifdef CONFIG_SYSCTL
6331
6332static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
6333			      void *buffer, size_t *lenp, loff_t *ppos)
6334{
6335	struct net *net;
6336	int delay;
6337	int ret;
6338	if (!write)
6339		return -EINVAL;
6340
6341	net = (struct net *)ctl->extra1;
6342	delay = net->ipv6.sysctl.flush_delay;
6343	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
6344	if (ret)
6345		return ret;
6346
6347	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
6348	return 0;
6349}
6350
6351static struct ctl_table ipv6_route_table_template[] = {
6352	{
6353		.procname	=	"max_size",
6354		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
6355		.maxlen		=	sizeof(int),
6356		.mode		=	0644,
6357		.proc_handler	=	proc_dointvec,
6358	},
6359	{
6360		.procname	=	"gc_thresh",
6361		.data		=	&ip6_dst_ops_template.gc_thresh,
6362		.maxlen		=	sizeof(int),
6363		.mode		=	0644,
6364		.proc_handler	=	proc_dointvec,
6365	},
6366	{
6367		.procname	=	"flush",
6368		.data		=	&init_net.ipv6.sysctl.flush_delay,
6369		.maxlen		=	sizeof(int),
6370		.mode		=	0200,
6371		.proc_handler	=	ipv6_sysctl_rtcache_flush
6372	},
6373	{
6374		.procname	=	"gc_min_interval",
6375		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6376		.maxlen		=	sizeof(int),
6377		.mode		=	0644,
6378		.proc_handler	=	proc_dointvec_jiffies,
6379	},
6380	{
6381		.procname	=	"gc_timeout",
6382		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
6383		.maxlen		=	sizeof(int),
6384		.mode		=	0644,
6385		.proc_handler	=	proc_dointvec_jiffies,
6386	},
6387	{
6388		.procname	=	"gc_interval",
6389		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
6390		.maxlen		=	sizeof(int),
6391		.mode		=	0644,
6392		.proc_handler	=	proc_dointvec_jiffies,
6393	},
6394	{
6395		.procname	=	"gc_elasticity",
6396		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
6397		.maxlen		=	sizeof(int),
6398		.mode		=	0644,
6399		.proc_handler	=	proc_dointvec,
6400	},
6401	{
6402		.procname	=	"mtu_expires",
6403		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
6404		.maxlen		=	sizeof(int),
6405		.mode		=	0644,
6406		.proc_handler	=	proc_dointvec_jiffies,
6407	},
6408	{
6409		.procname	=	"min_adv_mss",
6410		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
6411		.maxlen		=	sizeof(int),
6412		.mode		=	0644,
6413		.proc_handler	=	proc_dointvec,
6414	},
6415	{
6416		.procname	=	"gc_min_interval_ms",
6417		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6418		.maxlen		=	sizeof(int),
6419		.mode		=	0644,
6420		.proc_handler	=	proc_dointvec_ms_jiffies,
6421	},
6422	{
6423		.procname	=	"skip_notify_on_dev_down",
6424		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
6425		.maxlen		=	sizeof(u8),
6426		.mode		=	0644,
6427		.proc_handler	=	proc_dou8vec_minmax,
6428		.extra1		=	SYSCTL_ZERO,
6429		.extra2		=	SYSCTL_ONE,
6430	},
6431	{ }
6432};
6433
6434struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
6435{
6436	struct ctl_table *table;
6437
6438	table = kmemdup(ipv6_route_table_template,
6439			sizeof(ipv6_route_table_template),
6440			GFP_KERNEL);
6441
6442	if (table) {
6443		table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
6444		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
6445		table[2].data = &net->ipv6.sysctl.flush_delay;
6446		table[2].extra1 = net;
6447		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6448		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
6449		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
6450		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
6451		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
6452		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
6453		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6454		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
6455
6456		/* Don't export sysctls to unprivileged users */
6457		if (net->user_ns != &init_user_ns)
6458			table[1].procname = NULL;
6459	}
6460
6461	return table;
6462}
6463
6464size_t ipv6_route_sysctl_table_size(struct net *net)
6465{
6466	/* Don't export sysctls to unprivileged users */
6467	if (net->user_ns != &init_user_ns)
6468		return 1;
6469
6470	return ARRAY_SIZE(ipv6_route_table_template);
6471}
6472#endif
6473
6474static int __net_init ip6_route_net_init(struct net *net)
6475{
6476	int ret = -ENOMEM;
6477
6478	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
6479	       sizeof(net->ipv6.ip6_dst_ops));
6480
6481	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
6482		goto out_ip6_dst_ops;
6483
6484	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
6485	if (!net->ipv6.fib6_null_entry)
6486		goto out_ip6_dst_entries;
6487	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
6488	       sizeof(*net->ipv6.fib6_null_entry));
6489
6490	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
6491					   sizeof(*net->ipv6.ip6_null_entry),
6492					   GFP_KERNEL);
6493	if (!net->ipv6.ip6_null_entry)
6494		goto out_fib6_null_entry;
6495	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6496	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
6497			 ip6_template_metrics, true);
6498	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
6499
6500#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6501	net->ipv6.fib6_has_custom_rules = false;
6502	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
6503					       sizeof(*net->ipv6.ip6_prohibit_entry),
6504					       GFP_KERNEL);
6505	if (!net->ipv6.ip6_prohibit_entry)
6506		goto out_ip6_null_entry;
6507	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6508	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
6509			 ip6_template_metrics, true);
6510	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
6511
6512	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
6513					       sizeof(*net->ipv6.ip6_blk_hole_entry),
6514					       GFP_KERNEL);
6515	if (!net->ipv6.ip6_blk_hole_entry)
6516		goto out_ip6_prohibit_entry;
6517	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6518	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
6519			 ip6_template_metrics, true);
6520	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
6521#ifdef CONFIG_IPV6_SUBTREES
6522	net->ipv6.fib6_routes_require_src = 0;
6523#endif
6524#endif
6525
6526	net->ipv6.sysctl.flush_delay = 0;
6527	net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
6528	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
6529	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
6530	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
6531	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
6532	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
6533	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
6534	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
6535
6536	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
6537
6538	ret = 0;
6539out:
6540	return ret;
6541
6542#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6543out_ip6_prohibit_entry:
6544	kfree(net->ipv6.ip6_prohibit_entry);
6545out_ip6_null_entry:
6546	kfree(net->ipv6.ip6_null_entry);
6547#endif
6548out_fib6_null_entry:
6549	kfree(net->ipv6.fib6_null_entry);
6550out_ip6_dst_entries:
6551	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
6552out_ip6_dst_ops:
6553	goto out;
6554}
6555
6556static void __net_exit ip6_route_net_exit(struct net *net)
6557{
6558	kfree(net->ipv6.fib6_null_entry);
6559	kfree(net->ipv6.ip6_null_entry);
6560#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6561	kfree(net->ipv6.ip6_prohibit_entry);
6562	kfree(net->ipv6.ip6_blk_hole_entry);
6563#endif
6564	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
6565}
6566
6567static int __net_init ip6_route_net_init_late(struct net *net)
6568{
6569#ifdef CONFIG_PROC_FS
6570	if (!proc_create_net("ipv6_route", 0, net->proc_net,
6571			     &ipv6_route_seq_ops,
6572			     sizeof(struct ipv6_route_iter)))
6573		return -ENOMEM;
6574
6575	if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
6576				    rt6_stats_seq_show, NULL)) {
6577		remove_proc_entry("ipv6_route", net->proc_net);
6578		return -ENOMEM;
6579	}
6580#endif
6581	return 0;
6582}
6583
6584static void __net_exit ip6_route_net_exit_late(struct net *net)
6585{
6586#ifdef CONFIG_PROC_FS
6587	remove_proc_entry("ipv6_route", net->proc_net);
6588	remove_proc_entry("rt6_stats", net->proc_net);
6589#endif
6590}
6591
6592static struct pernet_operations ip6_route_net_ops = {
6593	.init = ip6_route_net_init,
6594	.exit = ip6_route_net_exit,
6595};
6596
6597static int __net_init ipv6_inetpeer_init(struct net *net)
6598{
6599	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
6600
6601	if (!bp)
6602		return -ENOMEM;
6603	inet_peer_base_init(bp);
6604	net->ipv6.peers = bp;
6605	return 0;
6606}
6607
6608static void __net_exit ipv6_inetpeer_exit(struct net *net)
6609{
6610	struct inet_peer_base *bp = net->ipv6.peers;
6611
6612	net->ipv6.peers = NULL;
6613	inetpeer_invalidate_tree(bp);
6614	kfree(bp);
6615}
6616
6617static struct pernet_operations ipv6_inetpeer_ops = {
6618	.init	=	ipv6_inetpeer_init,
6619	.exit	=	ipv6_inetpeer_exit,
6620};
6621
6622static struct pernet_operations ip6_route_net_late_ops = {
6623	.init = ip6_route_net_init_late,
6624	.exit = ip6_route_net_exit_late,
6625};
6626
6627static struct notifier_block ip6_route_dev_notifier = {
6628	.notifier_call = ip6_route_dev_notify,
6629	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
6630};
6631
6632void __init ip6_route_init_special_entries(void)
6633{
6634	/* Registering of the loopback is done before this portion of code,
6635	 * the loopback reference in rt6_info will not be taken, do it
6636	 * manually for init_net */
6637	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
6638	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
6639	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6640  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6641	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
6642	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6643	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
6644	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6645  #endif
6646}
6647
6648#if IS_BUILTIN(CONFIG_IPV6)
6649#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6650DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
6651
6652BTF_ID_LIST(btf_fib6_info_id)
6653BTF_ID(struct, fib6_info)
6654
6655static const struct bpf_iter_seq_info ipv6_route_seq_info = {
6656	.seq_ops		= &ipv6_route_seq_ops,
6657	.init_seq_private	= bpf_iter_init_seq_net,
6658	.fini_seq_private	= bpf_iter_fini_seq_net,
6659	.seq_priv_size		= sizeof(struct ipv6_route_iter),
6660};
6661
6662static struct bpf_iter_reg ipv6_route_reg_info = {
6663	.target			= "ipv6_route",
6664	.ctx_arg_info_size	= 1,
6665	.ctx_arg_info		= {
6666		{ offsetof(struct bpf_iter__ipv6_route, rt),
6667		  PTR_TO_BTF_ID_OR_NULL },
6668	},
6669	.seq_info		= &ipv6_route_seq_info,
6670};
6671
6672static int __init bpf_iter_register(void)
6673{
6674	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
6675	return bpf_iter_reg_target(&ipv6_route_reg_info);
6676}
6677
6678static void bpf_iter_unregister(void)
6679{
6680	bpf_iter_unreg_target(&ipv6_route_reg_info);
6681}
6682#endif
6683#endif
6684
6685int __init ip6_route_init(void)
6686{
6687	int ret;
6688	int cpu;
6689
6690	ret = -ENOMEM;
6691	ip6_dst_ops_template.kmem_cachep =
6692		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
6693				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
6694	if (!ip6_dst_ops_template.kmem_cachep)
6695		goto out;
6696
6697	ret = dst_entries_init(&ip6_dst_blackhole_ops);
6698	if (ret)
6699		goto out_kmem_cache;
6700
6701	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
6702	if (ret)
6703		goto out_dst_entries;
6704
6705	ret = register_pernet_subsys(&ip6_route_net_ops);
6706	if (ret)
6707		goto out_register_inetpeer;
6708
6709	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
6710
6711	ret = fib6_init();
6712	if (ret)
6713		goto out_register_subsys;
6714
6715	ret = xfrm6_init();
6716	if (ret)
6717		goto out_fib6_init;
6718
6719	ret = fib6_rules_init();
6720	if (ret)
6721		goto xfrm6_init;
6722
6723	ret = register_pernet_subsys(&ip6_route_net_late_ops);
6724	if (ret)
6725		goto fib6_rules_init;
6726
6727	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
6728				   inet6_rtm_newroute, NULL, 0);
6729	if (ret < 0)
6730		goto out_register_late_subsys;
6731
6732	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
6733				   inet6_rtm_delroute, NULL, 0);
6734	if (ret < 0)
6735		goto out_register_late_subsys;
6736
6737	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
6738				   inet6_rtm_getroute, NULL,
6739				   RTNL_FLAG_DOIT_UNLOCKED);
6740	if (ret < 0)
6741		goto out_register_late_subsys;
6742
6743	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
6744	if (ret)
6745		goto out_register_late_subsys;
6746
6747#if IS_BUILTIN(CONFIG_IPV6)
6748#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6749	ret = bpf_iter_register();
6750	if (ret)
6751		goto out_register_late_subsys;
6752#endif
6753#endif
6754
6755	for_each_possible_cpu(cpu) {
6756		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
6757
6758		INIT_LIST_HEAD(&ul->head);
6759		INIT_LIST_HEAD(&ul->quarantine);
6760		spin_lock_init(&ul->lock);
6761	}
6762
6763out:
6764	return ret;
6765
6766out_register_late_subsys:
6767	rtnl_unregister_all(PF_INET6);
6768	unregister_pernet_subsys(&ip6_route_net_late_ops);
6769fib6_rules_init:
6770	fib6_rules_cleanup();
6771xfrm6_init:
6772	xfrm6_fini();
6773out_fib6_init:
6774	fib6_gc_cleanup();
6775out_register_subsys:
6776	unregister_pernet_subsys(&ip6_route_net_ops);
6777out_register_inetpeer:
6778	unregister_pernet_subsys(&ipv6_inetpeer_ops);
6779out_dst_entries:
6780	dst_entries_destroy(&ip6_dst_blackhole_ops);
6781out_kmem_cache:
6782	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
6783	goto out;
6784}
6785
6786void ip6_route_cleanup(void)
6787{
6788#if IS_BUILTIN(CONFIG_IPV6)
6789#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6790	bpf_iter_unregister();
6791#endif
6792#endif
6793	unregister_netdevice_notifier(&ip6_route_dev_notifier);
6794	unregister_pernet_subsys(&ip6_route_net_late_ops);
6795	fib6_rules_cleanup();
6796	xfrm6_fini();
6797	fib6_gc_cleanup();
6798	unregister_pernet_subsys(&ipv6_inetpeer_ops);
6799	unregister_pernet_subsys(&ip6_route_net_ops);
6800	dst_entries_destroy(&ip6_dst_blackhole_ops);
6801	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
6802}
6803