// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}
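
/* Added commentary (not part of the original source): the bucket index is
 * derived from the tunnel key and the remote address only, e.g.
 *
 *	hash_32(key ^ remote, IP_TNL_HASH_BITS)
 *
 * which, assuming IP_TNL_HASH_BITS is 7 as defined in net/ip_tunnels.h,
 * selects one of 128 buckets.  Keyless tunnels hash with key == 0 and
 * wildcard-remote tunnels with remote == 0, which is what the lookup
 * below relies on when it re-hashes with (key, 0).
 */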

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options.

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
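
/* Summary of the lookup order above (a reading aid, not part of the original
 * source): pass 1 matches an exact (saddr, daddr, key) triple, pass 2 matches
 * daddr with a wildcard source, pass 3 matches the local address (or a
 * multicast daddr), and pass 4 matches a fully wildcarded tunnel on the key
 * alone.  Within each pass a tunnel bound to the ingress link wins
 * immediately; otherwise the first hit is remembered as a candidate.  If
 * nothing matches, the collect_md device and finally the per-netns fallback
 * device are tried.
 */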

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
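
/* Added note: additions and removals above use the RCU hlist helpers, so
 * ip_tunnel_lookup() can walk the chains locklessly from the receive path
 * while writers serialize under RTNL.
 */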

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == READ_ONCE(t->parms.link) &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), dev_net(dev),
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
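
/* Worked example (illustrative only, not from the original source): for a
 * plain GRE device whose underlay has an mtu of 1500 and whose tunnel header
 * is t_hlen = 4 (GRE) + 20 (outer IPv4) = 24 bytes, ip_tunnel_bind_dev()
 * would return 1500 - 24 = 1476, never less than IPV4_MIN_MTU.  The exact
 * numbers depend on the tunnel type, its flags and any configured
 * encapsulation.
 */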

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
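
/* Usage sketch (hedged, not part of this file): UDP-based encapsulations
 * such as FOU/GUE register their ops once at module init, roughly
 *
 *	static const struct ip_tunnel_encap_ops fou_iptun_ops = { ... };
 *	err = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *
 * and remove them with ip_tunnel_encap_del_ops() on unload; the cmpxchg()
 * above makes concurrent registration of the same slot safe.
 */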

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
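
/* Added commentary: when DF is set, the usable inner MTU is the route MTU
 * minus the outer IPv4 header and the tunnel header (e.g. 1500 - 20 - 4 =
 * 1476 for plain GRE); oversized packets carrying DF trigger an ICMP
 * "fragmentation needed" (or ICMPv6 "packet too big") back towards the
 * sender instead of being tunnelled.
 */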

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upper limit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
			    key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
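
/* Added note: ip_md_tunnel_xmit() is the transmit path for collect_md
 * (externally controlled, flow-based) tunnels, where addresses, key, TOS and
 * TTL come from the skb's ip_tunnel_info rather than from the netdevice
 * configuration; classically configured tunnels go through ip_tunnel_xmit()
 * below instead.
 */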

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos),
			    dev_net(dev), READ_ONCE(tunnel->parms.link),
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		WRITE_ONCE(t->parms.link, p->link);
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
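
/* Usage sketch (assumed driver wiring, not shown in this file): tunnel
 * drivers typically point .ndo_siocdevprivate at this helper and implement
 * the protocol-specific checks in .ndo_tunnel_ctl, e.g.
 *
 *	.ndo_siocdevprivate	= ip_tunnel_siocdevprivate,
 *	.ndo_tunnel_ctl		= ipgre_tunnel_ctl,
 *
 * so the user/kernel copy of struct ip_tunnel_parm lives in one place.
 */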

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	const struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->parms.link);
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to move to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
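
/* Usage sketch (hedged, from a typical caller rather than this file): a
 * tunnel module calls this from its pernet init, roughly
 *
 *	static int __net_init ipgre_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipgre_net_id,
 *					  &ipgre_link_ops, NULL);
 *	}
 *
 * so that each netns gets its own hash table and, where fallback tunnels
 * are enabled, a fallback device named after the link ops kind.
 */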

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops,
			   struct list_head *dev_to_kill)
{
	struct ip_tunnel_net *itn;
	struct net *net;

	ASSERT_RTNL();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	netdev_lockdep_set_classes(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");