// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"
#include "bpf_kfuncs.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

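/* Fragment offset mask and "More Fragments" flag, matching the layout of
 * iphdr->frag_off. The values are in host order; the code below applies
 * bpf_htons() before comparing against the on-the-wire field.
 */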
#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

struct iphdr_info {
	void *hdr;
	__u64 len;
};

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to return TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)

static bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
{
	if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*iphdr);

	if (iphdr->ihl < 5)
		return -1;

	/* skip ipv4 options */
	*offset += (iphdr->ihl - 5) * 4;

	return 0;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
{
	if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
		return false;

	*offset += sizeof(*ports);

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
		acc += ipw[i];

	iph->check = pkt_checksum_fold(acc);
}

static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
					    const struct ipv6hdr *ipv6, uint8_t *upper_proto,
					    bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
				return false;

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			*offset += (exthdr.len + 1) * 8;

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
			  uint8_t *proto, bool *is_fragment)
{
	if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*ipv6);

	if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
		return -1;

	return 0;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

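/* Strip the encapsulation headers and redirect the packet to the ingress path
 * of the interface it arrived on, i.e. deliver the inner packet locally.
 */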
static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Change the ethertype if the encapsulated packet is IPv6. */
	if (encap->gue.proto_ctype == IPPROTO_IPV6)
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

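/* Rewrite the GUE encapsulation into a plain GRE-in-IPv4 encapsulation towards
 * next_hop, decrementing the inner TTL/hop limit along the way as a loop
 * guard, and bounce the packet back out of the interface it arrived on.
 */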
static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      encap_headers_t *encap, struct in_addr *next_hop,
			      metrics_t *metrics)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	__u8 encap_buffer[sizeof(encap_gre_t)] = {};
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;
	encap_gre_t *encap_gre;

	metrics->forwarded_packets_total_gre++;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. Since the only field we touch is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/
	 * bpf_skb_store_bytes, which handle split packets if needed (the data
	 * does not need to be in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one
		 * byte, this helper only accepts 2- and 4-byte sizes (the
		 * result is the same either way).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap_gre) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	if (encap_gre == encap_buffer)
		bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return bpf_redirect(skb->ifindex, 0);
}

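/* Forward the encapsulated packet to the next hop listed in the GLB header:
 * either by rewriting the GUE encapsulation in place, or, on the last hop, by
 * switching to GRE encapsulation if the header requests it.
 */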
static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
				 encap_headers_t *encap, struct in_addr *next_hop,
				 metrics_t *metrics)
{
	/* Swap the L2 addresses. This assumes that packets are received from a
	 * router, so swapping the MAC addresses here will make the packet go
	 * back to the router, which will send it on to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

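/* Skip over up to one next-hop entry in the GLB header. The fall through from
 * case 1 to case 0 is intentional; anything else is rejected.
 */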
static ret_t skip_next_hops(__u64 *offset, int n)
{
	switch (n) {
	case 1:
		*offset += sizeof(struct in_addr);
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * *offset points just past the variable length GLB header
 * iff the call is successful.
 */
static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
			  struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count)
		return TC_ACT_SHOT;

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
		return TC_ACT_SHOT;

	*offset += sizeof(*next_hop);

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

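/* Classify a TCP flow via socket lookup: a non-listening socket means an
 * established connection, a listener with a valid SYN cookie in the packet
 * means SYN_COOKIE, anything else stays UNKNOWN.
 */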
static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
			      uint64_t tuplen, void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);

		if (tuplen == sizeof(tuple->ipv6))
			iphlen = sizeof(struct ipv6hdr);

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

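/* Classify a UDP flow via socket lookup: a connected socket (reported as
 * BPF_TCP_ESTABLISHED) means ESTABLISHED, anything else is UNKNOWN.
 */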
static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

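/* Classify the flow embedded in an ICMP error payload, based on the original
 * packet's L4 protocol.
 */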
static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
			       uint64_t tuplen, metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

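/* Handle an encapsulated ICMPv4 message: accept echo requests locally, and
 * use the original headers quoted in "fragmentation needed" errors to
 * classify the flow they belong to.
 */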
static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
				metrics_t *metrics)
{
	struct icmphdr icmp;
	struct iphdr ipv4;

	if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	*offset += sizeof(icmp);

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO)
		return ECHO_REQUEST;

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4.daddr;
	tuple.ipv4.daddr = ipv4.saddr;

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, ipv4.protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

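/* Handle an encapsulated ICMPv6 message: accept echo requests locally, and
 * use the original headers quoted in "packet too big" errors to classify the
 * flow they belong to.
 */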
static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
				metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct ipv6hdr ipv6;
	struct icmp6hdr icmp6;
	bool is_fragment;
	uint8_t l4_proto;

	if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct tcphdr tcp;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_tcp++;

	if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	*offset += sizeof(tcp);

	if (tcp.syn)
		return SYN;

	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
}

static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct udphdr udph;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_udp++;

	if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}
	*offset += sizeof(udph);

	tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
	return classify_udp(skb, &tuple, tuplen);
}

static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct iphdr ipv4;
	struct iphdr_info info = {
		.hdr = &ipv4,
		.len = sizeof(ipv4),
	};

	metrics->l3_protocol_packets_total_ipv4++;

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4.version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(&ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4.protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(skb, dynptr, offset, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct ipv6hdr ipv6;
	struct iphdr_info info = {
		.hdr = &ipv6,
		.len = sizeof(ipv6),
	};
	uint8_t l4_proto;
	bool is_fragment;

	metrics->l3_protocol_packets_total_ipv6++;

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6.version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(dynptr, offset, skb, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

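/* Main TC classifier: validate the outer IPv4/UDP/GUE/GLB encapsulation,
 * classify the inner packet, and then either accept it locally (stripping the
 * encapsulation) or forward it to the next hop from the GLB header.
 */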
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
	struct bpf_dynptr dynptr;
	struct in_addr next_hop;
	/* Tracks offset of the dynptr. This will be unnecessary once
	 * bpf_dynptr_advance() is available.
	 */
	__u64 off = 0;
	ret_t ret;

	bpf_dynptr_from_skb(skb, 0, &dynptr);

	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL)
		return TC_ACT_SHOT;

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap)))
		return TC_ACT_OK;

	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap)
		return TC_ACT_OK;

	off += sizeof(*encap);

	if (encap->ip.ihl != 5)
		/* We never have any options. */
		return TC_ACT_OK;

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP)
		return TC_ACT_OK;

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT)
		return TC_ACT_OK;

	/* Now that we know the packet is destined for us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0)
		return TC_ACT_SHOT;

	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(skb, &dynptr, &off, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(skb, &dynptr, &off, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	ret = accept_locally(skb, encap);

	if (encap == encap_buffer)
		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return ret;
}