// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "bpf_compiler.h"
#include "test_cls_redirect.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
        (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
        uint64_t processed_packets_total;
        uint64_t l3_protocol_packets_total_ipv4;
        uint64_t l3_protocol_packets_total_ipv6;
        uint64_t l4_protocol_packets_total_tcp;
        uint64_t l4_protocol_packets_total_udp;
        uint64_t accepted_packets_total_syn;
        uint64_t accepted_packets_total_syn_cookies;
        uint64_t accepted_packets_total_last_hop;
        uint64_t accepted_packets_total_icmp_echo_request;
        uint64_t accepted_packets_total_established;
        uint64_t forwarded_packets_total_gue;
        uint64_t forwarded_packets_total_gre;

        uint64_t errors_total_unknown_l3_proto;
        uint64_t errors_total_unknown_l4_proto;
        uint64_t errors_total_malformed_ip;
        uint64_t errors_total_fragmented_ip;
        uint64_t errors_total_malformed_icmp;
        uint64_t errors_total_unwanted_icmp;
        uint64_t errors_total_malformed_icmp_pkt_too_big;
        uint64_t errors_total_malformed_tcp;
        uint64_t errors_total_malformed_udp;
        uint64_t errors_total_icmp_echo_replies;
        uint64_t errors_total_malformed_encapsulation;
        uint64_t errors_total_encap_adjust_failed;
        uint64_t errors_total_encap_buffer_too_small;
        uint64_t errors_total_redirect_loop;
        uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
        INVALID = 0,
        UNKNOWN,
        ECHO_REQUEST,
        SYN,
        SYN_COOKIE,
        ESTABLISHED,
} verdict_t;

typedef struct {
        uint16_t src, dst;
} flow_ports_t;

_Static_assert(
        sizeof(flow_ports_t) !=
                offsetofend(struct bpf_sock_tuple, ipv4.dport) -
                        offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
        "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
        sizeof(flow_ports_t) !=
                offsetofend(struct bpf_sock_tuple, ipv6.dport) -
                        offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
        "flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
        do {                                      \
                ret_t __ret = x;                  \
                if (__ret != CONTINUE_PROCESSING) \
                        return __ret;             \
        } while (0)
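/* Illustrative use of MAYBE_RETURN (hypothetical caller, not part of this
 * program):
 *
 *      static INLINING ret_t handle(buf_t *pkt)
 *      {
 *              MAYBE_RETURN(parse_outer(pkt)); // propagates e.g. TC_ACT_SHOT
 *              return CONTINUE_PROCESSING;     // otherwise keep going
 *      }
 *
 * Only a real verdict escapes the macro; CONTINUE_PROCESSING falls through
 * to the next processing stage.
 */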
/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
        struct __sk_buff *skb;
        net_ptr head;
        /* NB: tail mustn't have alignment other than 1, otherwise
         * LLVM will go and eliminate code, e.g. when checking packet lengths.
         */
        uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
        /* Clang seems to optimize constructs like
         *    a - b + c
         * if c is known:
         *    r? = c
         *    r? -= b
         *    r? += a
         *
         * This is a problem if a and b are packet pointers,
         * since the verifier allows subtracting two pointers to
         * get a scalar, but not a scalar and a pointer.
         *
         * Use inline asm to break this optimization.
         */
        size_t off = (size_t)buf->head;
        asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
        return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
        if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
                return false;
        }

        buf->head += len;
        return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
        /* Check whether off + len is valid in the non-linear part. */
        if (buf_off(buf) + len > buf->skb->len) {
                return false;
        }

        buf->head += len;
        return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
        if (buf->head + len > buf->tail) {
                if (scratch == NULL) {
                        return NULL;
                }

                return buf_copy(buf, scratch, len) ? scratch : NULL;
        }

        void *ptr = buf->head;
        buf->head += len;
        return ptr;
}

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
        if (ipv4->ihl <= 5) {
                return true;
        }

        return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
        uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
        return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
        struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
        if (ipv4 == NULL) {
                return NULL;
        }

        if (ipv4->ihl < 5) {
                return NULL;
        }

        if (!pkt_skip_ipv4_options(pkt, ipv4)) {
                return NULL;
        }

        return ipv4;
}
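/* Worked example for the IHL handling above: ihl counts 32-bit words, so
 * the minimum legal value is 5 (a 20-byte header without options). For a
 * hypothetical header with ihl == 7 the header is 28 bytes long, and
 * pkt_skip_ipv4_options() consumes the (7 - 5) * 4 == 8 option bytes, so
 * the buffer ends up positioned at the L4 header.
 */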
/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
        if (!buf_copy(pkt, ports, sizeof(*ports))) {
                return false;
        }

        /* Ports in the L4 headers are reversed, since we are parsing an ICMP
         * payload which is going towards the eyeball.
         */
        uint16_t dst = ports->src;
        ports->src = ports->dst;
        ports->dst = dst;
        return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
        /* The highest reasonable value for an IPv4 header
         * checksum requires two folds, so we just do that always.
         */
        csum = (csum & 0xffff) + (csum >> 16);
        csum = (csum & 0xffff) + (csum >> 16);
        return (uint16_t)~csum;
}

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
        iph->check = 0;

        /* An IP header without options is 20 bytes. Two of those
         * are the checksum, which we always set to zero. Hence,
         * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
         * which fits in 32 bit.
         */
        _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
        uint32_t acc = 0;
        uint16_t *ipw = (uint16_t *)iph;

        __pragma_loop_unroll_full
        for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
                acc += ipw[i];
        }

        iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
                                     const struct ipv6hdr *ipv6,
                                     uint8_t *upper_proto,
                                     bool *is_fragment)
{
        /* We understand five extension headers.
         * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
         * headers should occur once, except Destination Options, which may
         * occur twice. Hence we give up after 6 headers.
         */
        struct {
                uint8_t next;
                uint8_t len;
        } exthdr = {
                .next = ipv6->nexthdr,
        };
        *is_fragment = false;

        __pragma_loop_unroll_full
        for (int i = 0; i < 6; i++) {
                switch (exthdr.next) {
                case IPPROTO_FRAGMENT:
                        *is_fragment = true;
                        /* NB: We don't check that hdrlen == 0 as per spec. */
                        /* fallthrough; */

                case IPPROTO_HOPOPTS:
                case IPPROTO_ROUTING:
                case IPPROTO_DSTOPTS:
                case IPPROTO_MH:
                        if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
                                return false;
                        }

                        /* hdrlen is in 8-octet units, and excludes the first 8 octets. */
                        if (!buf_skip(pkt,
                                      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
                                return false;
                        }

                        /* Decode next header */
                        break;

                default:
                        /* The next header is not one of the known extension
                         * headers, treat it as the upper layer header.
                         *
                         * This handles IPPROTO_NONE.
                         *
                         * Encapsulating Security Payload (50) and Authentication
                         * Header (51) also end up here (and will trigger an
                         * unknown proto error later). They have a custom header
                         * format and seem too esoteric to care about.
                         */
                        *upper_proto = exthdr.next;
                        return true;
                }
        }

        /* We never found an upper layer header. */
        return false;
}
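/* Worked example for the extension header arithmetic above: a Routing
 * header whose Hdr Ext Len field is 1 occupies (1 + 1) * 8 == 16 octets in
 * total. buf_copy() has already consumed the 2-byte {next, len} prefix, so
 * buf_skip() advances by the remaining 16 - 2 == 14 octets, landing exactly
 * on the start of the next header.
 */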
/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
               bool *is_fragment)
{
        struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
        if (ipv6 == NULL) {
                return NULL;
        }

        if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
                return NULL;
        }

        return ipv6;
}

/* Global metrics, per CPU
 */
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(max_entries, 1);
        __type(key, unsigned int);
        __type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
        uint64_t key = 0;
        return bpf_map_lookup_elem(&metrics_map, &key);
}
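/* User-space sketch (illustrative, not compiled into this object): since
 * metrics_map is a BPF_MAP_TYPE_PERCPU_ARRAY, libbpf's syscall wrapper
 * returns one metrics_t per possible CPU, which a reader has to sum.
 * map_fd and MAX_CPUS are hypothetical.
 *
 *      metrics_t values[MAX_CPUS] = {};
 *      uint64_t total = 0;
 *      uint32_t key = 0;
 *      int ncpus = libbpf_num_possible_cpus();
 *
 *      if (bpf_map_lookup_elem(map_fd, &key, values) == 0)
 *              for (int i = 0; i < ncpus; i++)
 *                      total += values[i].processed_packets_total;
 */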
static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
        const int payload_off =
                sizeof(*encap) +
                sizeof(struct in_addr) * encap->unigue.hop_count;
        int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

        /* Fix up the ethertype if the encapsulated packet is IPv6. */
        if (encap->gue.proto_ctype == IPPROTO_IPV6) {
                encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
        }

        if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
                                BPF_F_ADJ_ROOM_FIXED_GSO |
                                BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
            bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
                return TC_ACT_SHOT;

        return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
                                       struct in_addr *next_hop, metrics_t *metrics)
{
        metrics->forwarded_packets_total_gre++;

        const int payload_off =
                sizeof(*encap) +
                sizeof(struct in_addr) * encap->unigue.hop_count;
        int32_t encap_overhead =
                payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
        int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
        uint16_t proto = ETH_P_IP;
        uint32_t mtu_len = 0;

        /* Loop protection: the inner packet's TTL is decremented as a safeguard
         * against any forwarding loop. As the only interesting field is the TTL
         * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
         * as they handle the split packets if needed (no need for the data to be
         * in the linear section).
         */
        if (encap->gue.proto_ctype == IPPROTO_IPV6) {
                proto = ETH_P_IPV6;
                uint8_t ttl;
                int rc;

                rc = bpf_skb_load_bytes(
                        skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
                        &ttl, 1);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }

                if (ttl == 0) {
                        metrics->errors_total_redirect_loop++;
                        return TC_ACT_SHOT;
                }

                ttl--;
                rc = bpf_skb_store_bytes(
                        skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
                        &ttl, 1, 0);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }
        } else {
                uint8_t ttl;
                int rc;

                rc = bpf_skb_load_bytes(
                        skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
                        1);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }

                if (ttl == 0) {
                        metrics->errors_total_redirect_loop++;
                        return TC_ACT_SHOT;
                }

                /* IPv4 also has a checksum to patch. While the TTL is only one byte,
                 * this helper only works with 2- and 4-byte sizes (the result is
                 * the same).
                 */
                rc = bpf_l3_csum_replace(
                        skb, payload_off + offsetof(struct iphdr, check), ttl,
                        ttl - 1, 2);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }

                ttl--;
                rc = bpf_skb_store_bytes(
                        skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
                        0);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }
        }

        if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
                metrics->errors_total_encap_mtu_violate++;
                return TC_ACT_SHOT;
        }

        if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
                                BPF_F_ADJ_ROOM_FIXED_GSO |
                                BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
            bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
                metrics->errors_total_encap_adjust_failed++;
                return TC_ACT_SHOT;
        }

        if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
                metrics->errors_total_encap_buffer_too_small++;
                return TC_ACT_SHOT;
        }

        buf_t pkt = {
                .skb = skb,
                .head = (uint8_t *)(long)skb->data,
                .tail = (uint8_t *)(long)skb->data_end,
        };

        encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
        if (encap_gre == NULL) {
                metrics->errors_total_encap_buffer_too_small++;
                return TC_ACT_SHOT;
        }

        encap_gre->ip.protocol = IPPROTO_GRE;
        encap_gre->ip.daddr = next_hop->s_addr;
        encap_gre->ip.saddr = ENCAPSULATION_IP;
        encap_gre->ip.tot_len =
                bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
        encap_gre->gre.flags = 0;
        encap_gre->gre.protocol = bpf_htons(proto);
        pkt_ipv4_checksum((void *)&encap_gre->ip);

        return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
                                          struct in_addr *next_hop, metrics_t *metrics)
{
        /* swap L2 addresses */
        /* This assumes that packets are received from a router.
         * So just swapping the MAC addresses here will make the packet go back to
         * the router, which will send it to the appropriate machine.
         */
        unsigned char temp[ETH_ALEN];
        memcpy(temp, encap->eth.h_dest, sizeof(temp));
        memcpy(encap->eth.h_dest, encap->eth.h_source,
               sizeof(encap->eth.h_dest));
        memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

        if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
            encap->unigue.last_hop_gre) {
                return forward_with_gre(skb, encap, next_hop, metrics);
        }

        metrics->forwarded_packets_total_gue++;
        uint32_t old_saddr = encap->ip.saddr;
        encap->ip.saddr = encap->ip.daddr;
        encap->ip.daddr = next_hop->s_addr;
        if (encap->unigue.next_hop < encap->unigue.hop_count) {
                encap->unigue.next_hop++;
        }

        /* Remove ip->saddr, add next_hop->s_addr */
        const uint64_t off = offsetof(typeof(*encap), ip.check);
        int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
        if (ret < 0) {
                return TC_ACT_SHOT;
        }

        return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
        switch (n) {
        case 1:
                if (!buf_skip(pkt, sizeof(struct in_addr)))
                        return TC_ACT_SHOT;
                /* fallthrough */
        case 0:
                return CONTINUE_PROCESSING;

        default:
                return TC_ACT_SHOT;
        }
}
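/* Layout sketch for the GLB hop list consumed by get_next_hop() below
 * (hop values illustrative): with hop_count == 3 and next_hop == 1, the
 * bytes following encap_headers_t are
 *
 *      [ hop0 ][ hop1 ][ hop2 ]   (one struct in_addr each)
 *               ^
 *               next_hop indexes hop1; hop0 is skipped as "used", hop1 is
 *               copied out, and hop2 is skipped so that pkt ends up just
 *               past the variable length header.
 */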
/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
                                   struct in_addr *next_hop)
{
        if (encap->unigue.next_hop > encap->unigue.hop_count) {
                return TC_ACT_SHOT;
        }

        /* Skip "used" next hops. */
        MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

        if (encap->unigue.next_hop == encap->unigue.hop_count) {
                /* No more next hops, we are at the end of the GLB header. */
                next_hop->s_addr = 0;
                return CONTINUE_PROCESSING;
        }

        if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
                return TC_ACT_SHOT;
        }

        /* Skip the remaining next hops (may be zero). */
        return skip_next_hops(pkt, encap->unigue.hop_count -
                                      encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
                                    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
        switch (iphlen) {
        case sizeof(struct iphdr): {
                struct iphdr *ipv4 = (struct iphdr *)iph;
                tuple->ipv4.daddr = ipv4->daddr;
                tuple->ipv4.saddr = ipv4->saddr;
                tuple->ipv4.sport = sport;
                tuple->ipv4.dport = dport;
                return sizeof(tuple->ipv4);
        }

        case sizeof(struct ipv6hdr): {
                struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
                memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
                       sizeof(tuple->ipv6.daddr));
                memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
                       sizeof(tuple->ipv6.saddr));
                tuple->ipv6.sport = sport;
                tuple->ipv6.dport = dport;
                return sizeof(tuple->ipv6);
        }

        default:
                return 0;
        }
}
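/* Illustrative call (hypothetical caller, mirroring process_tcp() below):
 *
 *      struct bpf_sock_tuple t;
 *      uint64_t len = fill_tuple(&t, ipv4, sizeof(struct iphdr),
 *                                tcp->source, tcp->dest);
 *
 * len is then sizeof(t.ipv4), a compile-time constant the verifier can
 * track, so passing it straight to bpf_skc_lookup_tcp() verifies.
 */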
static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
                                       struct bpf_sock_tuple *tuple, uint64_t tuplen,
                                       void *iph, struct tcphdr *tcp)
{
        struct bpf_sock *sk =
                bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
        if (sk == NULL) {
                return UNKNOWN;
        }

        if (sk->state != BPF_TCP_LISTEN) {
                bpf_sk_release(sk);
                return ESTABLISHED;
        }

        if (iph != NULL && tcp != NULL) {
                /* Kludge: we've run out of arguments, but need the length of the ip header. */
                uint64_t iphlen = sizeof(struct iphdr);
                if (tuplen == sizeof(tuple->ipv6)) {
                        iphlen = sizeof(struct ipv6hdr);
                }

                if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
                                            sizeof(*tcp)) == 0) {
                        bpf_sk_release(sk);
                        return SYN_COOKIE;
                }
        }

        bpf_sk_release(sk);
        return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
                                       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
        struct bpf_sock *sk =
                bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
        if (sk == NULL) {
                return UNKNOWN;
        }

        if (sk->state == BPF_TCP_ESTABLISHED) {
                bpf_sk_release(sk);
                return ESTABLISHED;
        }

        bpf_sk_release(sk);
        return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
                                        struct bpf_sock_tuple *tuple, uint64_t tuplen,
                                        metrics_t *metrics)
{
        switch (proto) {
        case IPPROTO_TCP:
                return classify_tcp(skb, tuple, tuplen, NULL, NULL);

        case IPPROTO_UDP:
                return classify_udp(skb, tuple, tuplen);

        default:
                metrics->errors_total_malformed_icmp++;
                return INVALID;
        }
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
        struct icmphdr icmp;
        if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
                metrics->errors_total_malformed_icmp++;
                return INVALID;
        }

        /* We should never receive encapsulated echo replies. */
        if (icmp.type == ICMP_ECHOREPLY) {
                metrics->errors_total_icmp_echo_replies++;
                return INVALID;
        }

        if (icmp.type == ICMP_ECHO) {
                return ECHO_REQUEST;
        }

        if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
                metrics->errors_total_unwanted_icmp++;
                return INVALID;
        }

        struct iphdr _ip4;
        const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
        if (ipv4 == NULL) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        /* The source address in the outer IP header is from the entity that
         * originated the ICMP message. Use the original IP header to restore
         * the correct flow tuple.
         */
        struct bpf_sock_tuple tuple;
        tuple.ipv4.saddr = ipv4->daddr;
        tuple.ipv4.daddr = ipv4->saddr;

        if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
                             sizeof(tuple.ipv4), metrics);
}
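/* Worked example for the tuple reversal above (addresses illustrative):
 * the service sent VIP:443 -> eyeball 192.0.2.1:50000, so the embedded
 * header of the ICMP error reads saddr=VIP, daddr=192.0.2.1, sport=443,
 * dport=50000. The local socket is keyed on the inbound flow, so the
 * lookup tuple must be saddr=192.0.2.1, daddr=VIP, sport=50000, dport=443,
 * hence both the address swap here and the port swap in
 * pkt_parse_icmp_l4_ports().
 */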
static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
        struct icmp6hdr icmp6;
        if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
                metrics->errors_total_malformed_icmp++;
                return INVALID;
        }

        /* We should never receive encapsulated echo replies. */
        if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
                metrics->errors_total_icmp_echo_replies++;
                return INVALID;
        }

        if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
                return ECHO_REQUEST;
        }

        if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
                metrics->errors_total_unwanted_icmp++;
                return INVALID;
        }

        bool is_fragment;
        uint8_t l4_proto;
        struct ipv6hdr _ipv6;
        const struct ipv6hdr *ipv6 =
                pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
        if (ipv6 == NULL) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        if (is_fragment) {
                metrics->errors_total_fragmented_ip++;
                return INVALID;
        }

        /* Swap source and dest addresses. */
        struct bpf_sock_tuple tuple;
        memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
        memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

        if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
                             metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
                                      metrics_t *metrics)
{
        metrics->l4_protocol_packets_total_tcp++;

        struct tcphdr _tcp;
        struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
        if (tcp == NULL) {
                metrics->errors_total_malformed_tcp++;
                return INVALID;
        }

        if (tcp->syn) {
                return SYN;
        }

        struct bpf_sock_tuple tuple;
        uint64_t tuplen =
                fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
        return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
                                      metrics_t *metrics)
{
        metrics->l4_protocol_packets_total_udp++;

        struct udphdr _udp;
        struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
        if (udph == NULL) {
                metrics->errors_total_malformed_udp++;
                return INVALID;
        }

        struct bpf_sock_tuple tuple;
        uint64_t tuplen =
                fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
        return classify_udp(pkt->skb, &tuple, tuplen);
}
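/* Note on the _tcp/_udp scratch slots above: buf_assign() returns a direct
 * packet pointer when the header lies in the linear area, and otherwise
 * copies the bytes into the caller's stack slot and returns that instead.
 * Reads work either way; only writes through a direct packet pointer reach
 * the actual packet.
 */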
static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
        metrics->l3_protocol_packets_total_ipv4++;

        struct iphdr _ip4;
        struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
        if (ipv4 == NULL) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (ipv4->version != 4) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (ipv4_is_fragment(ipv4)) {
                metrics->errors_total_fragmented_ip++;
                return INVALID;
        }

        switch (ipv4->protocol) {
        case IPPROTO_ICMP:
                return process_icmpv4(pkt, metrics);

        case IPPROTO_TCP:
                return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

        case IPPROTO_UDP:
                return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

        default:
                metrics->errors_total_unknown_l4_proto++;
                return INVALID;
        }
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
        metrics->l3_protocol_packets_total_ipv6++;

        uint8_t l4_proto;
        bool is_fragment;
        struct ipv6hdr _ipv6;
        struct ipv6hdr *ipv6 =
                pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
        if (ipv6 == NULL) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (ipv6->version != 6) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (is_fragment) {
                metrics->errors_total_fragmented_ip++;
                return INVALID;
        }

        switch (l4_proto) {
        case IPPROTO_ICMPV6:
                return process_icmpv6(pkt, metrics);

        case IPPROTO_TCP:
                return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

        case IPPROTO_UDP:
                return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

        default:
                metrics->errors_total_unknown_l4_proto++;
                return INVALID;
        }
}
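/* Overview of cls_redirect() below (a summary of the existing logic):
 *
 *      outer eth/IPv4/UDP/GUE sanity checks
 *          -> get_next_hop()
 *              -> no hops left:  accept_locally()
 *              -> otherwise:     process_ipv4()/process_ipv6() on the
 *                                inner packet, then either
 *                                forward_to_next_hop() (UNKNOWN, or SYN
 *                                with forward_syn set) or accept_locally()
 */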
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
        metrics_t *metrics = get_global_metrics();
        if (metrics == NULL) {
                return TC_ACT_SHOT;
        }

        metrics->processed_packets_total++;

        /* Pass bogus packets as long as we're not sure they're
         * destined for us.
         */
        if (skb->protocol != bpf_htons(ETH_P_IP)) {
                return TC_ACT_OK;
        }

        encap_headers_t *encap;

        /* Make sure that all encapsulation headers are available in
         * the linear portion of the skb. This makes it easy to manipulate them.
         */
        if (bpf_skb_pull_data(skb, sizeof(*encap))) {
                return TC_ACT_OK;
        }

        buf_t pkt = {
                .skb = skb,
                .head = (uint8_t *)(long)skb->data,
                .tail = (uint8_t *)(long)skb->data_end,
        };

        encap = buf_assign(&pkt, sizeof(*encap), NULL);
        if (encap == NULL) {
                return TC_ACT_OK;
        }

        if (encap->ip.ihl != 5) {
                /* We never have any options. */
                return TC_ACT_OK;
        }

        if (encap->ip.daddr != ENCAPSULATION_IP ||
            encap->ip.protocol != IPPROTO_UDP) {
                return TC_ACT_OK;
        }

        /* TODO Check UDP length? */
        if (encap->udp.dest != ENCAPSULATION_PORT) {
                return TC_ACT_OK;
        }

        /* We now know that the packet is destined to us, so we can
         * drop bogus ones.
         */
        if (ipv4_is_fragment((void *)&encap->ip)) {
                metrics->errors_total_fragmented_ip++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.variant != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.control != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.flags != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.hlen !=
            sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->unigue.version != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->unigue.reserved != 0) {
                return TC_ACT_SHOT;
        }

        struct in_addr next_hop;
        MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

        if (next_hop.s_addr == 0) {
                metrics->accepted_packets_total_last_hop++;
                return accept_locally(skb, encap);
        }

        verdict_t verdict;
        switch (encap->gue.proto_ctype) {
        case IPPROTO_IPIP:
                verdict = process_ipv4(&pkt, metrics);
                break;

        case IPPROTO_IPV6:
                verdict = process_ipv6(&pkt, metrics);
                break;

        default:
                metrics->errors_total_unknown_l3_proto++;
                return TC_ACT_SHOT;
        }

        switch (verdict) {
        case INVALID:
                /* metrics have already been bumped */
                return TC_ACT_SHOT;

        case UNKNOWN:
                return forward_to_next_hop(skb, encap, &next_hop, metrics);

        case ECHO_REQUEST:
                metrics->accepted_packets_total_icmp_echo_request++;
                break;

        case SYN:
                if (encap->unigue.forward_syn) {
                        return forward_to_next_hop(skb, encap, &next_hop,
                                                   metrics);
                }

                metrics->accepted_packets_total_syn++;
                break;

        case SYN_COOKIE:
                metrics->accepted_packets_total_syn_cookies++;
                break;

        case ESTABLISHED:
                metrics->accepted_packets_total_established++;
                break;
        }

        return accept_locally(skb, encap);
}
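/* Deployment sketch (illustrative shell; device and object names are
 * hypothetical):
 *
 *      tc qdisc add dev eth0 clsact
 *      tc filter add dev eth0 ingress bpf da obj cls_redirect.o sec tc
 *
 * ENCAPSULATION_IP and ENCAPSULATION_PORT are volatile const globals, so a
 * loader can set them in the object's read-only data before load (e.g. via
 * a libbpf skeleton's rodata) rather than hard-coding them here.
 */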