// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"
#include "bpf_kfuncs.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

struct iphdr_info {
	void *hdr;
	__u64 len;
};

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
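/* Usage sketch: helpers below return CONTINUE_PROCESSING to hand control
 * back to the caller, or a concrete TC verdict to short-circuit, e.g.
 *
 *	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));
 *	// only reached if get_next_hop() returned CONTINUE_PROCESSING
 */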
static bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
{
	if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*iphdr);

	if (iphdr->ihl < 5)
		return -1;

	/* skip ipv4 options */
	*offset += (iphdr->ihl - 5) * 4;

	return 0;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
{
	if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
		return false;

	*offset += sizeof(*ports);

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
		acc += ipw[i];

	iph->check = pkt_checksum_fold(acc);
}
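/* Worked example of the two folds above, with a hypothetical accumulator
 * value of 0x2fffe:
 *
 *	fold 1: 0xfffe + 0x2 = 0x10000   (still overflows 16 bits)
 *	fold 2: 0x0000 + 0x1 = 0x0001
 *	result: ~0x0001      = 0xfffe
 */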
static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
					    const struct ipv6hdr *ipv6, uint8_t *upper_proto,
					    bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
				return false;

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			*offset += (exthdr.len + 1) * 8;

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
			  uint8_t *proto, bool *is_fragment)
{
	if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*ipv6);

	if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
		return -1;

	return 0;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static metrics_t *get_global_metrics(void)
{
	unsigned int key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}
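/* Userspace sketch (hypothetical, assuming libbpf): a lookup on a
 * BPF_MAP_TYPE_PERCPU_ARRAY yields one metrics_t per possible CPU,
 * which the reader has to aggregate itself:
 *
 *	metrics_t percpu[libbpf_num_possible_cpus()];
 *	__u32 key = 0;
 *
 *	bpf_map_lookup_elem(map_fd, &key, percpu);
 *	// total = sum of percpu[cpu].processed_packets_total over all CPUs
 */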
static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Changing the ethertype if the encapsulated packet is ipv6 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6)
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      encap_headers_t *encap, struct in_addr *next_hop,
			      metrics_t *metrics)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	__u8 encap_buffer[sizeof(encap_gre_t)] = {};
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;
	encap_gre_t *encap_gre;

	metrics->forwarded_packets_total_gre++;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle the split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2 and 4 bytes arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap_gre) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	/* bpf_dynptr_slice_rdwr() may have materialized the headers in the
	 * local buffer instead of returning a pointer into the packet; in
	 * that case the modifications must be written back explicitly.
	 */
	if (encap_gre == encap_buffer)
		bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return bpf_redirect(skb->ifindex, 0);
}
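/* Rough on-wire effect of forward_with_gre(), assuming encap_gre_t stacks
 * eth, IPv4 and GRE headers as used above:
 *
 *	before: eth | IPv4 | UDP | GUE | unigue (+ hop list) | inner packet
 *	after:  eth | IPv4 | GRE | inner packet
 *
 * Since the UDP, GUE and unigue headers are larger than the 4-byte GRE
 * base header, delta is negative and the room adjustment shrinks the
 * packet.
 */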
static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
				 encap_headers_t *encap, struct in_addr *next_hop,
				 metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static ret_t skip_next_hops(__u64 *offset, int n)
{
	switch (n) {
	case 1:
		*offset += sizeof(struct in_addr);
		/* fallthrough */
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
			  struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count)
		return TC_ACT_SHOT;

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
		return TC_ACT_SHOT;

	*offset += sizeof(*next_hop);

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
}
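/* Example GLB header walk with hop_count == 2, next_hop == 1: the first
 * skip_next_hops() call advances past the already-used hop, the dynptr
 * read fetches the second hop address, and the trailing skip_next_hops()
 * has zero entries left to skip, leaving *offset just past the header.
 */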
/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
			   uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
			      uint64_t tuplen, void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);

		if (tuplen == sizeof(tuple->ipv6))
			iphlen = sizeof(struct ipv6hdr);

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
			       uint64_t tuplen, metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}
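/* Verdict mapping for the lookups above: classify_tcp() treats any socket
 * not in LISTEN as an ESTABLISHED flow; for a listener, the packet is
 * only accepted when bpf_tcp_check_syncookie() returns 0, i.e. the ACK
 * carries a valid SYN cookie. classify_udp() requires a connected
 * (ESTABLISHED) socket. UNKNOWN verdicts are forwarded to the next hop
 * by the caller.
 */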
static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
				metrics_t *metrics)
{
	struct icmphdr icmp;
	struct iphdr ipv4;

	if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	*offset += sizeof(icmp);

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO)
		return ECHO_REQUEST;

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4.daddr;
	tuple.ipv4.daddr = ipv4.saddr;

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, ipv4.protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
				metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct ipv6hdr ipv6;
	struct icmp6hdr icmp6;
	bool is_fragment;
	uint8_t l4_proto;

	if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* Advance past the ICMPv6 header before parsing the embedded packet. */
	*offset += sizeof(icmp6);

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct tcphdr tcp;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_tcp++;

	if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	*offset += sizeof(tcp);

	if (tcp.syn)
		return SYN;

	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
}

static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct udphdr udph;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_udp++;

	if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}
	*offset += sizeof(udph);

	tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
	return classify_udp(skb, &tuple, tuplen);
}
static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct iphdr ipv4;
	struct iphdr_info info = {
		.hdr = &ipv4,
		.len = sizeof(ipv4),
	};

	metrics->l3_protocol_packets_total_ipv4++;

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4.version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(&ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4.protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(skb, dynptr, offset, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct ipv6hdr ipv6;
	struct iphdr_info info = {
		.hdr = &ipv6,
		.len = sizeof(ipv6),
	};
	uint8_t l4_proto;
	bool is_fragment;

	metrics->l3_protocol_packets_total_ipv6++;

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6.version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(dynptr, offset, skb, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
	struct bpf_dynptr dynptr;
	struct in_addr next_hop;
	/* Tracks offset of the dynptr. This will be unnecessary once
	 * bpf_dynptr_advance() is available.
	 */
	__u64 off = 0;
	ret_t ret;

	bpf_dynptr_from_skb(skb, 0, &dynptr);

	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL)
		return TC_ACT_SHOT;

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap)))
		return TC_ACT_OK;

	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap)
		return TC_ACT_OK;

	off += sizeof(*encap);

	if (encap->ip.ihl != 5)
		/* We never have any options. */
		return TC_ACT_OK;

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP)
		return TC_ACT_OK;

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT)
		return TC_ACT_OK;
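	/* At this point the outer headers match the expected encapsulation,
	 * which (per encap_headers_t in test_cls_redirect.h) is laid out as:
	 *
	 *	eth | IPv4 | UDP (dest == ENCAPSULATION_PORT) | GUE | unigue
	 *	    | hop_count * struct in_addr | inner packet
	 */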
	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0)
		return TC_ACT_SHOT;

	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(skb, &dynptr, &off, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(skb, &dynptr, &off, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	ret = accept_locally(skb, encap);

	/* As in forward_with_gre(), flush the headers back to the packet if
	 * the dynptr slice was materialized in the local buffer.
	 */
	if (encap == encap_buffer)
		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return ret;
}
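/* Attachment sketch (hypothetical; device and object names depend on the
 * build, and the selftest harness normally loads this program itself):
 *
 *	tc qdisc add dev eth0 clsact
 *	tc filter add dev eth0 ingress bpf direct-action \
 *		obj test_cls_redirect_dynptr.bpf.o sec tc
 */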