1/* 2 * sfe-cm.c 3 * Shortcut forwarding engine connection manager. 4 * 5 * Copyright (c) 2013-2015 The Linux Foundation. All rights reserved. 6 * Permission to use, copy, modify, and/or distribute this software for 7 * any purpose with or without fee is hereby granted, provided that the 8 * above copyright notice and this permission notice appear in all copies. 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 15 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18#include <linux/module.h> 19#include <linux/sysfs.h> 20#include <linux/skbuff.h> 21#include <net/route.h> 22#include <net/ip6_route.h> 23#include <net/addrconf.h> 24#include <net/dsfield.h> 25#include <linux/inetdevice.h> 26#include <linux/netfilter_bridge.h> 27#include <linux/netfilter_ipv6.h> 28#include <net/netfilter/nf_conntrack_acct.h> 29#include <net/netfilter/nf_conntrack_helper.h> 30#include <net/netfilter/nf_conntrack_zones.h> 31#include <net/netfilter/nf_conntrack_core.h> 32#include <linux/netfilter/xt_dscp.h> 33#include <linux/if_bridge.h> 34 35#include "sfe.h" 36#include "sfe_cm.h" 37#include "sfe_backport.h" 38 39typedef enum sfe_cm_exception { 40 SFE_CM_EXCEPTION_PACKET_BROADCAST, 41 SFE_CM_EXCEPTION_PACKET_MULTICAST, 42 SFE_CM_EXCEPTION_NO_IIF, 43 SFE_CM_EXCEPTION_NO_CT, 44 SFE_CM_EXCEPTION_CT_NO_TRACK, 45 SFE_CM_EXCEPTION_CT_NO_CONFIRM, 46 SFE_CM_EXCEPTION_CT_IS_ALG, 47 SFE_CM_EXCEPTION_IS_IPV4_MCAST, 48 SFE_CM_EXCEPTION_IS_IPV6_MCAST, 49 SFE_CM_EXCEPTION_TCP_NOT_ASSURED, 50 SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED, 51 SFE_CM_EXCEPTION_UNKNOW_PROTOCOL, 52 SFE_CM_EXCEPTION_NO_SRC_DEV, 53 SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV, 54 SFE_CM_EXCEPTION_NO_DEST_DEV, 55 SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV, 56 SFE_CM_EXCEPTION_NO_BRIDGE, 57 SFE_CM_EXCEPTION_LOCAL_OUT, 58 SFE_CM_EXCEPTION_MAX 59} sfe_cm_exception_t; 60 61static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = { 62 "PACKET_BROADCAST", 63 "PACKET_MULTICAST", 64 "NO_IIF", 65 "NO_CT", 66 "CT_NO_TRACK", 67 "CT_NO_CONFIRM", 68 "CT_IS_ALG", 69 "IS_IPV4_MCAST", 70 "IS_IPV6_MCAST", 71 "TCP_NOT_ASSURED", 72 "TCP_NOT_ESTABLISHED", 73 "UNKNOW_PROTOCOL", 74 "NO_SRC_DEV", 75 "NO_SRC_XLATE_DEV", 76 "NO_DEST_DEV", 77 "NO_DEST_XLATE_DEV", 78 "NO_BRIDGE", 79 "LOCAL_OUT" 80}; 81 82/* 83 * Per-module structure. 84 */ 85struct sfe_cm { 86 spinlock_t lock; /* Lock for SMP correctness */ 87 88 /* 89 * Control state. 90 */ 91 struct kobject *sys_sfe_cm; /* sysfs linkage */ 92 93 /* 94 * Callback notifiers. 95 */ 96 struct notifier_block dev_notifier; 97 /* Device notifier */ 98 struct notifier_block inet_notifier; 99 /* IPv4 notifier */ 100 struct notifier_block inet6_notifier; 101 /* IPv6 notifier */ 102 uint32_t exceptions[SFE_CM_EXCEPTION_MAX]; 103}; 104 105struct sfe_cm __sc; 106 107/* 108 * Expose the hook for the receive processing. 109 */ 110extern int (*athrs_fast_nat_recv)(struct sk_buff *skb); 111 112/* 113 * Expose what should be a static flag in the TCP connection tracker. 114 */ 115extern int nf_ct_tcp_no_window_check; 116 117/* 118 * sfe_cm_incr_exceptions() 119 * increase an exception counter. 120 */ 121static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except) 122{ 123 struct sfe_cm *sc = &__sc; 124 125 spin_lock_bh(&sc->lock); 126 sc->exceptions[except]++; 127 spin_unlock_bh(&sc->lock); 128} 129 130/* 131 * sfe_cm_recv() 132 * Handle packet receives. 133 * 134 * Returns 1 if the packet is forwarded or 0 if it isn't. 135 */ 136int sfe_cm_recv(struct sk_buff *skb) 137{ 138 struct net_device *dev; 139 140 /* 141 * We know that for the vast majority of packets we need the transport 142 * layer header so we may as well start to fetch it now! 143 */ 144 prefetch(skb->data + 32); 145 barrier(); 146 147 dev = skb->dev; 148 149 /* 150 * We're only interested in IPv4 and IPv6 packets. 151 */ 152 if (likely(htons(ETH_P_IP) == skb->protocol)) { 153#if (SFE_HOOK_ABOVE_BRIDGE) 154 struct in_device *in_dev; 155 156 /* 157 * Does our input device support IP processing? 158 */ 159 in_dev = (struct in_device *)dev->ip_ptr; 160 if (unlikely(!in_dev)) { 161 DEBUG_TRACE("no IP processing for device: %s\n", dev->name); 162 return 0; 163 } 164 165 /* 166 * Does it have an IP address? If it doesn't then we can't do anything 167 * interesting here! 168 */ 169 if (unlikely(!in_dev->ifa_list)) { 170 DEBUG_TRACE("no IP address for device: %s\n", dev->name); 171 return 0; 172 } 173#endif 174 175 return sfe_ipv4_recv(dev, skb); 176 } 177 178 if (likely(htons(ETH_P_IPV6) == skb->protocol)) { 179#if (SFE_HOOK_ABOVE_BRIDGE) 180 struct inet6_dev *in_dev; 181 182 /* 183 * Does our input device support IPv6 processing? 184 */ 185 in_dev = (struct inet6_dev *)dev->ip6_ptr; 186 if (unlikely(!in_dev)) { 187 DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); 188 return 0; 189 } 190 191 /* 192 * Does it have an IPv6 address? If it doesn't then we can't do anything 193 * interesting here! 194 */ 195 if (unlikely(list_empty(&in_dev->addr_list))) { 196 DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); 197 return 0; 198 } 199#endif 200 201 return sfe_ipv6_recv(dev, skb); 202 } 203 204 DEBUG_TRACE("not IP packet\n"); 205 return 0; 206} 207 208/* 209 * sfe_cm_find_dev_and_mac_addr() 210 * Find the device and MAC address for a given IPv4/IPv6 address. 211 * 212 * Returns true if we find the device and MAC address, otherwise false. 213 * 214 * We look up the rtable entry for the address and, from its neighbour 215 * structure, obtain the hardware address. This means this function also 216 * works if the neighbours are routers too. 217 */ 218static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, uint8_t *mac_addr, int is_v4) 219{ 220 struct neighbour *neigh; 221 struct rtable *rt; 222 struct rt6_info *rt6; 223 struct dst_entry *dst; 224 struct net_device *mac_dev; 225 226 /* 227 * Look up the rtable entry for the IP address then get the hardware 228 * address from its neighbour structure. This means this work when the 229 * neighbours are routers too. 230 */ 231 if (likely(is_v4)) { 232 rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); 233 if (unlikely(IS_ERR(rt))) { 234 goto ret_fail; 235 } 236 237 dst = (struct dst_entry *)rt; 238 } else { 239 rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); 240 if (!rt6) { 241 goto ret_fail; 242 } 243 244 dst = (struct dst_entry *)rt6; 245 } 246 247 rcu_read_lock(); 248 neigh = dst_neigh_lookup(dst, addr); 249 if (unlikely(!neigh)) { 250 rcu_read_unlock(); 251 dst_release(dst); 252 goto ret_fail; 253 } 254 255 if (unlikely(!(neigh->nud_state & NUD_VALID))) { 256 rcu_read_unlock(); 257 neigh_release(neigh); 258 dst_release(dst); 259 goto ret_fail; 260 } 261 262 mac_dev = neigh->dev; 263 if (!mac_dev) { 264 rcu_read_unlock(); 265 neigh_release(neigh); 266 dst_release(dst); 267 goto ret_fail; 268 } 269 270 memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); 271 272 dev_hold(mac_dev); 273 *dev = mac_dev; 274 rcu_read_unlock(); 275 neigh_release(neigh); 276 dst_release(dst); 277 278 return true; 279 280ret_fail: 281 if (is_v4) { 282 DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip); 283 284 } else { 285 DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6); 286 } 287 288 return false; 289} 290 291/* 292 * sfe_cm_post_routing() 293 * Called for packets about to leave the box - either locally generated or forwarded from another interface 294 */ 295static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4) 296{ 297 struct sfe_connection_create sic; 298 struct net_device *in; 299 struct nf_conn *ct; 300 enum ip_conntrack_info ctinfo; 301 struct net_device *dev; 302 struct net_device *src_dev; 303 struct net_device *dest_dev; 304 struct net_device *src_br_dev = NULL; 305 struct net_device *dest_br_dev = NULL; 306 struct nf_conntrack_tuple orig_tuple; 307 struct nf_conntrack_tuple reply_tuple; 308 309 /* 310 * Don't process broadcast or multicast packets. 311 */ 312 if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { 313 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST); 314 DEBUG_TRACE("broadcast, ignoring\n"); 315 return NF_ACCEPT; 316 } 317 if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { 318 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST); 319 DEBUG_TRACE("multicast, ignoring\n"); 320 return NF_ACCEPT; 321 } 322 323#ifdef CONFIG_XFRM 324 /* 325 * Packet to xfrm for encapsulation, we can't process it 326 */ 327 if (unlikely(skb_dst(skb)->xfrm)) { 328 DEBUG_TRACE("packet to xfrm, ignoring\n"); 329 return NF_ACCEPT; 330 } 331#endif 332 333 /* 334 * Don't process locally generated packets. 335 */ 336 if (skb->sk) { 337 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT); 338 DEBUG_TRACE("skip local out packet\n"); 339 return NF_ACCEPT; 340 } 341 342 /* 343 * Don't process packets that are not being forwarded. 344 */ 345 in = dev_get_by_index(&init_net, skb->skb_iif); 346 if (!in) { 347 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF); 348 DEBUG_TRACE("packet not forwarding\n"); 349 return NF_ACCEPT; 350 } 351 352 dev_put(in); 353 354 /* 355 * Don't process packets that aren't being tracked by conntrack. 356 */ 357 ct = nf_ct_get(skb, &ctinfo); 358 if (unlikely(!ct)) { 359 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT); 360 DEBUG_TRACE("no conntrack connection, ignoring\n"); 361 return NF_ACCEPT; 362 } 363 364 /* 365 * Don't process untracked connections. 366 */ 367 if (unlikely(ct == &nf_conntrack_untracked)) { 368 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK); 369 DEBUG_TRACE("untracked connection\n"); 370 return NF_ACCEPT; 371 } 372 373 /* 374 * Unconfirmed connection may be dropped by Linux at the final step, 375 * So we don't process unconfirmed connections. 376 */ 377 if (!nf_ct_is_confirmed(ct)) { 378 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM); 379 DEBUG_TRACE("unconfirmed connection\n"); 380 return NF_ACCEPT; 381 } 382 383 /* 384 * Don't process connections that require support from a 'helper' (typically a NAT ALG). 385 */ 386 if (unlikely(nfct_help(ct))) { 387 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG); 388 DEBUG_TRACE("connection has helper\n"); 389 return NF_ACCEPT; 390 } 391 392 /* 393 * Look up the details of our connection in conntrack. 394 * 395 * Note that the data we get from conntrack is for the "ORIGINAL" direction 396 * but our packet may actually be in the "REPLY" direction. 397 */ 398 orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 399 reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; 400 sic.protocol = (int32_t)orig_tuple.dst.protonum; 401 402 sic.flags = 0; 403 404 /* 405 * Get addressing information, non-NAT first 406 */ 407 if (likely(is_v4)) { 408 uint32_t dscp; 409 410 sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; 411 sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; 412 413 if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { 414 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST); 415 DEBUG_TRACE("multicast address\n"); 416 return NF_ACCEPT; 417 } 418 419 /* 420 * NAT'ed addresses - note these are as seen from the 'reply' direction 421 * When NAT does not apply to this connection these will be identical to the above. 422 */ 423 sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; 424 sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; 425 426 dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; 427 if (dscp) { 428 sic.src_dscp = sic.dest_dscp = dscp; 429 sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; 430 } 431 } else { 432 uint32_t dscp; 433 434 sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); 435 sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); 436 437 if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || 438 ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { 439 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST); 440 DEBUG_TRACE("multicast address\n"); 441 return NF_ACCEPT; 442 } 443 444 /* 445 * NAT'ed addresses - note these are as seen from the 'reply' direction 446 * When NAT does not apply to this connection these will be identical to the above. 447 */ 448 sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); 449 sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); 450 451 dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; 452 if (dscp) { 453 sic.src_dscp = sic.dest_dscp = dscp; 454 sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; 455 } 456 } 457 458 switch (sic.protocol) { 459 case IPPROTO_TCP: 460 sic.src_port = orig_tuple.src.u.tcp.port; 461 sic.dest_port = orig_tuple.dst.u.tcp.port; 462 sic.src_port_xlate = reply_tuple.dst.u.tcp.port; 463 sic.dest_port_xlate = reply_tuple.src.u.tcp.port; 464 sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale; 465 sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; 466 sic.src_td_end = ct->proto.tcp.seen[0].td_end; 467 sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend; 468 sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; 469 sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; 470 sic.dest_td_end = ct->proto.tcp.seen[1].td_end; 471 sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; 472 if (nf_ct_tcp_no_window_check 473 || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) 474 || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { 475 sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; 476 } 477 478 /* 479 * Don't try to manage a non-established connection. 480 */ 481 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { 482 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED); 483 DEBUG_TRACE("non-established connection\n"); 484 return NF_ACCEPT; 485 } 486 487 /* 488 * If the connection is shutting down do not manage it. 489 * state can not be SYN_SENT, SYN_RECV because connection is assured 490 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. 491 */ 492 spin_lock_bh(&ct->lock); 493 if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { 494 spin_unlock_bh(&ct->lock); 495 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED); 496 DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", 497 ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port), 498 &sic.dest_ip, ntohs(sic.dest_port)); 499 return NF_ACCEPT; 500 } 501 spin_unlock_bh(&ct->lock); 502 break; 503 504 case IPPROTO_UDP: 505 sic.src_port = orig_tuple.src.u.udp.port; 506 sic.dest_port = orig_tuple.dst.u.udp.port; 507 sic.src_port_xlate = reply_tuple.dst.u.udp.port; 508 sic.dest_port_xlate = reply_tuple.src.u.udp.port; 509 break; 510 511 default: 512 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL); 513 DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); 514 return NF_ACCEPT; 515 } 516 517#ifdef CONFIG_XFRM 518 sic.original_accel = 1; 519 sic.reply_accel = 1; 520 521 /* 522 * For packets de-capsulated from xfrm, we still can accelerate it 523 * on the direction we just received the packet. 524 */ 525 if (unlikely(skb->sp)) { 526 if (sic.protocol == IPPROTO_TCP && 527 !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) { 528 return NF_ACCEPT; 529 } 530 531 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { 532 sic.reply_accel = 0; 533 } else { 534 sic.original_accel = 0; 535 } 536 } 537#endif 538 539 /* 540 * Get QoS information 541 */ 542 if (skb->priority) { 543 sic.src_priority = sic.dest_priority = skb->priority; 544 sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; 545 } 546 547 /* 548 * Get the net device and MAC addresses that correspond to the various source and 549 * destination host addresses. 550 */ 551 if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) { 552 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV); 553 return NF_ACCEPT; 554 } 555 556 if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { 557 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV); 558 goto done1; 559 } 560 561 dev_put(dev); 562 563 if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) { 564 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV); 565 goto done1; 566 } 567 568 dev_put(dev); 569 570 if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) { 571 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV); 572 goto done1; 573 } 574 575#if (!SFE_HOOK_ABOVE_BRIDGE) 576 /* 577 * Now our devices may actually be a bridge interface. If that's 578 * the case then we need to hunt down the underlying interface. 579 */ 580 if (src_dev->priv_flags & IFF_EBRIDGE) { 581 src_br_dev = br_port_dev_get(src_dev, sic.src_mac); 582 if (!src_br_dev) { 583 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); 584 DEBUG_TRACE("no port found on bridge\n"); 585 goto done2; 586 } 587 588 src_dev = src_br_dev; 589 } 590 591 if (dest_dev->priv_flags & IFF_EBRIDGE) { 592 dest_br_dev = br_port_dev_get(dest_dev, sic.dest_mac_xlate); 593 if (!dest_br_dev) { 594 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); 595 DEBUG_TRACE("no port found on bridge\n"); 596 goto done3; 597 } 598 599 dest_dev = dest_br_dev; 600 } 601#else 602 /* 603 * Our devices may actually be part of a bridge interface. If that's 604 * the case then find the bridge interface instead. 605 */ 606 if (src_dev->priv_flags & IFF_BRIDGE_PORT) { 607 src_br_dev = sfe_dev_get_master(src_dev); 608 if (!src_br_dev) { 609 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); 610 DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); 611 goto done2; 612 } 613 614 src_dev = src_br_dev; 615 } 616 617 if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { 618 dest_br_dev = sfe_dev_get_master(dest_dev); 619 if (!dest_br_dev) { 620 sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); 621 DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); 622 goto done3; 623 } 624 625 dest_dev = dest_br_dev; 626 } 627#endif 628 629 sic.src_dev = src_dev; 630 sic.dest_dev = dest_dev; 631 632 sic.src_mtu = src_dev->mtu; 633 sic.dest_mtu = dest_dev->mtu; 634 635 if (likely(is_v4)) { 636 sfe_ipv4_create_rule(&sic); 637 } else { 638 sfe_ipv6_create_rule(&sic); 639 } 640 641 /* 642 * If we had bridge ports then release them too. 643 */ 644 if (dest_br_dev) { 645 dev_put(dest_br_dev); 646 } 647 648done3: 649 if (src_br_dev) { 650 dev_put(src_br_dev); 651 } 652 653done2: 654 dev_put(dest_dev); 655 656done1: 657 dev_put(src_dev); 658 659 return NF_ACCEPT; 660} 661 662/* 663 * sfe_cm_ipv4_post_routing_hook() 664 * Called for packets about to leave the box - either locally generated or forwarded from another interface 665 */ 666sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) 667{ 668 return sfe_cm_post_routing(skb, true); 669} 670 671/* 672 * sfe_cm_ipv6_post_routing_hook() 673 * Called for packets about to leave the box - either locally generated or forwarded from another interface 674 */ 675sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) 676{ 677 return sfe_cm_post_routing(skb, false); 678} 679 680 681#ifdef CONFIG_NF_CONNTRACK_EVENTS 682/* 683 * sfe_cm_conntrack_event() 684 * Callback event invoked when a conntrack connection's state changes. 685 */ 686#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS 687static int sfe_cm_conntrack_event(struct notifier_block *this, 688 unsigned long events, void *ptr) 689#else 690static int sfe_cm_conntrack_event(unsigned int events, struct nf_ct_event *item) 691#endif 692{ 693#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS 694 struct nf_ct_event *item = ptr; 695#endif 696 struct sfe_connection_destroy sid; 697 struct nf_conn *ct = item->ct; 698 struct nf_conntrack_tuple orig_tuple; 699 700 /* 701 * If we don't have a conntrack entry then we're done. 702 */ 703 if (unlikely(!ct)) { 704 DEBUG_WARN("no ct in conntrack event callback\n"); 705 return NOTIFY_DONE; 706 } 707 708 /* 709 * If this is an untracked connection then we can't have any state either. 710 */ 711 if (unlikely(ct == &nf_conntrack_untracked)) { 712 DEBUG_TRACE("ignoring untracked conn\n"); 713 return NOTIFY_DONE; 714 } 715 716 /* 717 * We're only interested in destroy events. 718 */ 719 if (unlikely(!(events & (1 << IPCT_DESTROY)))) { 720 DEBUG_TRACE("ignoring non-destroy event\n"); 721 return NOTIFY_DONE; 722 } 723 724 orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 725 sid.protocol = (int32_t)orig_tuple.dst.protonum; 726 727 /* 728 * Extract information from the conntrack connection. We're only interested 729 * in nominal connection information (i.e. we're ignoring any NAT information). 730 */ 731 switch (sid.protocol) { 732 case IPPROTO_TCP: 733 sid.src_port = orig_tuple.src.u.tcp.port; 734 sid.dest_port = orig_tuple.dst.u.tcp.port; 735 break; 736 737 case IPPROTO_UDP: 738 sid.src_port = orig_tuple.src.u.udp.port; 739 sid.dest_port = orig_tuple.dst.u.udp.port; 740 break; 741 742 default: 743 DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); 744 return NOTIFY_DONE; 745 } 746 747 if (likely(nf_ct_l3num(ct) == AF_INET)) { 748 sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; 749 sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; 750 751 sfe_ipv4_destroy_rule(&sid); 752 } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { 753 sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); 754 sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); 755 756 sfe_ipv6_destroy_rule(&sid); 757 } else { 758 DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); 759 } 760 761 return NOTIFY_DONE; 762} 763 764/* 765 * Netfilter conntrack event system to monitor connection tracking changes 766 */ 767#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS 768static struct notifier_block sfe_cm_conntrack_notifier = { 769 .notifier_call = sfe_cm_conntrack_event, 770}; 771#else 772static struct nf_ct_event_notifier sfe_cm_conntrack_notifier = { 773 .fcn = sfe_cm_conntrack_event, 774}; 775#endif 776#endif 777 778/* 779 * Structure to establish a hook into the post routing netfilter point - this 780 * will pick up local outbound and packets going from one interface to another. 781 * 782 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. 783 * We want to examine packets after NAT translation and any ALG processing. 784 */ 785static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = { 786 { 787 .hook = __sfe_cm_ipv4_post_routing_hook, 788 .owner = THIS_MODULE, 789 .pf = NFPROTO_IPV4, 790 .hooknum = NF_INET_POST_ROUTING, 791 .priority = NF_IP_PRI_NAT_SRC + 1, 792 }, 793#ifdef SFE_SUPPORT_IPV6 794 { 795 .hook = __sfe_cm_ipv6_post_routing_hook, 796 .owner = THIS_MODULE, 797 .pf = NFPROTO_IPV6, 798 .hooknum = NF_INET_POST_ROUTING, 799 .priority = NF_IP6_PRI_NAT_SRC + 1, 800 }, 801#endif 802}; 803 804/* 805 * sfe_cm_sync_rule() 806 * Synchronize a connection's state. 807 */ 808static void sfe_cm_sync_rule(struct sfe_connection_sync *sis) 809{ 810 struct nf_conntrack_tuple_hash *h; 811 struct nf_conntrack_tuple tuple; 812 struct nf_conn *ct; 813 SFE_NF_CONN_ACCT(acct); 814 815 /* 816 * Create a tuple so as to be able to look up a connection 817 */ 818 memset(&tuple, 0, sizeof(tuple)); 819 tuple.src.u.all = (__be16)sis->src_port; 820 tuple.dst.dir = IP_CT_DIR_ORIGINAL; 821 tuple.dst.protonum = (uint8_t)sis->protocol; 822 tuple.dst.u.all = (__be16)sis->dest_port; 823 824 if (sis->is_v6) { 825 tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); 826 tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); 827 tuple.src.l3num = AF_INET6; 828 829 DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", 830 (int)tuple.dst.protonum, 831 &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), 832 &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); 833 } else { 834 tuple.src.u3.ip = sis->src_ip.ip; 835 tuple.dst.u3.ip = sis->dest_ip.ip; 836 tuple.src.l3num = AF_INET; 837 838 DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", 839 (int)tuple.dst.protonum, 840 &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), 841 &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); 842 } 843 844 /* 845 * Look up conntrack connection 846 */ 847 h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); 848 if (unlikely(!h)) { 849 DEBUG_TRACE("no connection found\n"); 850 return; 851 } 852 853 ct = nf_ct_tuplehash_to_ctrack(h); 854 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); 855 856 /* 857 * Only update if this is not a fixed timeout 858 */ 859 if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { 860 spin_lock_bh(&ct->lock); 861 ct->timeout.expires += sis->delta_jiffies; 862 spin_unlock_bh(&ct->lock); 863 } 864 865 acct = nf_conn_acct_find(ct); 866 if (acct) { 867 spin_lock_bh(&ct->lock); 868 atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets, sis->src_packet_count); 869 atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes, sis->src_byte_count); 870 atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets, sis->dest_packet_count); 871 atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes, sis->dest_byte_count); 872 spin_unlock_bh(&ct->lock); 873 } 874 875 switch (sis->protocol) { 876 case IPPROTO_TCP: 877 spin_lock_bh(&ct->lock); 878 if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { 879 ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; 880 } 881 if ((int32_t)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { 882 ct->proto.tcp.seen[0].td_end = sis->src_td_end; 883 } 884 if ((int32_t)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { 885 ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; 886 } 887 if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { 888 ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; 889 } 890 if ((int32_t)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { 891 ct->proto.tcp.seen[1].td_end = sis->dest_td_end; 892 } 893 if ((int32_t)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { 894 ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; 895 } 896 spin_unlock_bh(&ct->lock); 897 break; 898 } 899 900 /* 901 * Release connection 902 */ 903 nf_ct_put(ct); 904} 905 906/* 907 * sfe_cm_device_event() 908 */ 909int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr) 910{ 911 struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); 912 913 switch (event) { 914 case NETDEV_DOWN: 915 if (dev) { 916 sfe_ipv4_destroy_all_rules_for_dev(dev); 917 sfe_ipv6_destroy_all_rules_for_dev(dev); 918 } 919 break; 920 } 921 922 return NOTIFY_DONE; 923} 924 925/* 926 * sfe_cm_inet_event() 927 */ 928static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr) 929{ 930 struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; 931 return sfe_cm_propagate_event(this, event, dev); 932} 933 934/* 935 * sfe_cm_inet6_event() 936 */ 937static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) 938{ 939 struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; 940 return sfe_cm_propagate_event(this, event, dev); 941} 942 943/* 944 * sfe_cm_get_exceptions 945 * dump exception counters 946 */ 947static ssize_t sfe_cm_get_exceptions(struct device *dev, 948 struct device_attribute *attr, 949 char *buf) 950{ 951 int idx, len; 952 struct sfe_cm *sc = &__sc; 953 954 spin_lock_bh(&sc->lock); 955 for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) { 956 if (sc->exceptions[idx]) { 957 len += sprintf(buf + len, "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]); 958 } 959 } 960 spin_unlock_bh(&sc->lock); 961 962 return len; 963} 964 965/* 966 * sysfs attributes. 967 */ 968static const struct device_attribute sfe_cm_exceptions_attr = 969 __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL); 970 971/* 972 * sfe_cm_init() 973 */ 974static int __init sfe_cm_init(void) 975{ 976 struct sfe_cm *sc = &__sc; 977 int result = -1; 978 979 DEBUG_INFO("SFE CM init\n"); 980 981 /* 982 * Create sys/sfe_cm 983 */ 984 sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL); 985 if (!sc->sys_sfe_cm) { 986 DEBUG_ERROR("failed to register sfe_cm\n"); 987 goto exit1; 988 } 989 990 /* 991 * Create sys/sfe_cm/exceptions 992 */ 993 result = sysfs_create_file(sc->sys_sfe_cm, &sfe_cm_exceptions_attr.attr); 994 if (result) { 995 DEBUG_ERROR("failed to register exceptions file: %d\n", result); 996 goto exit2; 997 } 998 999 sc->dev_notifier.notifier_call = sfe_cm_device_event; 1000 sc->dev_notifier.priority = 1; 1001 register_netdevice_notifier(&sc->dev_notifier); 1002 1003 sc->inet_notifier.notifier_call = sfe_cm_inet_event; 1004 sc->inet_notifier.priority = 1; 1005 register_inetaddr_notifier(&sc->inet_notifier); 1006 1007 sc->inet6_notifier.notifier_call = sfe_cm_inet6_event; 1008 sc->inet6_notifier.priority = 1; 1009 register_inet6addr_notifier(&sc->inet6_notifier); 1010 /* 1011 * Register our netfilter hooks. 1012 */ 1013 result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); 1014 if (result < 0) { 1015 DEBUG_ERROR("can't register nf post routing hook: %d\n", result); 1016 goto exit3; 1017 } 1018 1019#ifdef CONFIG_NF_CONNTRACK_EVENTS 1020 /* 1021 * Register a notifier hook to get fast notifications of expired connections. 1022 */ 1023 result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); 1024 if (result < 0) { 1025 DEBUG_ERROR("can't register nf notifier hook: %d\n", result); 1026 goto exit4; 1027 } 1028#endif 1029 1030 spin_lock_init(&sc->lock); 1031 1032 /* 1033 * Hook the receive path in the network stack. 1034 */ 1035 BUG_ON(athrs_fast_nat_recv != NULL); 1036 RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_cm_recv); 1037 1038 /* 1039 * Hook the shortcut sync callback. 1040 */ 1041 sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule); 1042 sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule); 1043 return 0; 1044 1045#ifdef CONFIG_NF_CONNTRACK_EVENTS 1046exit4: 1047#endif 1048 nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); 1049 1050exit3: 1051 unregister_inet6addr_notifier(&sc->inet6_notifier); 1052 unregister_inetaddr_notifier(&sc->inet_notifier); 1053 unregister_netdevice_notifier(&sc->dev_notifier); 1054exit2: 1055 kobject_put(sc->sys_sfe_cm); 1056 1057exit1: 1058 return result; 1059} 1060 1061/* 1062 * sfe_cm_exit() 1063 */ 1064static void __exit sfe_cm_exit(void) 1065{ 1066 struct sfe_cm *sc = &__sc; 1067 1068 DEBUG_INFO("SFE CM exit\n"); 1069 1070 /* 1071 * Unregister our sync callback. 1072 */ 1073 sfe_ipv4_register_sync_rule_callback(NULL); 1074 sfe_ipv6_register_sync_rule_callback(NULL); 1075 1076 /* 1077 * Unregister our receive callback. 1078 */ 1079 RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); 1080 1081 /* 1082 * Wait for all callbacks to complete. 1083 */ 1084 rcu_barrier(); 1085 1086 /* 1087 * Destroy all connections. 1088 */ 1089 sfe_ipv4_destroy_all_rules_for_dev(NULL); 1090 sfe_ipv6_destroy_all_rules_for_dev(NULL); 1091 1092#ifdef CONFIG_NF_CONNTRACK_EVENTS 1093 nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier); 1094 1095#endif 1096 nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); 1097 1098 unregister_inet6addr_notifier(&sc->inet6_notifier); 1099 unregister_inetaddr_notifier(&sc->inet_notifier); 1100 unregister_netdevice_notifier(&sc->dev_notifier); 1101 1102 kobject_put(sc->sys_sfe_cm); 1103} 1104 1105module_init(sfe_cm_init) 1106module_exit(sfe_cm_exit) 1107 1108MODULE_AUTHOR("Qualcomm Atheros Inc."); 1109MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); 1110MODULE_LICENSE("Dual BSD/GPL"); 1111 1112