1/* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */ 2/* Connection state tracking for netfilter. This is separated from, 3 but required by, the NAT layer; it can also be used by an iptables 4 extension. */ 5 6/* (C) 1999-2001 Paul `Rusty' Russell 7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU General Public License version 2 as 12 * published by the Free Software Foundation. 13 */ 14 15#include <linux/types.h> 16#include <linux/netfilter.h> 17#include <linux/module.h> 18#include <linux/sched.h> 19#include <linux/skbuff.h> 20#include <linux/proc_fs.h> 21#include <linux/vmalloc.h> 22#include <linux/stddef.h> 23#include <linux/slab.h> 24#include <linux/random.h> 25#include <linux/jhash.h> 26#include <linux/err.h> 27#include <linux/percpu.h> 28#include <linux/moduleparam.h> 29#include <linux/notifier.h> 30#include <linux/kernel.h> 31#include <linux/netdevice.h> 32#include <linux/socket.h> 33#include <linux/mm.h> 34#include <linux/nsproxy.h> 35#include <linux/rculist_nulls.h> 36 37#include <net/netfilter/nf_conntrack.h> 38#include <net/netfilter/nf_conntrack_l3proto.h> 39#include <net/netfilter/nf_conntrack_l4proto.h> 40#include <net/netfilter/nf_conntrack_expect.h> 41#include <net/netfilter/nf_conntrack_helper.h> 42#include <net/netfilter/nf_conntrack_core.h> 43#include <net/netfilter/nf_conntrack_extend.h> 44#include <net/netfilter/nf_conntrack_acct.h> 45#include <net/netfilter/nf_conntrack_ecache.h> 46#include <net/netfilter/nf_conntrack_zones.h> 47#include <net/netfilter/nf_nat.h> 48#include <net/netfilter/nf_nat_core.h> 49 50#define NF_CONNTRACK_VERSION "0.5.0" 51 52#ifdef HNDCTF 53#include <linux/if.h> 54#include <linux/if_vlan.h> 55#if defined(CTF_PPTP) || defined(CTF_L2TP) 56#include <linux/if_pppox.h> 57#endif 58#include <linux/in.h> 59#include <linux/ip.h> 60#include <linux/tcp.h> 61 62#ifdef CONFIG_IPV6 63#include <linux/ipv6.h> 64#include <net/ipv6.h> 65#include <net/ip6_route.h> 66#define IPVERSION_IS_4(ipver) ((ipver) == 4) 67#else 68#define IPVERSION_IS_4(ipver) 1 69#endif /* CONFIG_IPV6 */ 70 71#include <net/ip.h> 72#include <net/route.h> 73#include <typedefs.h> 74#include <osl.h> 75#include <ctf/hndctf.h> 76#include <ctf/ctf_cfg.h> 77 78#define NFC_CTF_ENABLED (1 << 31) 79#else 80#define BCMFASTPATH_HOST 81#endif /* HNDCTF */ 82 83int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, 84 enum nf_nat_manip_type manip, 85 const struct nlattr *attr) __read_mostly; 86EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); 87 88DEFINE_SPINLOCK(nf_conntrack_lock); 89EXPORT_SYMBOL_GPL(nf_conntrack_lock); 90 91unsigned int nf_conntrack_htable_size __read_mostly; 92EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 93 94unsigned int nf_conntrack_max __read_mostly; 95EXPORT_SYMBOL_GPL(nf_conntrack_max); 96 97DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); 98EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); 99 100#ifdef HNDCTF 101/* 102 * Display an IP address in readable format. 
103 */ 104/* Returns the number of 1-bits in x */ 105static int 106_popcounts(uint32 x) 107{ 108 x = x - ((x >> 1) & 0x55555555); 109 x = ((x >> 2) & 0x33333333) + (x & 0x33333333); 110 x = (x + (x >> 4)) & 0x0F0F0F0F; 111 x = (x + (x >> 16)); 112 return (x + (x >> 8)) & 0x0000003F; 113} 114bool 115ip_conntrack_is_ipc_allowed(struct sk_buff *skb, u_int32_t hooknum) 116{ 117 struct net_device *dev; 118 119 if (!CTF_ENAB(kcih)) 120 return FALSE; 121 122 if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_POST_ROUTING) { 123 dev = skb->dev; 124 if (dev->priv_flags & IFF_802_1Q_VLAN) 125 dev = vlan_dev_real_dev(dev); 126 127 /* Add ipc entry if packet is received on ctf enabled interface 128 * and the packet is not a defrag'd one. 129 */ 130 if (ctf_isenabled(kcih, dev) && (skb->len <= dev->mtu)) 131 skb->nfcache |= NFC_CTF_ENABLED; 132 } 133 134 /* Add the cache entries only if the device has registered and 135 * enabled ctf. 136 */ 137 if (skb->nfcache & NFC_CTF_ENABLED) 138 return TRUE; 139 140 return FALSE; 141} 142 143void 144ip_conntrack_ipct_add(struct sk_buff *skb, u_int32_t hooknum, 145 struct nf_conn *ct, enum ip_conntrack_info ci, 146 struct nf_conntrack_tuple *manip) 147{ 148 ctf_ipc_t ipc_entry; 149 struct hh_cache *hh; 150 struct ethhdr *eth; 151 struct iphdr *iph; 152 struct tcphdr *tcph; 153 struct rtable *rt; 154 struct nf_conn_help *help; 155 enum ip_conntrack_dir dir; 156 uint8 ipver, protocol; 157#ifdef CONFIG_IPV6 158 struct ipv6hdr *ip6h = NULL; 159#endif /* CONFIG_IPV6 */ 160 uint32 nud_flags; 161 162 if ((skb == NULL) || (ct == NULL)) 163 return; 164 165 /* Check CTF enabled */ 166 if (!ip_conntrack_is_ipc_allowed(skb, hooknum)) 167 return; 168 /* We only add cache entires for non-helper connections and at 169 * pre or post routing hooks. 170 */ 171 help = nfct_help(ct); 172 if ((help && help->helper) || (ct->ctf_flags & CTF_FLAGS_EXCLUDED) || 173 ((hooknum != NF_INET_PRE_ROUTING) && (hooknum != NF_INET_POST_ROUTING))) 174 return; 175 176 iph = ip_hdr(skb); 177 ipver = iph->version; 178 179 /* Support both IPv4 and IPv6 */ 180 if (ipver == 4) { 181 tcph = ((struct tcphdr *)(((__u8 *)iph) + (iph->ihl << 2))); 182 protocol = iph->protocol; 183 } 184#ifdef CONFIG_IPV6 185 else if (ipver == 6) { 186 ip6h = (struct ipv6hdr *)iph; 187 tcph = (struct tcphdr *)ctf_ipc_lkup_l4proto(kcih, ip6h, &protocol); 188 if (tcph == NULL) 189 return; 190 } 191#endif /* CONFIG_IPV6 */ 192 else 193 return; 194 195 /* Only TCP and UDP are supported */ 196 if (protocol == IPPROTO_TCP) { 197 /* Add ipc entries for connections in established state only */ 198 if ((ci != IP_CT_ESTABLISHED) && (ci != (IP_CT_ESTABLISHED+IP_CT_IS_REPLY))) 199 return; 200 201 if (ct->proto.tcp.state >= TCP_CONNTRACK_FIN_WAIT && 202 ct->proto.tcp.state <= TCP_CONNTRACK_TIME_WAIT) 203 return; 204 } 205 else if (protocol != IPPROTO_UDP) 206 return; 207 208 dir = CTINFO2DIR(ci); 209 if (ct->ctf_flags & (1 << dir)) 210 return; 211 212 /* Do route lookup for alias address if we are doing DNAT in this 213 * direction. 
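	 * An illustrative case (documentation addresses only): a WAN packet
	 * sent to 203.0.113.1:8080 and DNAT'ed to 192.168.1.10:80 may still
	 * carry the public address in iph->daddr here, so the lookup below
	 * is keyed on manip->dst.u3.ip (the post-NAT 192.168.1.10) and the
	 * resulting dst selects the LAN egress device for the ipc entry.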
214 */ 215 if (skb_dst(skb) == NULL) { 216 /* Find the destination interface */ 217 if (IPVERSION_IS_4(ipver)) { 218 u_int32_t daddr; 219 220 if ((manip != NULL) && (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST)) 221 daddr = manip->dst.u3.ip; 222 else 223 daddr = iph->daddr; 224 ip_route_input(skb, daddr, iph->saddr, iph->tos, skb->dev); 225 } 226#ifdef CONFIG_IPV6 227 else 228 ip6_route_input(skb); 229#endif /* CONFIG_IPV6 */ 230 } 231 232 /* Ensure the packet belongs to a forwarding connection and it is 233 * destined to an unicast address. 234 */ 235 rt = (struct rtable *)skb_dst(skb); 236 237 nud_flags = NUD_PERMANENT | NUD_REACHABLE | NUD_STALE | NUD_DELAY | NUD_PROBE; 238#if defined(CTF_PPPOE) || defined(CTF_PPTP) || defined(CTF_L2TP) 239 if ((skb_dst(skb) != NULL) && (skb_dst(skb)->dev != NULL) && 240 (skb_dst(skb)->dev->flags & IFF_POINTOPOINT)) 241 nud_flags |= NUD_NOARP; 242#endif /* CTF_PPPOE | CTF_PPTP | CTF_L2TP */ 243 244 if ((rt == NULL) || ( 245#ifdef CONFIG_IPV6 246 !IPVERSION_IS_4(ipver) ? 247 ((rt->dst.input != ip6_forward) || 248 !(ipv6_addr_type(&ip6h->daddr) & IPV6_ADDR_UNICAST)) : 249#endif /* CONFIG_IPV6 */ 250 ((rt->dst.input != ip_forward) || (rt->rt_type != RTN_UNICAST))) || 251 (rt->dst.neighbour == NULL) || 252 ((rt->dst.neighbour->nud_state & nud_flags) == 0)) 253 return; 254 255 memset(&ipc_entry, 0, sizeof(ipc_entry)); 256 257 /* Init the neighboring sender address */ 258 memcpy(ipc_entry.sa.octet, eth_hdr(skb)->h_source, ETH_ALEN); 259 260 /* If the packet is received on a bridge device then save 261 * the bridge cache entry pointer in the ip cache entry. 262 * This will be referenced in the data path to update the 263 * live counter of brc entry whenever a received packet 264 * matches corresponding ipc entry matches. 265 */ 266 if ((skb->dev != NULL) && ctf_isbridge(kcih, skb->dev)) { 267 ipc_entry.brcp = ctf_brc_lkup(kcih, eth_hdr(skb)->h_source); 268 if (ipc_entry.brcp != NULL) 269 ctf_brc_release(kcih, ipc_entry.brcp); 270 } 271 272 hh = skb_dst(skb)->hh; 273 if (hh != NULL) { 274 eth = (struct ethhdr *)(((unsigned char *)hh->hh_data) + 2); 275 memcpy(ipc_entry.dhost.octet, eth->h_dest, ETH_ALEN); 276 memcpy(ipc_entry.shost.octet, eth->h_source, ETH_ALEN); 277 } else { 278 memcpy(ipc_entry.dhost.octet, rt->dst.neighbour->ha, ETH_ALEN); 279 memcpy(ipc_entry.shost.octet, skb_dst(skb)->dev->dev_addr, ETH_ALEN); 280 } 281 282 /* Add ctf ipc entry for this direction */ 283 if (IPVERSION_IS_4(ipver)) { 284 ipc_entry.tuple.sip[0] = iph->saddr; 285 ipc_entry.tuple.dip[0] = iph->daddr; 286#ifdef CONFIG_IPV6 287 } else { 288 memcpy(ipc_entry.tuple.sip, &ip6h->saddr, sizeof(ipc_entry.tuple.sip)); 289 memcpy(ipc_entry.tuple.dip, &ip6h->daddr, sizeof(ipc_entry.tuple.dip)); 290#endif /* CONFIG_IPV6 */ 291 } 292 ipc_entry.tuple.proto = protocol; 293 ipc_entry.tuple.sp = tcph->source; 294 ipc_entry.tuple.dp = tcph->dest; 295 296 ipc_entry.next = NULL; 297 298 /* For vlan interfaces fill the vlan id and the tag/untag actions */ 299 if (skb_dst(skb)->dev->priv_flags & IFF_802_1Q_VLAN) { 300 ipc_entry.txif = (void *)vlan_dev_real_dev(skb_dst(skb)->dev); 301 ipc_entry.vid = vlan_dev_vlan_id(skb_dst(skb)->dev); 302 ipc_entry.action = ((vlan_dev_vlan_flags(skb_dst(skb)->dev) & 1) ? 
303 CTF_ACTION_TAG : CTF_ACTION_UNTAG); 304 } else { 305 ipc_entry.txif = skb_dst(skb)->dev; 306 ipc_entry.action = CTF_ACTION_UNTAG; 307 } 308 309#if defined(CTF_PPTP) || defined(CTF_L2TP) 310 if (((skb_dst(skb)->dev->flags & IFF_POINTOPOINT) || (skb->dev->flags & IFF_POINTOPOINT) )) { 311 int pppunit = 0; 312 struct net_device *pppox_tx_dev=NULL; 313 ctf_ppp_t ctfppp; 314 315 /* For pppoe interfaces fill the session id and header add/del actions */ 316 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT) { 317 /* Transmit interface and sid will be populated by pppoe module */ 318 ipc_entry.ppp_ifp = skb_dst(skb)->dev; 319 } else if (skb->dev->flags & IFF_POINTOPOINT) { 320 ipc_entry.ppp_ifp = skb->dev; 321 } else{ 322 ipc_entry.ppp_ifp = NULL; 323 ipc_entry.pppoe_sid = 0xffff; 324 } 325 326 if (ipc_entry.ppp_ifp){ 327 struct net_device *pppox_tx_dev=NULL; 328 ctf_ppp_t ctfppp; 329 if (ppp_get_conn_pkt_info(ipc_entry.ppp_ifp,&ctfppp)){ 330 return; 331 } 332 else { 333 if(ctfppp.psk.pppox_protocol == PX_PROTO_OE){ 334 goto PX_PROTO_PPPOE; 335 } 336#ifdef CTF_PPTP 337 else if (ctfppp.psk.pppox_protocol == PX_PROTO_PPTP){ 338 struct pptp_opt *opt; 339 if(ctfppp.psk.po == NULL) 340 return; 341 opt=&ctfppp.psk.po->proto.pptp; 342 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT){ 343 ipc_entry.action |= CTF_ACTION_PPTP_ADD; 344 345 /* For PPTP, to get rt information*/ 346 if ((manip != NULL) && (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)){ 347 struct flowi fl = { .oif = 0, 348 .nl_u = { .ip4_u = 349 { .daddr = opt->dst_addr.sin_addr.s_addr, 350 .saddr = opt->src_addr.sin_addr.s_addr, 351 .tos = RT_TOS(0) } }, 352 .proto = IPPROTO_GRE }; 353 if (ip_route_output_key(&init_net,&rt, &fl) ) { 354 return; 355 } 356 if (rt==NULL) 357 return; 358 359 if (skb_dst(skb)->hh == NULL) { 360 memcpy(ipc_entry.dhost.octet, rt->dst.neighbour->ha, ETH_ALEN); 361 } 362 } 363 364 pppox_tx_dev = rt->dst.dev; 365 memcpy(ipc_entry.shost.octet, rt->dst.dev->dev_addr, ETH_ALEN); 366 ctf_pptp_cache(kcih, 367 dst_metric(&rt->dst, RTAX_LOCK)&(1<<RTAX_MTU), 368 dst_metric(&rt->dst, RTAX_HOPLIMIT)); 369 } 370 else{ 371 ipc_entry.action |= CTF_ACTION_PPTP_DEL; 372 } 373 374 ipc_entry.pppox_opt = &ctfppp.psk.po->proto.pptp; 375 } 376#endif 377#ifdef CTF_L2TP 378 else if (ctfppp.psk.pppox_protocol == PX_PROTO_OL2TP){ 379 struct l2tp_opt *opt; 380 if (ctfppp.psk.po == NULL) 381 return; 382 opt=&ctfppp.psk.po->proto.l2tp; 383 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT){ 384 ipc_entry.action |= CTF_ACTION_L2TP_ADD; 385 386 /* For PPTP, to get rt information*/ 387 if ((manip != NULL) && (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)){ 388 struct flowi fl = { .oif = 0, 389 .nl_u = { .ip4_u = 390 { .daddr = opt->inet.daddr, 391 .saddr = opt->inet.saddr, 392 .tos = RT_TOS(0) } }, 393 .proto = IPPROTO_UDP }; 394 if (ip_route_output_key(&init_net,&rt, &fl) ) { 395 return; 396 } 397 if (rt==NULL) 398 return; 399 400 if (skb_dst(skb)->hh == NULL) { 401 memcpy(ipc_entry.dhost.octet, rt->dst.neighbour->ha, ETH_ALEN); 402 } 403 } 404 405 opt->inet.ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); 406 pppox_tx_dev = rt->dst.dev; 407 memcpy(ipc_entry.shost.octet, rt->dst.dev->dev_addr, ETH_ALEN); 408 } 409 else{ 410 ipc_entry.action |= CTF_ACTION_L2TP_DEL; 411 } 412 413 ipc_entry.pppox_opt = &ctfppp.psk.po->proto.l2tp; 414 } 415#endif 416 else 417 return; 418 419 /* For vlan interfaces fill the vlan id and the tag/untag actions */ 420 if(pppox_tx_dev){ 421 if (pppox_tx_dev ->priv_flags & IFF_802_1Q_VLAN) { 422 ipc_entry.txif = (void 
*)vlan_dev_real_dev(pppox_tx_dev); 423 ipc_entry.vid = vlan_dev_vlan_id(pppox_tx_dev); 424 ipc_entry.action |= ((vlan_dev_vlan_flags(pppox_tx_dev) & 1) ? 425 CTF_ACTION_TAG : CTF_ACTION_UNTAG); 426 } else { 427 ipc_entry.txif = pppox_tx_dev; 428 ipc_entry.action |= CTF_ACTION_UNTAG; 429 } 430 } 431 } 432 } 433 } 434 435 if (ipc_entry.action & 436 (CTF_ACTION_L2TP_ADD | CTF_ACTION_L2TP_DEL | CTF_ACTION_PPTP_ADD | CTF_ACTION_PPTP_DEL)) { 437 goto PX_PROTO_PPTP_L2TP; 438 } 439PX_PROTO_PPPOE: 440#endif /* CTF_PPTP | CTF_L2TP */ 441 442#ifdef CTF_PPPOE 443 /* For pppoe interfaces fill the session id and header add/del actions */ 444 ipc_entry.pppoe_sid = -1; 445 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT) { 446 /* Transmit interface and sid will be populated by pppoe module */ 447 ipc_entry.action |= CTF_ACTION_PPPOE_ADD; 448 skb->ctf_pppoe_cb[0] = 2; 449 ipc_entry.ppp_ifp = skb_dst(skb)->dev; 450 } else if ((skb->dev->flags & IFF_POINTOPOINT) && (skb->ctf_pppoe_cb[0] == 1)) { 451 ipc_entry.action |= CTF_ACTION_PPPOE_DEL; 452 ipc_entry.pppoe_sid = *(uint16 *)&skb->ctf_pppoe_cb[2]; 453 ipc_entry.ppp_ifp = skb->dev; 454 } 455#endif 456 457#if defined(CTF_PPTP) || defined(CTF_L2TP) 458PX_PROTO_PPTP_L2TP: 459#endif 460 if (((ipc_entry.tuple.proto == IPPROTO_TCP) && (kcih->ipc_suspend & CTF_SUSPEND_TCP_MASK)) || 461 ((ipc_entry.tuple.proto == IPPROTO_UDP) && (kcih->ipc_suspend & CTF_SUSPEND_UDP_MASK))) { 462 /* The default action is suspend */ 463 ipc_entry.action |= CTF_ACTION_SUSPEND; 464 ipc_entry.susp_cnt = ((ipc_entry.tuple.proto == IPPROTO_TCP) ? 465 _popcounts(kcih->ipc_suspend & CTF_SUSPEND_TCP_MASK) : 466 _popcounts(kcih->ipc_suspend & CTF_SUSPEND_UDP_MASK)); 467 } 468 469 /* Copy the DSCP value. ECN bits must be cleared. */ 470 if (IPVERSION_IS_4(ipver)) 471 ipc_entry.tos = IPV4_TOS(iph); 472#ifdef CONFIG_IPV6 473 else 474 ipc_entry.tos = IPV6_TRAFFIC_CLASS(ip6h); 475#endif /* CONFIG_IPV6 */ 476 ipc_entry.tos &= IPV4_TOS_DSCP_MASK; 477 if (ipc_entry.tos) 478 ipc_entry.action |= CTF_ACTION_TOS; 479 480#ifdef CONFIG_NF_CONNTRACK_MARK 481 /* Initialize the mark for this connection */ 482 if (ct->mark != 0) { 483 ipc_entry.mark.value = ct->mark; 484 ipc_entry.action |= CTF_ACTION_MARK; 485 } 486#endif /* CONFIG_NF_CONNTRACK_MARK */ 487 488 /* Update the manip ip and port */ 489 if (manip != NULL) { 490 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) { 491 ipc_entry.nat.ip = manip->src.u3.ip; 492 ipc_entry.nat.port = manip->src.u.tcp.port; 493 ipc_entry.action |= CTF_ACTION_SNAT; 494 } else { 495 ipc_entry.nat.ip = manip->dst.u3.ip; 496 ipc_entry.nat.port = manip->dst.u.tcp.port; 497 ipc_entry.action |= CTF_ACTION_DNAT; 498 } 499 } 500 501 /* Do bridge cache lookup to determine outgoing interface 502 * and any vlan tagging actions if needed. 
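	 * Illustrative case: when the routed txif is a bridge (e.g. br0),
	 * the destination MAC resolved above is looked up in the bridge
	 * cache; the brc entry supplies the real member port, its vlan id
	 * and the tag/untag action, and txif/txbif are rewritten from it
	 * below.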
503 */ 504 if (ctf_isbridge(kcih, ipc_entry.txif)) { 505 ctf_brc_t *brcp; 506 507 brcp = ctf_brc_lkup(kcih, ipc_entry.dhost.octet); 508 509 if (brcp == NULL) 510 return; 511 else { 512 ipc_entry.txbif = ipc_entry.txif; 513 ipc_entry.action |= brcp->action; 514 ipc_entry.txif = brcp->txifp; 515 ipc_entry.vid = brcp->vid; 516 ctf_brc_release(kcih, brcp); 517 } 518 } 519 520#ifdef DEBUG 521 if (IPVERSION_IS_4(ipver)) 522 printk("%s: Adding ipc entry for [%d]%u.%u.%u.%u:%u - %u.%u.%u.%u:%u\n", __FUNCTION__, 523 ipc_entry.tuple.proto, 524 NIPQUAD(ipc_entry.tuple.sip[0]), ntohs(ipc_entry.tuple.sp), 525 NIPQUAD(ipc_entry.tuple.dip[0]), ntohs(ipc_entry.tuple.dp)); 526#ifdef CONFIG_IPV6 527 else 528 printk("\n%s: Adding ipc entry for [%d]\n" 529 "%08x.%08x.%08x.%08x:%u => %08x.%08x.%08x.%08x:%u\n", 530 __FUNCTION__, ipc_entry.tuple.proto, 531 ntohl(ipc_entry.tuple.sip[0]), ntohl(ipc_entry.tuple.sip[1]), 532 ntohl(ipc_entry.tuple.sip[2]), ntohl(ipc_entry.tuple.sip[3]), 533 ntohs(ipc_entry.tuple.sp), 534 ntohl(ipc_entry.tuple.dip[0]), ntohl(ipc_entry.tuple.dip[1]), 535 ntohl(ipc_entry.tuple.dip[2]), ntohl(ipc_entry.tuple.dip[3]), 536 ntohs(ipc_entry.tuple.dp)); 537#endif /* CONFIG_IPV6 */ 538 printk("sa %02x:%02x:%02x:%02x:%02x:%02x\n", 539 ipc_entry.shost.octet[0], ipc_entry.shost.octet[1], 540 ipc_entry.shost.octet[2], ipc_entry.shost.octet[3], 541 ipc_entry.shost.octet[4], ipc_entry.shost.octet[5]); 542 printk("da %02x:%02x:%02x:%02x:%02x:%02x\n", 543 ipc_entry.dhost.octet[0], ipc_entry.dhost.octet[1], 544 ipc_entry.dhost.octet[2], ipc_entry.dhost.octet[3], 545 ipc_entry.dhost.octet[4], ipc_entry.dhost.octet[5]); 546 printk("[%d] vid: %d action %x\n", hooknum, ipc_entry.vid, ipc_entry.action); 547 if (manip != NULL) 548 printk("manip_ip: %u.%u.%u.%u manip_port %u\n", 549 NIPQUAD(ipc_entry.nat.ip), ntohs(ipc_entry.nat.port)); 550 printk("txif: %s\n", ((struct net_device *)ipc_entry.txif)->name); 551#endif 552 553 ctf_ipc_add(kcih, &ipc_entry, !IPVERSION_IS_4(ipver)); 554 555#ifdef CTF_PPPOE 556 if (skb->ctf_pppoe_cb[0] == 2) { 557 ctf_ipc_t *ipct; 558 ipct = ctf_ipc_lkup(kcih, &ipc_entry, ipver == 6); 559 *(uint32 *)&skb->ctf_pppoe_cb[4] = (uint32)ipct; 560 if (ipct != NULL) 561 ctf_ipc_release(kcih, ipct); 562 } 563#endif 564 565 /* Update the attributes flag to indicate a CTF conn */ 566 ct->ctf_flags |= (CTF_FLAGS_CACHED | (1 << dir)); 567} 568 569int 570ip_conntrack_ipct_delete(struct nf_conn *ct, int ct_timeout) 571{ 572 ctf_ipc_t *ipct; 573 struct nf_conntrack_tuple *orig, *repl; 574 ctf_ipc_t orig_ipct, repl_ipct; 575 int ipaddr_sz; 576 bool v6; 577 578 if (!CTF_ENAB(kcih)) 579 return (0); 580 581 if (!(ct->ctf_flags & CTF_FLAGS_CACHED)) 582 return (0); 583 584 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 585 586 if ((orig->dst.protonum != IPPROTO_TCP) && (orig->dst.protonum != IPPROTO_UDP)) 587 return (0); 588 589 repl = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; 590 591#ifdef CONFIG_IPV6 592 v6 = (orig->src.l3num == AF_INET6); 593 ipaddr_sz = (v6) ? 
sizeof(struct in6_addr) : sizeof(struct in_addr); 594#else 595 v6 = FALSE; 596 ipaddr_sz = sizeof(struct in_addr); 597#endif /* CONFIG_IPV6 */ 598 599 memset(&orig_ipct, 0, sizeof(orig_ipct)); 600 memcpy(orig_ipct.tuple.sip, &orig->src.u3.ip, ipaddr_sz); 601 memcpy(orig_ipct.tuple.dip, &orig->dst.u3.ip, ipaddr_sz); 602 orig_ipct.tuple.proto = orig->dst.protonum; 603 orig_ipct.tuple.sp = orig->src.u.tcp.port; 604 orig_ipct.tuple.dp = orig->dst.u.tcp.port; 605 606 memset(&repl_ipct, 0, sizeof(repl_ipct)); 607 memcpy(repl_ipct.tuple.sip, &repl->src.u3.ip, ipaddr_sz); 608 memcpy(repl_ipct.tuple.dip, &repl->dst.u3.ip, ipaddr_sz); 609 repl_ipct.tuple.proto = repl->dst.protonum; 610 repl_ipct.tuple.sp = repl->src.u.tcp.port; 611 repl_ipct.tuple.dp = repl->dst.u.tcp.port; 612 613 /* If the refresh counter of ipc entry is non zero, it indicates 614 * that the packet transfer is active and we should not delete 615 * the conntrack entry. 616 */ 617 if (ct_timeout) { 618 ipct = ctf_ipc_lkup(kcih, &orig_ipct, v6); 619 620 /* Postpone the deletion of ct entry if there are frames 621 * flowing in this direction. 622 */ 623 if (ipct != NULL) { 624#ifdef BCMFA 625 ctf_live(kcih, ipct, v6); 626#endif 627 if (ipct->live > 0) { 628 ipct->live = 0; 629 ctf_ipc_release(kcih, ipct); 630 ct->timeout.expires = jiffies + ct->expire_jiffies; 631 add_timer(&ct->timeout); 632 return (-1); 633 } 634 ctf_ipc_release(kcih, ipct); 635 } 636 637 ipct = ctf_ipc_lkup(kcih, &repl_ipct, v6); 638 639 if (ipct != NULL) { 640#ifdef BCMFA 641 ctf_live(kcih, ipct, v6); 642#endif 643 if (ipct->live > 0) { 644 ipct->live = 0; 645 ctf_ipc_release(kcih, ipct); 646 ct->timeout.expires = jiffies + ct->expire_jiffies; 647 add_timer(&ct->timeout); 648 return (-1); 649 } 650 ctf_ipc_release(kcih, ipct); 651 } 652 } 653 654 /* If there are no packets over this connection for timeout period 655 * delete the entries. 
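	 * (Conversely, when ct_timeout is set and either direction above
	 * still shows a non-zero live count, the conntrack timer is simply
	 * re-armed with ct->expire_jiffies and -1 is returned, so
	 * death_by_timeout() backs off instead of tearing down an active
	 * accelerated flow.)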
656 */ 657 ctf_ipc_delete(kcih, &orig_ipct, v6); 658 659 ctf_ipc_delete(kcih, &repl_ipct, v6); 660 661#ifdef DEBUG 662 printk("%s: Deleting the tuple %x %x %d %d %d\n", 663 __FUNCTION__, orig->src.u3.ip, orig->dst.u3.ip, orig->dst.protonum, 664 orig->src.u.tcp.port, orig->dst.u.tcp.port); 665 printk("%s: Deleting the tuple %x %x %d %d %d\n", 666 __FUNCTION__, repl->dst.u3.ip, repl->src.u3.ip, repl->dst.protonum, 667 repl->dst.u.tcp.port, repl->src.u.tcp.port); 668#endif 669 670 return (0); 671} 672 673void 674ip_conntrack_ipct_default_fwd_set(uint8 protocol, ctf_fwd_t fwd, uint8 userid) 675{ 676 ctf_cfg_request_t req; 677 ctf_fwd_t *f; 678 uint8 *p; 679 uint8 *uid; 680 681 memset(&req, '\0', sizeof(req)); 682 req.command_id = CTFCFG_CMD_DEFAULT_FWD_SET; 683 req.size = sizeof(ctf_fwd_t) + sizeof(uint8) + sizeof(uint8); 684 f = (ctf_fwd_t *) req.arg; 685 *f = fwd; 686 p = (req.arg + sizeof(ctf_fwd_t)); 687 *p = protocol; 688 uid = (req.arg + sizeof(ctf_fwd_t) + sizeof(uint8)); 689 *uid = userid; 690 691 ctf_cfg_req_process(kcih, &req); 692} 693EXPORT_SYMBOL(ip_conntrack_ipct_default_fwd_set); 694 695 696uint32 697ip_conntrack_ipct_resume(struct sk_buff *skb, u_int32_t hooknum, 698 struct nf_conn *ct, enum ip_conntrack_info ci) 699{ 700 struct iphdr *iph; 701 struct tcphdr *tcph; 702 struct nf_conn_help *help; 703 uint8 ipver, protocol; 704#ifdef CONFIG_IPV6 705 struct ipv6hdr *ip6h = NULL; 706#endif /* CONFIG_IPV6 */ 707 uint32 *ct_mark_p; 708 709 ctf_cfg_request_t req; 710 ctf_tuple_t tuple, *tp = NULL; 711 712 if ((skb == NULL) || (ct == NULL)) 713 return 0; 714 715 /* Check CTF enabled */ 716 if (!ip_conntrack_is_ipc_allowed(skb, hooknum)) 717 return 0; 718 719 /* We only add cache entires for non-helper connections and at 720 * pre or post routing hooks. 
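	 * Connections with an ALG helper attached (e.g. FTP or SIP tracked
	 * by nf_conntrack_ftp/nf_conntrack_sip) must stay on the slow path
	 * so the helper can inspect every packet; that is why help->helper
	 * is checked below before the flow is resumed in the CTF cache.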
721 */ 722 help = nfct_help(ct); 723 if ((help && help->helper) || (ct->ctf_flags & CTF_FLAGS_EXCLUDED) || 724 ((hooknum != NF_INET_PRE_ROUTING) && (hooknum != NF_INET_POST_ROUTING))) 725 return 0; 726 727 iph = ip_hdr(skb); 728 ipver = iph->version; 729 730 /* Support both IPv4 and IPv6 */ 731 if (ipver == 4) { 732 tcph = ((struct tcphdr *)(((__u8 *)iph) + (iph->ihl << 2))); 733 protocol = iph->protocol; 734 } 735#ifdef CONFIG_IPV6 736 else if (ipver == 6) { 737 ip6h = (struct ipv6hdr *)iph; 738 tcph = (struct tcphdr *)ctf_ipc_lkup_l4proto(kcih, ip6h, &protocol); 739 if (tcph == NULL) 740 return 0; 741 } 742#endif /* CONFIG_IPV6 */ 743 else 744 return 0; 745 746 /* Only TCP and UDP are supported */ 747 if (protocol == IPPROTO_TCP) { 748 /* Add ipc entries for connections in established state only */ 749 if ((ci != IP_CT_ESTABLISHED) && (ci != (IP_CT_ESTABLISHED+IP_CT_IS_REPLY))) 750 return 0; 751 752 if (ct->proto.tcp.state >= TCP_CONNTRACK_FIN_WAIT && 753 ct->proto.tcp.state <= TCP_CONNTRACK_TIME_WAIT) 754 return 0; 755 } 756 else if (protocol != IPPROTO_UDP) 757 return 0; 758 759 memset(&tuple, '\0', sizeof(tuple)); 760 if (IPVERSION_IS_4(ipver)) { 761 memcpy(&tuple.src_addr, &iph->saddr, sizeof(uint32)); 762 memcpy(&tuple.dst_addr, &iph->daddr, sizeof(uint32)); 763 tuple.family = AF_INET; 764#ifdef CONFIG_IPV6 765 } else { 766 memcpy(&tuple.src_addr, &ip6h->saddr, IPV6_ADDR_LEN); 767 memcpy(&tuple.dst_addr, &ip6h->daddr, IPV6_ADDR_LEN); 768 tuple.family = AF_INET6; 769#endif /* CONFIG_IPV6 */ 770 } 771 tuple.src_port = tcph->source; 772 tuple.dst_port = tcph->dest; 773 tuple.protocol = protocol; 774 775#ifdef CONFIG_NF_CONNTRACK_MARK 776 if (ct->mark != 0) { 777 /* To Update Mark */ 778 memset(&req, '\0', sizeof(req)); 779 req.command_id = CTFCFG_CMD_UPD_MARK; 780 req.size = sizeof(ctf_tuple_t) + sizeof(uint32); 781 tp = (ctf_tuple_t *) req.arg; 782 *tp = tuple; 783 ct_mark_p = (uint32 *)(req.arg + sizeof(ctf_tuple_t)); 784 *ct_mark_p = ct->mark; 785 ctf_cfg_req_process(kcih, &req); 786 787 /* To Update ipct txif */ 788 memset(&req, '\0', sizeof(req)); 789 req.command_id = CTFCFG_CMD_CHANGE_TXIF_TO_BR; 790 req.size = sizeof(ctf_tuple_t); 791 tp = (ctf_tuple_t *) req.arg; 792 *tp = tuple; 793 ctf_cfg_req_process(kcih, &req); 794 } 795#endif /* CONFIG_NF_CONNTRACK_MARK */ 796 797 /* To Resume */ 798 memset(&req, '\0', sizeof(req)); 799 req.command_id = CTFCFG_CMD_RESUME; 800 req.size = sizeof(ctf_tuple_t); 801 tp = (ctf_tuple_t *) req.arg; 802 *tp = tuple; 803 ctf_cfg_req_process(kcih, &req); 804 return req.status; 805} 806EXPORT_SYMBOL(ip_conntrack_ipct_resume); 807#endif /* HNDCTF */ 808 809 810static int nf_conntrack_hash_rnd_initted; 811static unsigned int nf_conntrack_hash_rnd; 812 813static u_int32_t BCMFASTPATH_HOST __hash_conntrack(const struct nf_conntrack_tuple *tuple, 814 u16 zone, unsigned int size, unsigned int rnd) 815{ 816 unsigned int n; 817 u_int32_t h; 818 819 /* The direction must be ignored, so we hash everything up to the 820 * destination ports (which is a multiple of 4) and treat the last 821 * three bytes manually. 
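	 * Worked example of the final scaling: the 32-bit jhash2() result h
	 * is mapped to a bucket with ((u64)h * size) >> 32, so for
	 * size == 16384 and h == 0x9e3779b9 the bucket is
	 * (0x9e3779b9ULL * 16384) >> 32 == 10125 - roughly (h / 2^32) of
	 * the table - which avoids a modulo on the fast path.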
822 */ 823 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); 824 h = jhash2((u32 *)tuple, n, 825 zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) | 826 tuple->dst.protonum)); 827 828 return ((u64)h * size) >> 32; 829} 830 831static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, 832 const struct nf_conntrack_tuple *tuple) 833{ 834 return __hash_conntrack(tuple, zone, net->ct.htable_size, 835 nf_conntrack_hash_rnd); 836} 837 838bool 839nf_ct_get_tuple(const struct sk_buff *skb, 840 unsigned int nhoff, 841 unsigned int dataoff, 842 u_int16_t l3num, 843 u_int8_t protonum, 844 struct nf_conntrack_tuple *tuple, 845 const struct nf_conntrack_l3proto *l3proto, 846 const struct nf_conntrack_l4proto *l4proto) 847{ 848 memset(tuple, 0, sizeof(*tuple)); 849 850 tuple->src.l3num = l3num; 851 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) 852 return false; 853 854 tuple->dst.protonum = protonum; 855 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 856 857 return l4proto->pkt_to_tuple(skb, dataoff, tuple); 858} 859EXPORT_SYMBOL_GPL(nf_ct_get_tuple); 860 861bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, 862 u_int16_t l3num, struct nf_conntrack_tuple *tuple) 863{ 864 struct nf_conntrack_l3proto *l3proto; 865 struct nf_conntrack_l4proto *l4proto; 866 unsigned int protoff; 867 u_int8_t protonum; 868 int ret; 869 870 rcu_read_lock(); 871 872 l3proto = __nf_ct_l3proto_find(l3num); 873 ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); 874 if (ret != NF_ACCEPT) { 875 rcu_read_unlock(); 876 return false; 877 } 878 879 l4proto = __nf_ct_l4proto_find(l3num, protonum); 880 881 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple, 882 l3proto, l4proto); 883 884 rcu_read_unlock(); 885 return ret; 886} 887EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); 888 889bool 890nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, 891 const struct nf_conntrack_tuple *orig, 892 const struct nf_conntrack_l3proto *l3proto, 893 const struct nf_conntrack_l4proto *l4proto) 894{ 895 memset(inverse, 0, sizeof(*inverse)); 896 897 inverse->src.l3num = orig->src.l3num; 898 if (l3proto->invert_tuple(inverse, orig) == 0) 899 return false; 900 901 inverse->dst.dir = !orig->dst.dir; 902 903 inverse->dst.protonum = orig->dst.protonum; 904 return l4proto->invert_tuple(inverse, orig); 905} 906EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); 907 908static void 909clean_from_lists(struct nf_conn *ct) 910{ 911 pr_debug("clean_from_lists(%p)\n", ct); 912 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 913 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 914 915 /* Destroy all pending expectations */ 916 nf_ct_remove_expectations(ct); 917} 918 919static void 920destroy_conntrack(struct nf_conntrack *nfct) 921{ 922 struct nf_conn *ct = (struct nf_conn *)nfct; 923 struct net *net = nf_ct_net(ct); 924 struct nf_conntrack_l4proto *l4proto; 925 926 pr_debug("destroy_conntrack(%p)\n", ct); 927 NF_CT_ASSERT(atomic_read(&nfct->use) == 0); 928 NF_CT_ASSERT(!timer_pending(&ct->timeout)); 929 930#ifdef HNDCTF 931 ip_conntrack_ipct_delete(ct, 0); 932#endif /* HNDCTF*/ 933 /* To make sure we don't get any weird locking issues here: 934 * destroy_conntrack() MUST NOT be called with a write lock 935 * to nf_conntrack_lock!!! 
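	 * (Rationale: this function takes nf_conntrack_lock itself a few
	 * lines below, so dropping the final reference via nf_ct_put() /
	 * nf_conntrack_put() while already holding that lock would
	 * self-deadlock on the non-recursive spinlock.)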
-HW */ 936 rcu_read_lock(); 937 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 938 if (l4proto && l4proto->destroy) 939 l4proto->destroy(ct); 940 941 rcu_read_unlock(); 942 943 spin_lock_bh(&nf_conntrack_lock); 944 /* Expectations will have been removed in clean_from_lists, 945 * except TFTP can create an expectation on the first packet, 946 * before connection is in the list, so we need to clean here, 947 * too. */ 948 nf_ct_remove_expectations(ct); 949 950 /* We overload first tuple to link into unconfirmed list. */ 951 if (!nf_ct_is_confirmed(ct)) { 952 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); 953 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 954 } 955 956 NF_CT_STAT_INC(net, delete); 957 spin_unlock_bh(&nf_conntrack_lock); 958 959 if (ct->master) 960 nf_ct_put(ct->master); 961 962 pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); 963 nf_conntrack_free(ct); 964} 965 966void nf_ct_delete_from_lists(struct nf_conn *ct) 967{ 968 struct net *net = nf_ct_net(ct); 969 970 nf_ct_helper_destroy(ct); 971 spin_lock_bh(&nf_conntrack_lock); 972 /* Inside lock so preempt is disabled on module removal path. 973 * Otherwise we can get spurious warnings. */ 974 NF_CT_STAT_INC(net, delete_list); 975 clean_from_lists(ct); 976 spin_unlock_bh(&nf_conntrack_lock); 977} 978EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); 979 980static void death_by_event(unsigned long ul_conntrack) 981{ 982 struct nf_conn *ct = (void *)ul_conntrack; 983 struct net *net = nf_ct_net(ct); 984 985 if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { 986 /* bad luck, let's retry again */ 987 ct->timeout.expires = jiffies + 988 (random32() % net->ct.sysctl_events_retry_timeout); 989 add_timer(&ct->timeout); 990 return; 991 } 992 /* we've got the event delivered, now it's dying */ 993 set_bit(IPS_DYING_BIT, &ct->status); 994 spin_lock(&nf_conntrack_lock); 995 hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 996 spin_unlock(&nf_conntrack_lock); 997 nf_ct_put(ct); 998} 999 1000void nf_ct_insert_dying_list(struct nf_conn *ct) 1001{ 1002 struct net *net = nf_ct_net(ct); 1003 1004 /* add this conntrack to the dying list */ 1005 spin_lock_bh(&nf_conntrack_lock); 1006 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 1007 &net->ct.dying); 1008 spin_unlock_bh(&nf_conntrack_lock); 1009 /* set a new timer to retry event delivery */ 1010 setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); 1011 ct->timeout.expires = jiffies + 1012 (random32() % net->ct.sysctl_events_retry_timeout); 1013 add_timer(&ct->timeout); 1014} 1015EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); 1016 1017static void death_by_timeout(unsigned long ul_conntrack) 1018{ 1019 struct nf_conn *ct = (void *)ul_conntrack; 1020#ifdef HNDCTF 1021 /* If negative error is returned it means the entry hasn't 1022 * timed out yet. 1023 */ 1024 if (ip_conntrack_ipct_delete(ct, jiffies >= ct->timeout.expires ? 
1 : 0) != 0) 1025 return; 1026#endif /* HNDCTF */ 1027 1028 if (!test_bit(IPS_DYING_BIT, &ct->status) && 1029 unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { 1030 /* destroy event was not delivered */ 1031 nf_ct_delete_from_lists(ct); 1032 nf_ct_insert_dying_list(ct); 1033 return; 1034 } 1035 set_bit(IPS_DYING_BIT, &ct->status); 1036 nf_ct_delete_from_lists(ct); 1037 nf_ct_put(ct); 1038} 1039 1040/* 1041 * Warning : 1042 * - Caller must take a reference on returned object 1043 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 1044 * OR 1045 * - Caller must lock nf_conntrack_lock before calling this function 1046 */ 1047struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST 1048__nf_conntrack_find(struct net *net, u16 zone, 1049 const struct nf_conntrack_tuple *tuple) 1050{ 1051 struct nf_conntrack_tuple_hash *h; 1052 struct hlist_nulls_node *n; 1053 unsigned int hash = hash_conntrack(net, zone, tuple); 1054 1055 /* Disable BHs the entire time since we normally need to disable them 1056 * at least once for the stats anyway. 1057 */ 1058 local_bh_disable(); 1059begin: 1060 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { 1061 if (nf_ct_tuple_equal(tuple, &h->tuple) && 1062 nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { 1063 NF_CT_STAT_INC(net, found); 1064 local_bh_enable(); 1065 return h; 1066 } 1067 NF_CT_STAT_INC(net, searched); 1068 } 1069 /* 1070 * if the nulls value we got at the end of this lookup is 1071 * not the expected one, we must restart lookup. 1072 * We probably met an item that was moved to another chain. 1073 */ 1074 if (get_nulls_value(n) != hash) { 1075 NF_CT_STAT_INC(net, search_restart); 1076 goto begin; 1077 } 1078 local_bh_enable(); 1079 1080 return NULL; 1081} 1082EXPORT_SYMBOL_GPL(__nf_conntrack_find); 1083 1084/* Find a connection corresponding to a tuple. 
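 * A reference is taken on the returned entry; a typical caller pattern
 * (sketch only) is:
 *
 *	h = nf_conntrack_find_get(net, zone, &tuple);
 *	if (h != NULL) {
 *		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 *		...
 *		nf_ct_put(ct);
 *	}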
*/ 1085struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST 1086nf_conntrack_find_get(struct net *net, u16 zone, 1087 const struct nf_conntrack_tuple *tuple) 1088{ 1089 struct nf_conntrack_tuple_hash *h; 1090 struct nf_conn *ct; 1091 1092 rcu_read_lock(); 1093begin: 1094 h = __nf_conntrack_find(net, zone, tuple); 1095 if (h) { 1096 ct = nf_ct_tuplehash_to_ctrack(h); 1097 if (unlikely(nf_ct_is_dying(ct) || 1098 !atomic_inc_not_zero(&ct->ct_general.use))) 1099 h = NULL; 1100 else { 1101 if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || 1102 nf_ct_zone(ct) != zone)) { 1103 nf_ct_put(ct); 1104 goto begin; 1105 } 1106 } 1107 } 1108 rcu_read_unlock(); 1109 1110 return h; 1111} 1112EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 1113 1114static void __nf_conntrack_hash_insert(struct nf_conn *ct, 1115 unsigned int hash, 1116 unsigned int repl_hash) 1117{ 1118 struct net *net = nf_ct_net(ct); 1119 1120 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 1121 &net->ct.hash[hash]); 1122 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 1123 &net->ct.hash[repl_hash]); 1124} 1125 1126void nf_conntrack_hash_insert(struct nf_conn *ct) 1127{ 1128 struct net *net = nf_ct_net(ct); 1129 unsigned int hash, repl_hash; 1130 u16 zone; 1131 1132 zone = nf_ct_zone(ct); 1133 hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 1134 repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 1135 1136 __nf_conntrack_hash_insert(ct, hash, repl_hash); 1137} 1138EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); 1139 1140/* Confirm a connection given skb; places it in hash table */ 1141int 1142__nf_conntrack_confirm(struct sk_buff *skb) 1143{ 1144 unsigned int hash, repl_hash; 1145 struct nf_conntrack_tuple_hash *h; 1146 struct nf_conn *ct; 1147 struct nf_conn_help *help; 1148 struct hlist_nulls_node *n; 1149 enum ip_conntrack_info ctinfo; 1150 struct net *net; 1151 u16 zone; 1152 1153 ct = nf_ct_get(skb, &ctinfo); 1154 net = nf_ct_net(ct); 1155 1156 /* ipt_REJECT uses nf_conntrack_attach to attach related 1157 ICMP/TCP RST packets in other direction. Actual packet 1158 which created connection will be IP_CT_NEW or for an 1159 expected connection, IP_CT_RELATED. */ 1160 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) 1161 return NF_ACCEPT; 1162 1163 zone = nf_ct_zone(ct); 1164 hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 1165 repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 1166 1167 /* We're not in hash table, and we refuse to set up related 1168 connections for unconfirmed conns. But packet copies and 1169 REJECT will give spurious warnings here. */ 1170 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ 1171 1172 /* No external references means noone else could have 1173 confirmed us. */ 1174 NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); 1175 pr_debug("Confirming conntrack %p\n", ct); 1176 1177 spin_lock_bh(&nf_conntrack_lock); 1178 1179 /* We have to check the DYING flag inside the lock to prevent 1180 a race against nf_ct_get_next_corpse() possibly called from 1181 user context, else we insert an already 'dead' hash, blocking 1182 further use of that particular connection -JM */ 1183 1184 if (unlikely(nf_ct_is_dying(ct))) { 1185 spin_unlock_bh(&nf_conntrack_lock); 1186 return NF_ACCEPT; 1187 } 1188 1189 /* See if there's one in the list already, including reverse: 1190 NAT could have grabbed it without realizing, since we're 1191 not in the hash. If there is, we lost race. 
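	   For example, two packets of the same new flow handled on two CPUs
	   each allocate an unconfirmed conntrack; whichever confirms first
	   is hashed, and the loser finds its tuple (or the reverse tuple)
	   below, takes the out: path and its packet is dropped.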
*/ 1192 hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) 1193 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1194 &h->tuple) && 1195 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 1196 goto out; 1197 hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) 1198 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1199 &h->tuple) && 1200 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 1201 goto out; 1202 1203 /* Remove from unconfirmed list */ 1204 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 1205 1206 /* Timer relative to confirmation time, not original 1207 setting time, otherwise we'd get timer wrap in 1208 weird delay cases. */ 1209 ct->timeout.expires += jiffies; 1210 add_timer(&ct->timeout); 1211 atomic_inc(&ct->ct_general.use); 1212 set_bit(IPS_CONFIRMED_BIT, &ct->status); 1213 1214 /* Since the lookup is lockless, hash insertion must be done after 1215 * starting the timer and setting the CONFIRMED bit. The RCU barriers 1216 * guarantee that no other CPU can find the conntrack before the above 1217 * stores are visible. 1218 */ 1219 __nf_conntrack_hash_insert(ct, hash, repl_hash); 1220 NF_CT_STAT_INC(net, insert); 1221 spin_unlock_bh(&nf_conntrack_lock); 1222 1223 help = nfct_help(ct); 1224 if (help && help->helper) 1225 nf_conntrack_event_cache(IPCT_HELPER, ct); 1226 1227 nf_conntrack_event_cache(master_ct(ct) ? 1228 IPCT_RELATED : IPCT_NEW, ct); 1229 return NF_ACCEPT; 1230 1231out: 1232 NF_CT_STAT_INC(net, insert_failed); 1233 spin_unlock_bh(&nf_conntrack_lock); 1234 return NF_DROP; 1235} 1236EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); 1237 1238/* Returns true if a connection correspondings to the tuple (required 1239 for NAT). */ 1240int 1241nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, 1242 const struct nf_conn *ignored_conntrack) 1243{ 1244 struct net *net = nf_ct_net(ignored_conntrack); 1245 struct nf_conntrack_tuple_hash *h; 1246 struct hlist_nulls_node *n; 1247 struct nf_conn *ct; 1248 u16 zone = nf_ct_zone(ignored_conntrack); 1249 unsigned int hash = hash_conntrack(net, zone, tuple); 1250 1251 /* Disable BHs the entire time since we need to disable them at 1252 * least once for the stats anyway. 1253 */ 1254 rcu_read_lock_bh(); 1255 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { 1256 ct = nf_ct_tuplehash_to_ctrack(h); 1257 if (ct != ignored_conntrack && 1258 nf_ct_tuple_equal(tuple, &h->tuple) && 1259 nf_ct_zone(ct) == zone) { 1260 NF_CT_STAT_INC(net, found); 1261 rcu_read_unlock_bh(); 1262 return 1; 1263 } 1264 NF_CT_STAT_INC(net, searched); 1265 } 1266 rcu_read_unlock_bh(); 1267 1268 return 0; 1269} 1270EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); 1271 1272#define NF_CT_EVICTION_RANGE 8 1273 1274/* There's a small race here where we may free a just-assured 1275 connection. Too bad: we're in trouble anyway. 
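   (early_drop() below scans consecutive hash buckets until it has seen
   about NF_CT_EVICTION_RANGE entries, remembers the last one it found
   without IPS_ASSURED set and kills it via death_by_timeout(); that
   victim may legitimately become assured in the meantime, which is the
   race noted above.)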
*/ 1276static noinline int early_drop(struct net *net, unsigned int hash) 1277{ 1278 /* Use oldest entry, which is roughly LRU */ 1279 struct nf_conntrack_tuple_hash *h; 1280 struct nf_conn *ct = NULL, *tmp; 1281 struct hlist_nulls_node *n; 1282 unsigned int i, cnt = 0; 1283 int dropped = 0; 1284 1285 rcu_read_lock(); 1286 for (i = 0; i < net->ct.htable_size; i++) { 1287 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], 1288 hnnode) { 1289 tmp = nf_ct_tuplehash_to_ctrack(h); 1290 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) 1291 ct = tmp; 1292 cnt++; 1293 } 1294 1295 if (ct != NULL) { 1296 if (likely(!nf_ct_is_dying(ct) && 1297 atomic_inc_not_zero(&ct->ct_general.use))) 1298 break; 1299 else 1300 ct = NULL; 1301 } 1302 1303 if (cnt >= NF_CT_EVICTION_RANGE) 1304 break; 1305 1306 hash = (hash + 1) % net->ct.htable_size; 1307 } 1308 rcu_read_unlock(); 1309 1310 if (!ct) 1311 return dropped; 1312 1313#ifdef HNDCTF 1314 ip_conntrack_ipct_delete(ct, 0); 1315#endif /* HNDCTF */ 1316 1317 if (del_timer(&ct->timeout)) { 1318 death_by_timeout((unsigned long)ct); 1319 dropped = 1; 1320 NF_CT_STAT_INC_ATOMIC(net, early_drop); 1321 } 1322 nf_ct_put(ct); 1323 return dropped; 1324} 1325 1326struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, 1327 const struct nf_conntrack_tuple *orig, 1328 const struct nf_conntrack_tuple *repl, 1329 gfp_t gfp) 1330{ 1331 struct nf_conn *ct; 1332 1333 if (unlikely(!nf_conntrack_hash_rnd_initted)) { 1334 get_random_bytes(&nf_conntrack_hash_rnd, 1335 sizeof(nf_conntrack_hash_rnd)); 1336 nf_conntrack_hash_rnd_initted = 1; 1337 } 1338 1339 /* We don't want any race condition at early drop stage */ 1340 atomic_inc(&net->ct.count); 1341 1342 if (nf_conntrack_max && 1343 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { 1344 unsigned int hash = hash_conntrack(net, zone, orig); 1345 if (!early_drop(net, hash)) { 1346 atomic_dec(&net->ct.count); 1347 if (net_ratelimit()) 1348 printk(KERN_WARNING 1349 "nf_conntrack: table full, dropping" 1350 " packet.\n"); 1351 return ERR_PTR(-ENOMEM); 1352 } 1353 } 1354 1355 /* 1356 * Do not use kmem_cache_zalloc(), as this cache uses 1357 * SLAB_DESTROY_BY_RCU. 1358 */ 1359 ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); 1360 if (ct == NULL) { 1361 pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); 1362 atomic_dec(&net->ct.count); 1363 return ERR_PTR(-ENOMEM); 1364 } 1365 /* 1366 * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next 1367 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. 
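	 * (The conntrack cache is created with SLAB_DESTROY_BY_RCU, so an
	 * object may be recycled while a lockless reader is still walking a
	 * hash chain through it; keeping the .next pointers intact lets that
	 * reader move off the entry safely, and the nulls/tuple re-checks in
	 * the lookup paths above catch the reuse.)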
1368 */ 1369 memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, 1370 sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); 1371 spin_lock_init(&ct->lock); 1372 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 1373 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 1374 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 1375 ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL; 1376 /* Don't set timer yet: wait for confirmation */ 1377 setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); 1378 write_pnet(&ct->ct_net, net); 1379#ifdef CONFIG_NF_CONNTRACK_ZONES 1380 if (zone) { 1381 struct nf_conntrack_zone *nf_ct_zone; 1382 1383 nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); 1384 if (!nf_ct_zone) 1385 goto out_free; 1386 nf_ct_zone->id = zone; 1387 } 1388#endif 1389 /* 1390 * changes to lookup keys must be done before setting refcnt to 1 1391 */ 1392 smp_wmb(); 1393 atomic_set(&ct->ct_general.use, 1); 1394 return ct; 1395 1396#ifdef CONFIG_NF_CONNTRACK_ZONES 1397out_free: 1398 kmem_cache_free(net->ct.nf_conntrack_cachep, ct); 1399 return ERR_PTR(-ENOMEM); 1400#endif 1401} 1402EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 1403 1404void nf_conntrack_free(struct nf_conn *ct) 1405{ 1406 struct net *net = nf_ct_net(ct); 1407 1408 nf_ct_ext_destroy(ct); 1409 atomic_dec(&net->ct.count); 1410 nf_ct_ext_free(ct); 1411 kmem_cache_free(net->ct.nf_conntrack_cachep, ct); 1412} 1413EXPORT_SYMBOL_GPL(nf_conntrack_free); 1414 1415/* Allocate a new conntrack: we return -ENOMEM if classification 1416 failed due to stress. Otherwise it really is unclassifiable. */ 1417static struct nf_conntrack_tuple_hash * 1418init_conntrack(struct net *net, struct nf_conn *tmpl, 1419 const struct nf_conntrack_tuple *tuple, 1420 struct nf_conntrack_l3proto *l3proto, 1421 struct nf_conntrack_l4proto *l4proto, 1422 struct sk_buff *skb, 1423 unsigned int dataoff) 1424{ 1425 struct nf_conn *ct; 1426 struct nf_conn_help *help; 1427 struct nf_conntrack_tuple repl_tuple; 1428 struct nf_conntrack_ecache *ecache; 1429 struct nf_conntrack_expect *exp; 1430 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; 1431 1432 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { 1433 pr_debug("Can't invert tuple.\n"); 1434 return NULL; 1435 } 1436 1437 ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC); 1438 if (IS_ERR(ct)) { 1439 pr_debug("Can't allocate conntrack.\n"); 1440 return (struct nf_conntrack_tuple_hash *)ct; 1441 } 1442 1443 if (!l4proto->new(ct, skb, dataoff)) { 1444 nf_conntrack_free(ct); 1445 pr_debug("init conntrack: can't track with proto module\n"); 1446 return NULL; 1447 } 1448 1449 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1450 1451 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; 1452 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, 1453 ecache ? ecache->expmask : 0, 1454 GFP_ATOMIC); 1455 1456 spin_lock_bh(&nf_conntrack_lock); 1457 exp = nf_ct_find_expectation(net, zone, tuple); 1458 if (exp) { 1459 pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", 1460 ct, exp); 1461 /* Welcome, Mr. Bond. We've been expecting you... 
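		 * e.g. for an FTP data connection the ftp helper registered
		 * an expectation on the control session: the first data
		 * packet lands here, the new conntrack is flagged as
		 * expected, inherits the master's mark/secmark and, if the
		 * expectation names a helper, has that helper attached below.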
*/ 1462 __set_bit(IPS_EXPECTED_BIT, &ct->status); 1463 ct->master = exp->master; 1464 if (exp->helper) { 1465 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); 1466 if (help) 1467 rcu_assign_pointer(help->helper, exp->helper); 1468 } 1469 1470#ifdef CONFIG_NF_CONNTRACK_MARK 1471 ct->mark = exp->master->mark; 1472#endif 1473#ifdef CONFIG_NF_CONNTRACK_SECMARK 1474 ct->secmark = exp->master->secmark; 1475#endif 1476 nf_conntrack_get(&ct->master->ct_general); 1477 NF_CT_STAT_INC(net, expect_new); 1478 } else { 1479 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1480 NF_CT_STAT_INC(net, new); 1481 } 1482 1483 /* Overload tuple linked list to put us in unconfirmed list. */ 1484 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 1485 &net->ct.unconfirmed); 1486 1487 spin_unlock_bh(&nf_conntrack_lock); 1488 1489 if (exp) { 1490 if (exp->expectfn) 1491 exp->expectfn(ct, exp); 1492 nf_ct_expect_put(exp); 1493 } 1494 1495 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1496} 1497 1498/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ 1499static inline struct nf_conn * 1500resolve_normal_ct(struct net *net, struct nf_conn *tmpl, 1501 struct sk_buff *skb, 1502 unsigned int dataoff, 1503 u_int16_t l3num, 1504 u_int8_t protonum, 1505 struct nf_conntrack_l3proto *l3proto, 1506 struct nf_conntrack_l4proto *l4proto, 1507 int *set_reply, 1508 enum ip_conntrack_info *ctinfo) 1509{ 1510 struct nf_conntrack_tuple tuple; 1511 struct nf_conntrack_tuple_hash *h; 1512 struct nf_conn *ct; 1513 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; 1514 1515 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1516 dataoff, l3num, protonum, &tuple, l3proto, 1517 l4proto)) { 1518 pr_debug("resolve_normal_ct: Can't get tuple\n"); 1519 return NULL; 1520 } 1521 1522 /* look for tuple match */ 1523 h = nf_conntrack_find_get(net, zone, &tuple); 1524 if (!h) { 1525 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, 1526 skb, dataoff); 1527 if (!h) 1528 return NULL; 1529 if (IS_ERR(h)) 1530 return (void *)h; 1531 } 1532 ct = nf_ct_tuplehash_to_ctrack(h); 1533 1534 /* It exists; we have (non-exclusive) reference. */ 1535 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { 1536 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; 1537 /* Please set reply bit if this packet OK */ 1538 *set_reply = 1; 1539 } else { 1540 /* Once we've had two way comms, always ESTABLISHED. */ 1541 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 1542 pr_debug("nf_conntrack_in: normal packet for %p\n", ct); 1543 *ctinfo = IP_CT_ESTABLISHED; 1544 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { 1545 pr_debug("nf_conntrack_in: related packet for %p\n", 1546 ct); 1547 *ctinfo = IP_CT_RELATED; 1548 } else { 1549 pr_debug("nf_conntrack_in: new packet for %p\n", ct); 1550 *ctinfo = IP_CT_NEW; 1551 } 1552 *set_reply = 0; 1553 } 1554 skb->nfct = &ct->ct_general; 1555 skb->nfctinfo = *ctinfo; 1556 return ct; 1557} 1558 1559unsigned int BCMFASTPATH_HOST 1560nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, 1561 struct sk_buff *skb) 1562{ 1563 struct nf_conn *ct, *tmpl = NULL; 1564 enum ip_conntrack_info ctinfo; 1565 struct nf_conntrack_l3proto *l3proto; 1566 struct nf_conntrack_l4proto *l4proto; 1567 unsigned int dataoff; 1568 u_int8_t protonum; 1569 int set_reply = 0; 1570 int ret; 1571 1572 if (skb->nfct) { 1573 /* Previously seen (loopback or untracked)? Ignore. 
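		 * An skb can arrive with nfct already set either because it
		 * is an untracked packet (per-cpu nf_conntrack_untracked,
		 * e.g. via NOTRACK) or because a conntrack template was
		 * attached earlier; only templates are unpacked and reused
		 * below, anything else is counted as ignored and accepted.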
*/ 1574 tmpl = (struct nf_conn *)skb->nfct; 1575 if (!nf_ct_is_template(tmpl)) { 1576 NF_CT_STAT_INC_ATOMIC(net, ignore); 1577 return NF_ACCEPT; 1578 } 1579 skb->nfct = NULL; 1580 } 1581 1582 /* rcu_read_lock()ed by nf_hook_slow */ 1583 l3proto = __nf_ct_l3proto_find(pf); 1584 ret = l3proto->get_l4proto(skb, skb_network_offset(skb), 1585 &dataoff, &protonum); 1586 if (ret <= 0) { 1587 pr_debug("not prepared to track yet or error occured\n"); 1588 NF_CT_STAT_INC_ATOMIC(net, error); 1589 NF_CT_STAT_INC_ATOMIC(net, invalid); 1590 ret = -ret; 1591 goto out; 1592 } 1593 1594 l4proto = __nf_ct_l4proto_find(pf, protonum); 1595 1596 /* It may be an special packet, error, unclean... 1597 * inverse of the return code tells to the netfilter 1598 * core what to do with the packet. */ 1599 if (l4proto->error != NULL) { 1600 ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo, 1601 pf, hooknum); 1602 if (ret <= 0) { 1603 NF_CT_STAT_INC_ATOMIC(net, error); 1604 NF_CT_STAT_INC_ATOMIC(net, invalid); 1605 ret = -ret; 1606 goto out; 1607 } 1608 } 1609 1610 ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, 1611 l3proto, l4proto, &set_reply, &ctinfo); 1612 if (!ct) { 1613 /* Not valid part of a connection */ 1614 NF_CT_STAT_INC_ATOMIC(net, invalid); 1615 ret = NF_ACCEPT; 1616 goto out; 1617 } 1618 1619 if (IS_ERR(ct)) { 1620 /* Too stressed to deal. */ 1621 NF_CT_STAT_INC_ATOMIC(net, drop); 1622 ret = NF_DROP; 1623 goto out; 1624 } 1625 1626 NF_CT_ASSERT(skb->nfct); 1627 1628 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum); 1629 if (ret <= 0) { 1630 /* Invalid: inverse of the return code tells 1631 * the netfilter core what to do */ 1632 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 1633 nf_conntrack_put(skb->nfct); 1634 skb->nfct = NULL; 1635 NF_CT_STAT_INC_ATOMIC(net, invalid); 1636 if (ret == -NF_DROP) 1637 NF_CT_STAT_INC_ATOMIC(net, drop); 1638 ret = -ret; 1639 goto out; 1640 } 1641 1642 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 1643 nf_conntrack_event_cache(IPCT_REPLY, ct); 1644out: 1645 if (tmpl) 1646 nf_ct_put(tmpl); 1647 1648 return ret; 1649} 1650EXPORT_SYMBOL_GPL(nf_conntrack_in); 1651 1652bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, 1653 const struct nf_conntrack_tuple *orig) 1654{ 1655 bool ret; 1656 1657 rcu_read_lock(); 1658 ret = nf_ct_invert_tuple(inverse, orig, 1659 __nf_ct_l3proto_find(orig->src.l3num), 1660 __nf_ct_l4proto_find(orig->src.l3num, 1661 orig->dst.protonum)); 1662 rcu_read_unlock(); 1663 return ret; 1664} 1665EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr); 1666 1667/* Alter reply tuple (maybe alter helper). 
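   For example, an SNAT mapping of 192.168.1.10:5000 to 203.0.113.1:40000
   rewrites the reply tuple's destination to the mapped address and port,
   so that answer packets match this conntrack.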
This is for NAT, and is 1668 implicitly racy: see __nf_conntrack_confirm */ 1669void nf_conntrack_alter_reply(struct nf_conn *ct, 1670 const struct nf_conntrack_tuple *newreply) 1671{ 1672 struct nf_conn_help *help = nfct_help(ct); 1673 1674 /* Should be unconfirmed, so not in hash table yet */ 1675 NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); 1676 1677 pr_debug("Altering reply tuple of %p to ", ct); 1678 nf_ct_dump_tuple(newreply); 1679 1680 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 1681 if (ct->master || (help && !hlist_empty(&help->expectations))) 1682 return; 1683 1684 rcu_read_lock(); 1685 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); 1686 rcu_read_unlock(); 1687} 1688EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 1689 1690/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 1691void __nf_ct_refresh_acct(struct nf_conn *ct, 1692 enum ip_conntrack_info ctinfo, 1693 const struct sk_buff *skb, 1694 unsigned long extra_jiffies, 1695 int do_acct) 1696{ 1697 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); 1698 NF_CT_ASSERT(skb); 1699 1700 /* Only update if this is not a fixed timeout */ 1701 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 1702 goto acct; 1703 1704 /* If not in hash table, timer will not be active yet */ 1705 if (!nf_ct_is_confirmed(ct)) { 1706#ifdef HNDCTF 1707 ct->expire_jiffies = extra_jiffies; 1708#endif /* HNDCTF */ 1709 ct->timeout.expires = extra_jiffies; 1710 } else { 1711 unsigned long newtime = jiffies + extra_jiffies; 1712 1713 /* Only update the timeout if the new timeout is at least 1714 HZ jiffies from the old timeout. Need del_timer for race 1715 avoidance (may already be dying). */ 1716 if (newtime - ct->timeout.expires >= HZ) 1717 { 1718#ifdef HNDCTF 1719 ct->expire_jiffies = extra_jiffies; 1720#endif /* HNDCTF */ 1721 mod_timer_pending(&ct->timeout, newtime); 1722 } 1723 } 1724 1725acct: 1726 if (do_acct) { 1727 struct nf_conn_counter *acct; 1728 1729 acct = nf_conn_acct_find(ct); 1730 if (acct) { 1731 spin_lock_bh(&ct->lock); 1732 acct[CTINFO2DIR(ctinfo)].packets++; 1733 acct[CTINFO2DIR(ctinfo)].bytes += skb->len; 1734 spin_unlock_bh(&ct->lock); 1735 } 1736 } 1737} 1738EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 1739 1740bool __nf_ct_kill_acct(struct nf_conn *ct, 1741 enum ip_conntrack_info ctinfo, 1742 const struct sk_buff *skb, 1743 int do_acct) 1744{ 1745 if (do_acct) { 1746 struct nf_conn_counter *acct; 1747 1748 acct = nf_conn_acct_find(ct); 1749 if (acct) { 1750 spin_lock_bh(&ct->lock); 1751 acct[CTINFO2DIR(ctinfo)].packets++; 1752 acct[CTINFO2DIR(ctinfo)].bytes += 1753 skb->len - skb_network_offset(skb); 1754 spin_unlock_bh(&ct->lock); 1755 } 1756 } 1757 1758 if (del_timer(&ct->timeout)) { 1759 ct->timeout.function((unsigned long)ct); 1760 return true; 1761 } 1762 return false; 1763} 1764EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); 1765 1766#ifdef CONFIG_NF_CONNTRACK_ZONES 1767static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { 1768 .len = sizeof(struct nf_conntrack_zone), 1769 .align = __alignof__(struct nf_conntrack_zone), 1770 .id = NF_CT_EXT_ZONE, 1771}; 1772#endif 1773 1774#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 1775 1776#include <linux/netfilter/nfnetlink.h> 1777#include <linux/netfilter/nfnetlink_conntrack.h> 1778#include <linux/mutex.h> 1779 1780/* Generic function for tcp/udp/sctp/dccp and alike. 
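 * (The port-based l4 protocols typically just point their
 * .tuple_to_nlattr / .nlattr_to_tuple / .nla_policy hooks at the
 * helpers below rather than duplicating them.)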
This needs to be 1781 * in ip_conntrack_core, since we don't want the protocols to autoload 1782 * or depend on ctnetlink */ 1783int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 1784 const struct nf_conntrack_tuple *tuple) 1785{ 1786 NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port); 1787 NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port); 1788 return 0; 1789 1790nla_put_failure: 1791 return -1; 1792} 1793EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 1794 1795const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 1796 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 1797 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 1798}; 1799EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 1800 1801int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 1802 struct nf_conntrack_tuple *t) 1803{ 1804 if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) 1805 return -EINVAL; 1806 1807 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 1808 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 1809 1810 return 0; 1811} 1812EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 1813 1814int nf_ct_port_nlattr_tuple_size(void) 1815{ 1816 return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 1817} 1818EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 1819#endif 1820 1821/* Used by ipt_REJECT and ip6t_REJECT. */ 1822static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) 1823{ 1824 struct nf_conn *ct; 1825 enum ip_conntrack_info ctinfo; 1826 1827 /* This ICMP is in reverse direction to the packet which caused it */ 1828 ct = nf_ct_get(skb, &ctinfo); 1829 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 1830 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; 1831 else 1832 ctinfo = IP_CT_RELATED; 1833 1834 /* Attach to new skbuff, and increment count */ 1835 nskb->nfct = &ct->ct_general; 1836 nskb->nfctinfo = ctinfo; 1837 nf_conntrack_get(nskb->nfct); 1838} 1839 1840/* Bring out ya dead! */ 1841static struct nf_conn * 1842get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), 1843 void *data, unsigned int *bucket) 1844{ 1845 struct nf_conntrack_tuple_hash *h; 1846 struct nf_conn *ct; 1847 struct hlist_nulls_node *n; 1848 1849 spin_lock_bh(&nf_conntrack_lock); 1850 for (; *bucket < net->ct.htable_size; (*bucket)++) { 1851 hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { 1852 ct = nf_ct_tuplehash_to_ctrack(h); 1853 if (iter(ct, data)) 1854 goto found; 1855 } 1856 } 1857 hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { 1858 ct = nf_ct_tuplehash_to_ctrack(h); 1859 if (iter(ct, data)) 1860 set_bit(IPS_DYING_BIT, &ct->status); 1861 } 1862 spin_unlock_bh(&nf_conntrack_lock); 1863 return NULL; 1864found: 1865 atomic_inc(&ct->ct_general.use); 1866 spin_unlock_bh(&nf_conntrack_lock); 1867 return ct; 1868} 1869 1870void nf_ct_iterate_cleanup(struct net *net, 1871 int (*iter)(struct nf_conn *i, void *data), 1872 void *data) 1873{ 1874 struct nf_conn *ct; 1875 unsigned int bucket = 0; 1876 1877 while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { 1878#ifdef HNDCTF 1879 ip_conntrack_ipct_delete(ct, 0); 1880#endif /* HNDCTF */ 1881 /* Time to push up daises... */ 1882 if (del_timer(&ct->timeout)) 1883 death_by_timeout((unsigned long)ct); 1884 /* ... else the timer will get him soon. 
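		 * Either way the entry is on its way out: if del_timer()
		 * lost the race, the timer is already firing
		 * death_by_timeout() elsewhere, and the nf_ct_put() below
		 * only drops the extra reference taken in get_next_corpse().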
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = nf_ct_get(skb, &ctinfo);
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
                void *data, unsigned int *bucket)
{
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;

        spin_lock_bh(&nf_conntrack_lock);
        for (; *bucket < net->ct.htable_size; (*bucket)++) {
                hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        if (iter(ct, data))
                                goto found;
                }
        }
        hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (iter(ct, data))
                        set_bit(IPS_DYING_BIT, &ct->status);
        }
        spin_unlock_bh(&nf_conntrack_lock);
        return NULL;
found:
        atomic_inc(&ct->ct_general.use);
        spin_unlock_bh(&nf_conntrack_lock);
        return ct;
}

void nf_ct_iterate_cleanup(struct net *net,
                           int (*iter)(struct nf_conn *i, void *data),
                           void *data)
{
        struct nf_conn *ct;
        unsigned int bucket = 0;

        while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
#ifdef HNDCTF
                ip_conntrack_ipct_delete(ct, 0);
#endif /* HNDCTF */
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                nf_ct_put(ct);
        }
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);

struct __nf_ct_flush_report {
        u32 pid;
        int report;
};

static int kill_report(struct nf_conn *i, void *data)
{
        struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;

        /* If we fail to deliver the event, death_by_timeout() will retry */
        if (nf_conntrack_event_report(IPCT_DESTROY, i,
                                      fr->pid, fr->report) < 0)
                return 1;

        /* Avoid the delivery of the destroy event in death_by_timeout(). */
        set_bit(IPS_DYING_BIT, &i->status);
        return 1;
}

static int kill_all(struct nf_conn *i, void *data)
{
        return 1;
}

void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
{
        if (vmalloced)
                vfree(hash);
        else
                free_pages((unsigned long)hash,
                           get_order(sizeof(struct hlist_head) * size));
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);

void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
{
        struct __nf_ct_flush_report fr = {
                .pid    = pid,
                .report = report,
        };
        nf_ct_iterate_cleanup(net, kill_report, &fr);
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);

static void nf_ct_release_dying_list(struct net *net)
{
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;

        spin_lock_bh(&nf_conntrack_lock);
        hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                /* never fails to remove them, no listeners at this point */
                nf_ct_kill(ct);
        }
        spin_unlock_bh(&nf_conntrack_lock);
}

static int untrack_refs(void)
{
        int cnt = 0, cpu;

        for_each_possible_cpu(cpu) {
                struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);

                cnt += atomic_read(&ct->ct_general.use) - 1;
        }
        return cnt;
}

static void nf_conntrack_cleanup_init_net(void)
{
        while (untrack_refs() > 0)
                schedule();

        nf_conntrack_helper_fini();
        nf_conntrack_proto_fini();
#ifdef CONFIG_NF_CONNTRACK_ZONES
        nf_ct_extend_unregister(&nf_ct_zone_extend);
#endif
}

static void nf_conntrack_cleanup_net(struct net *net)
{
 i_see_dead_people:
        nf_ct_iterate_cleanup(net, kill_all, NULL);
        nf_ct_release_dying_list(net);
        if (atomic_read(&net->ct.count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
                             net->ct.htable_size);
        nf_conntrack_ecache_fini(net);
        nf_conntrack_acct_fini(net);
        nf_conntrack_expect_fini(net);
        kmem_cache_destroy(net->ct.nf_conntrack_cachep);
        kfree(net->ct.slabname);
        free_percpu(net->ct.stat);
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(struct net *net)
{
        if (net_eq(net, &init_net))
                rcu_assign_pointer(ip_ct_attach, NULL);

        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

        nf_conntrack_cleanup_net(net);

        if (net_eq(net, &init_net)) {
                rcu_assign_pointer(nf_ct_destroy, NULL);
                nf_conntrack_cleanup_init_net();
        }
}

void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
{
        struct hlist_nulls_head *hash;
        unsigned int nr_slots, i;
        size_t sz;

        *vmalloced = 0;

        BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
        nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
        sz = nr_slots * sizeof(struct hlist_nulls_head);
        hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
                                        get_order(sz));
        if (!hash) {
                *vmalloced = 1;
                printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
                hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                                 PAGE_KERNEL);
        }

        if (hash && nulls)
                for (i = 0; i < nr_slots; i++)
                        INIT_HLIST_NULLS_HEAD(&hash[i], i);

        return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
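/*
 * Worked example for the rounding in nf_ct_alloc_hashtable() above
 * (illustrative, not part of the original file), assuming 4 KB pages and an
 * 8-byte struct hlist_nulls_head: PAGE_SIZE / sizeof(head) is 512, so a
 * request of *sizep = 2000 is rounded up to 2048 slots, sz = 2048 * 8 =
 * 16384 bytes, and get_order(16384) = 2, i.e. four physically contiguous
 * pages; only if that page allocation fails do we fall back to vmalloc().
 */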
int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
        int i, bucket, vmalloced, old_vmalloced;
        unsigned int hashsize, old_size;
        struct hlist_nulls_head *hash, *old_hash;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;

        if (current->nsproxy->net_ns != &init_net)
                return -EOPNOTSUPP;

        /* On boot, we can set this without any fancy locking. */
        if (!nf_conntrack_htable_size)
                return param_set_uint(val, kp);

        hashsize = simple_strtoul(val, NULL, 0);
        if (!hashsize)
                return -EINVAL;

        hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
        if (!hash)
                return -ENOMEM;

        /* Lookups in the old hash might happen in parallel, which means we
         * might get false negatives during connection lookup. New connections
         * created because of a false negative won't make it into the hash
         * though since that required taking the lock.
         */
        spin_lock_bh(&nf_conntrack_lock);
        for (i = 0; i < init_net.ct.htable_size; i++) {
                while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
                        h = hlist_nulls_entry(init_net.ct.hash[i].first,
                                              struct nf_conntrack_tuple_hash, hnnode);
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        hlist_nulls_del_rcu(&h->hnnode);
                        bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
                                                  hashsize,
                                                  nf_conntrack_hash_rnd);
                        hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
                }
        }
        old_size = init_net.ct.htable_size;
        old_vmalloced = init_net.ct.hash_vmalloc;
        old_hash = init_net.ct.hash;

        init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
        init_net.ct.hash_vmalloc = vmalloced;
        init_net.ct.hash = hash;
        spin_unlock_bh(&nf_conntrack_lock);

        nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
        return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);

module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
                  &nf_conntrack_htable_size, 0600);
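/*
 * Usage note (illustrative, not part of the original file): with the 0600
 * permission above, the hash size can be changed at runtime by root, e.g.
 *
 *      echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * or supplied as the module's hashsize= parameter at load time. The resize
 * is only honoured from the initial network namespace (see the -EOPNOTSUPP
 * check in nf_conntrack_set_hashsize() above).
 */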
void nf_ct_untracked_status_or(unsigned long bits)
{
        int cpu;

        for_each_possible_cpu(cpu)
                per_cpu(nf_conntrack_untracked, cpu).status |= bits;
}
EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
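/*
 * Worked example for the sizing heuristic below (illustrative, not part of
 * the original file), assuming a 32-bit build with a 4-byte struct
 * hlist_head: a 32 MB machine gives 33554432 / 16384 / 4 = 512 buckets,
 * while 1 GB gives 1073741824 / 16384 / 4 = 16384 buckets, which is also
 * the cap applied to anything larger. With the default max_factor of 4 on
 * an auto-sized table, that means nf_conntrack_max = 2048 on the 32 MB box.
 */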
static int nf_conntrack_init_init_net(void)
{
        int max_factor = 8;
        int ret, cpu;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
        if (!nf_conntrack_htable_size) {
                nf_conntrack_htable_size
                        = (((totalram_pages << PAGE_SHIFT) / 16384)
                           / sizeof(struct hlist_head));
                if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        nf_conntrack_htable_size = 16384;
                if (nf_conntrack_htable_size < 32)
                        nf_conntrack_htable_size = 32;

                /* Use a max. factor of four by default to get the same max as
                 * with the old struct list_heads. When a table size is given
                 * we use the old value of 8 to avoid reducing the max.
                 * entries. */
                max_factor = 4;
        }
        nf_conntrack_max = max_factor * nf_conntrack_htable_size;

        printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
               NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
               nf_conntrack_max);

        ret = nf_conntrack_proto_init();
        if (ret < 0)
                goto err_proto;

        ret = nf_conntrack_helper_init();
        if (ret < 0)
                goto err_helper;

#ifdef CONFIG_NF_CONNTRACK_ZONES
        ret = nf_ct_extend_register(&nf_ct_zone_extend);
        if (ret < 0)
                goto err_extend;
#endif
        /* Set up fake conntrack: to never be deleted, not in any hashes */
        for_each_possible_cpu(cpu) {
                struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
                write_pnet(&ct->ct_net, &init_net);
                atomic_set(&ct->ct_general.use, 1);
        }
        /* - and make it look like a confirmed connection */
        nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
        return 0;

#ifdef CONFIG_NF_CONNTRACK_ZONES
err_extend:
        nf_conntrack_helper_fini();
#endif
err_helper:
        nf_conntrack_proto_fini();
err_proto:
        return ret;
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL   ((1<<30)+0)
#define DYING_NULLS_VAL         ((1<<30)+1)

static int nf_conntrack_init_net(struct net *net)
{
        int ret;

        atomic_set(&net->ct.count, 0);
        INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
        INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
        net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
        if (!net->ct.stat) {
                ret = -ENOMEM;
                goto err_stat;
        }

        net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
        if (!net->ct.slabname) {
                ret = -ENOMEM;
                goto err_slabname;
        }

        net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
                                                        sizeof(struct nf_conn), 0,
                                                        SLAB_DESTROY_BY_RCU, NULL);
        if (!net->ct.nf_conntrack_cachep) {
                printk(KERN_ERR "Unable to create nf_conn slab cache\n");
                ret = -ENOMEM;
                goto err_cache;
        }

        net->ct.htable_size = nf_conntrack_htable_size;
        net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
                                             &net->ct.hash_vmalloc, 1);
        if (!net->ct.hash) {
                ret = -ENOMEM;
                printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
                goto err_hash;
        }
        ret = nf_conntrack_expect_init(net);
        if (ret < 0)
                goto err_expect;
        ret = nf_conntrack_acct_init(net);
        if (ret < 0)
                goto err_acct;
        ret = nf_conntrack_ecache_init(net);
        if (ret < 0)
                goto err_ecache;

        return 0;

err_ecache:
        nf_conntrack_acct_fini(net);
err_acct:
        nf_conntrack_expect_fini(net);
err_expect:
        nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
                             net->ct.htable_size);
err_hash:
        kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
        kfree(net->ct.slabname);
err_slabname:
        free_percpu(net->ct.stat);
err_stat:
        return ret;
}

s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
                        enum ip_conntrack_dir dir,
                        u32 seq);
EXPORT_SYMBOL_GPL(nf_ct_nat_offset);

int nf_conntrack_init(struct net *net)
{
        int ret;

        if (net_eq(net, &init_net)) {
                ret = nf_conntrack_init_init_net();
                if (ret < 0)
                        goto out_init_net;
        }
        ret = nf_conntrack_init_net(net);
        if (ret < 0)
                goto out_net;

        if (net_eq(net, &init_net)) {
                /* For use by REJECT target */
                rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
                rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);

                /* How to get NAT offsets */
                rcu_assign_pointer(nf_ct_nat_offset, NULL);
        }
        return 0;

out_net:
        if (net_eq(net, &init_net))
                nf_conntrack_cleanup_init_net();
out_init_net:
        return ret;
}