/* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>

#define NF_CONNTRACK_VERSION	"0.5.0"

#ifdef HNDCTF
#include <linux/if.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>

#ifdef CONFIG_IPV6
#include <linux/ipv6.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#define IPVERSION_IS_4(ipver)	((ipver) == 4)
#else
#define IPVERSION_IS_4(ipver)	1
#endif /* CONFIG_IPV6 */

#include <net/ip.h>
#include <net/route.h>
#include <typedefs.h>
#include <osl.h>
#include <ctf/hndctf.h>
#include <ctf/ctf_cfg.h>

#define NFC_CTF_ENABLED	(1 << 31)
#else
#define BCMFASTPATH_HOST
#endif /* HNDCTF */

int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
				      enum nf_nat_manip_type manip,
				      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);

DEFINE_SPINLOCK(nf_conntrack_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);

DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);

#ifdef HNDCTF
/*
 *	Display an IP address in readable format.
 */
#define NIPQUAD(addr) \
	((unsigned char *)&addr)[0], \
	((unsigned char *)&addr)[1], \
	((unsigned char *)&addr)[2], \
	((unsigned char *)&addr)[3]

/* Returns the number of 1-bits in x */
static int
_popcounts(uint32 x)
{
	x = x - ((x >> 1) & 0x55555555);
	x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
	x = (x + (x >> 4)) & 0x0F0F0F0F;
	x = (x + (x >> 16));
	return (x + (x >> 8)) & 0x0000003F;
}
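/*
 * Illustration only (not built): _popcounts() above is the classic SWAR
 * population count - each step folds adjacent bit-fields into wider
 * partial sums (2-bit, then 4-bit, then byte-wide). A minimal
 * bit-at-a-time equivalent, handy for sanity-checking the folded
 * version, assuming the same uint32 typedef:
 */
#if 0
static int _popcounts_slow(uint32 x)
{
	int n = 0;

	while (x != 0) {
		n += x & 1;	/* count the lowest bit */
		x >>= 1;	/* and move on to the next one */
	}
	return n;		/* e.g. _popcounts_slow(0xF0F00003) == 10 */
}
#endif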
bool
ip_conntrack_is_ipc_allowed(struct sk_buff *skb, u_int32_t hooknum)
{
	struct net_device *dev;

	if (!CTF_ENAB(kcih))
		return FALSE;

	if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_POST_ROUTING) {
		dev = skb->dev;
		if (dev->priv_flags & IFF_802_1Q_VLAN)
			dev = vlan_dev_real_dev(dev);

		/* Add ipc entry if packet is received on ctf enabled interface
		 * and the packet is not a defrag'd one.
		 */
		if (ctf_isenabled(kcih, dev) && (skb->len <= dev->mtu))
			skb->nfcache |= NFC_CTF_ENABLED;
	}

	/* Add the cache entries only if the device has registered and
	 * enabled ctf.
	 */
	if (skb->nfcache & NFC_CTF_ENABLED)
		return TRUE;

	return FALSE;
}
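/*
 * Illustration only (not built): the gate above reduces to a pure
 * predicate - fast-path candidates are packets seen at the PRE/POST
 * routing hooks, arriving on a CTF-enabled (real, non-VLAN) device,
 * that were not reassembled from fragments (a defragmented skb is
 * longer than the device MTU). A sketch with the device lookup
 * abstracted into a plain flag (hypothetical helper, not part of CTF):
 */
#if 0
static bool ipc_candidate(u_int32_t hooknum, bool ctf_enabled_dev,
			  unsigned int skb_len, unsigned int mtu)
{
	if (hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_POST_ROUTING)
		return false;
	return ctf_enabled_dev && skb_len <= mtu;
}
#endif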
void
ip_conntrack_ipct_add(struct sk_buff *skb, u_int32_t hooknum,
		      struct nf_conn *ct, enum ip_conntrack_info ci,
		      struct nf_conntrack_tuple *manip)
{
	ctf_ipc_t ipc_entry;
	struct hh_cache *hh;
	struct ethhdr *eth;
	struct iphdr *iph;
	struct tcphdr *tcph;
	struct rtable *rt;
	struct nf_conn_help *help;
	enum ip_conntrack_dir dir;
	uint8 ipver, protocol;
#ifdef CONFIG_IPV6
	struct ipv6hdr *ip6h = NULL;
#endif /* CONFIG_IPV6 */
	uint32 nud_flags;

	if ((skb == NULL) || (ct == NULL))
		return;

	/* Check CTF enabled */
	if (!ip_conntrack_is_ipc_allowed(skb, hooknum))
		return;

	/* We only add cache entries for non-helper connections and at
	 * pre or post routing hooks.
	 */
	help = nfct_help(ct);
	if ((help && help->helper) || (ct->ctf_flags & CTF_FLAGS_EXCLUDED) ||
	    ((hooknum != NF_INET_PRE_ROUTING) && (hooknum != NF_INET_POST_ROUTING)))
		return;

	iph = ip_hdr(skb);
	ipver = iph->version;

	/* Support both IPv4 and IPv6 */
	if (ipver == 4) {
		tcph = ((struct tcphdr *)(((__u8 *)iph) + (iph->ihl << 2)));
		protocol = iph->protocol;
	}
#ifdef CONFIG_IPV6
	else if (ipver == 6) {
		ip6h = (struct ipv6hdr *)iph;
		tcph = (struct tcphdr *)ctf_ipc_lkup_l4proto(kcih, ip6h, &protocol);
		if (tcph == NULL)
			return;
	}
#endif /* CONFIG_IPV6 */
	else
		return;

	/* Only TCP and UDP are supported */
	if (protocol == IPPROTO_TCP) {
		/* Add ipc entries for connections in established state only */
		if ((ci != IP_CT_ESTABLISHED) && (ci != (IP_CT_ESTABLISHED + IP_CT_IS_REPLY)))
			return;

		if (ct->proto.tcp.state >= TCP_CONNTRACK_FIN_WAIT &&
		    ct->proto.tcp.state <= TCP_CONNTRACK_TIME_WAIT)
			return;
	}
	else if (protocol != IPPROTO_UDP)
		return;

	dir = CTINFO2DIR(ci);
	if (ct->ctf_flags & (1 << dir))
		return;

	/* Do route lookup for alias address if we are doing DNAT in this
	 * direction.
	 */
	if (skb_dst(skb) == NULL) {
		/* Find the destination interface */
		if (IPVERSION_IS_4(ipver)) {
			u_int32_t daddr;

			if ((manip != NULL) && (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST))
				daddr = manip->dst.u3.ip;
			else
				daddr = iph->daddr;
			ip_route_input(skb, daddr, iph->saddr, iph->tos, skb->dev);
		}
#ifdef CONFIG_IPV6
		else
			ip6_route_input(skb);
#endif /* CONFIG_IPV6 */
	}

	/* Ensure the packet belongs to a forwarding connection and it is
	 * destined to a unicast address.
	 */
	rt = (struct rtable *)skb_dst(skb);

	nud_flags = NUD_PERMANENT | NUD_REACHABLE | NUD_STALE | NUD_DELAY | NUD_PROBE;
#ifdef CTF_PPPOE
	if ((skb_dst(skb) != NULL) && (skb_dst(skb)->dev != NULL) &&
	    (skb_dst(skb)->dev->flags & IFF_POINTOPOINT))
		nud_flags |= NUD_NOARP;
#endif

	if ((rt == NULL) || (
#ifdef CONFIG_IPV6
	    !IPVERSION_IS_4(ipver) ?
	     ((rt->dst.input != ip6_forward) ||
	      !(ipv6_addr_type(&ip6h->daddr) & IPV6_ADDR_UNICAST)) :
#endif /* CONFIG_IPV6 */
	     ((rt->dst.input != ip_forward) || (rt->rt_type != RTN_UNICAST))) ||
	    (rt->dst.neighbour == NULL) ||
	    ((rt->dst.neighbour->nud_state & nud_flags) == 0))
		return;

	memset(&ipc_entry, 0, sizeof(ipc_entry));

	/* Init the neighboring sender address */
	memcpy(ipc_entry.sa.octet, eth_hdr(skb)->h_source, ETH_ALEN);

	/* If the packet is received on a bridge device then save
	 * the bridge cache entry pointer in the ip cache entry.
	 * This will be referenced in the data path to update the
	 * live counter of the brc entry whenever a received packet
	 * matches the corresponding ipc entry.
	 */
	if ((skb->dev != NULL) && ctf_isbridge(kcih, skb->dev)) {
		ipc_entry.brcp = ctf_brc_lkup(kcih, eth_hdr(skb)->h_source, FALSE);
	}

	hh = skb_dst(skb)->hh;
	if (hh != NULL) {
		eth = (struct ethhdr *)(((unsigned char *)hh->hh_data) + 2);
		memcpy(ipc_entry.dhost.octet, eth->h_dest, ETH_ALEN);
		memcpy(ipc_entry.shost.octet, eth->h_source, ETH_ALEN);
	} else {
		memcpy(ipc_entry.dhost.octet, rt->dst.neighbour->ha, ETH_ALEN);
		memcpy(ipc_entry.shost.octet, skb_dst(skb)->dev->dev_addr, ETH_ALEN);
	}

	/* Add ctf ipc entry for this direction */
	if (IPVERSION_IS_4(ipver)) {
		ipc_entry.tuple.sip[0] = iph->saddr;
		ipc_entry.tuple.dip[0] = iph->daddr;
#ifdef CONFIG_IPV6
	} else {
		memcpy(ipc_entry.tuple.sip, &ip6h->saddr, sizeof(ipc_entry.tuple.sip));
		memcpy(ipc_entry.tuple.dip, &ip6h->daddr, sizeof(ipc_entry.tuple.dip));
#endif /* CONFIG_IPV6 */
	}
	ipc_entry.tuple.proto = protocol;
	ipc_entry.tuple.sp = tcph->source;
	ipc_entry.tuple.dp = tcph->dest;

	ipc_entry.next = NULL;

	/* For vlan interfaces fill the vlan id and the tag/untag actions */
	if (skb_dst(skb)->dev->priv_flags & IFF_802_1Q_VLAN) {
		ipc_entry.txif = (void *)vlan_dev_real_dev(skb_dst(skb)->dev);
		ipc_entry.vid = vlan_dev_vlan_id(skb_dst(skb)->dev);
		ipc_entry.action = ((vlan_dev_vlan_flags(skb_dst(skb)->dev) & 1) ?
				    CTF_ACTION_TAG : CTF_ACTION_UNTAG);
	} else {
		ipc_entry.txif = skb_dst(skb)->dev;
		ipc_entry.action = CTF_ACTION_UNTAG;
	}

#ifdef CTF_PPPOE
	/* For pppoe interfaces fill the session id and header add/del actions */
	ipc_entry.pppoe_sid = -1;
	if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT) {
		/* Transmit interface and sid will be populated by pppoe module */
		ipc_entry.action |= CTF_ACTION_PPPOE_ADD;
		skb->ctf_pppoe_cb[0] = 2;
		ipc_entry.ppp_ifp = skb_dst(skb)->dev;
	} else if ((skb->dev->flags & IFF_POINTOPOINT) && (skb->ctf_pppoe_cb[0] == 1)) {
		ipc_entry.action |= CTF_ACTION_PPPOE_DEL;
		ipc_entry.pppoe_sid = *(uint16 *)&skb->ctf_pppoe_cb[2];
		ipc_entry.ppp_ifp = skb->dev;
	}
#endif

	if (((ipc_entry.tuple.proto == IPPROTO_TCP) && (kcih->ipc_suspend & CTF_SUSPEND_TCP_MASK)) ||
	    ((ipc_entry.tuple.proto == IPPROTO_UDP) && (kcih->ipc_suspend & CTF_SUSPEND_UDP_MASK))) {
		/* The default action is suspend */
		ipc_entry.action |= CTF_ACTION_SUSPEND;
		ipc_entry.susp_cnt = ((ipc_entry.tuple.proto == IPPROTO_TCP) ?
				      _popcounts(kcih->ipc_suspend & CTF_SUSPEND_TCP_MASK) :
				      _popcounts(kcih->ipc_suspend & CTF_SUSPEND_UDP_MASK));
	}

	/* Copy the DSCP value. ECN bits must be cleared. */
	if (IPVERSION_IS_4(ipver))
		ipc_entry.tos = IPV4_TOS(iph);
#ifdef CONFIG_IPV6
	else
		ipc_entry.tos = IPV6_TRAFFIC_CLASS(ip6h);
#endif /* CONFIG_IPV6 */
	ipc_entry.tos &= IPV4_TOS_DSCP_MASK;
	if (ipc_entry.tos)
		ipc_entry.action |= CTF_ACTION_TOS;

#ifdef CONFIG_NF_CONNTRACK_MARK
	/* Initialize the mark for this connection */
	if (ct->mark != 0) {
		ipc_entry.mark.value = ct->mark;
		ipc_entry.action |= CTF_ACTION_MARK;
	}
#endif /* CONFIG_NF_CONNTRACK_MARK */

	/* Update the manip ip and port */
	if (manip != NULL) {
		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
			ipc_entry.nat.ip = manip->src.u3.ip;
			ipc_entry.nat.port = manip->src.u.tcp.port;
			ipc_entry.action |= CTF_ACTION_SNAT;
		} else {
			ipc_entry.nat.ip = manip->dst.u3.ip;
			ipc_entry.nat.port = manip->dst.u.tcp.port;
			ipc_entry.action |= CTF_ACTION_DNAT;
		}
	}

	/* Do bridge cache lookup to determine outgoing interface
	 * and any vlan tagging actions if needed.
	 */
	if (ctf_isbridge(kcih, ipc_entry.txif)) {
		ctf_brc_t *brcp;

		ctf_brc_acquire(kcih);

		if ((brcp = ctf_brc_lkup(kcih, ipc_entry.dhost.octet, TRUE)) != NULL) {
			ipc_entry.txbif = ipc_entry.txif;
			ipc_entry.action |= brcp->action;
			ipc_entry.txif = brcp->txifp;
			ipc_entry.vid = brcp->vid;
		}

		ctf_brc_release(kcih);
	}

#ifdef DEBUG
	if (IPVERSION_IS_4(ipver))
		printk("%s: Adding ipc entry for [%d]%u.%u.%u.%u:%u - %u.%u.%u.%u:%u\n", __FUNCTION__,
		       ipc_entry.tuple.proto,
		       NIPQUAD(ipc_entry.tuple.sip[0]), ntohs(ipc_entry.tuple.sp),
		       NIPQUAD(ipc_entry.tuple.dip[0]), ntohs(ipc_entry.tuple.dp));
#ifdef CONFIG_IPV6
	else
		printk("\n%s: Adding ipc entry for [%d]\n"
		       "%08x.%08x.%08x.%08x:%u => %08x.%08x.%08x.%08x:%u\n",
		       __FUNCTION__, ipc_entry.tuple.proto,
		       ntohl(ipc_entry.tuple.sip[0]), ntohl(ipc_entry.tuple.sip[1]),
		       ntohl(ipc_entry.tuple.sip[2]), ntohl(ipc_entry.tuple.sip[3]),
		       ntohs(ipc_entry.tuple.sp),
		       ntohl(ipc_entry.tuple.dip[0]), ntohl(ipc_entry.tuple.dip[1]),
		       ntohl(ipc_entry.tuple.dip[2]), ntohl(ipc_entry.tuple.dip[3]),
		       ntohs(ipc_entry.tuple.dp));
#endif /* CONFIG_IPV6 */
	printk("sa %02x:%02x:%02x:%02x:%02x:%02x\n",
	       ipc_entry.shost.octet[0], ipc_entry.shost.octet[1],
	       ipc_entry.shost.octet[2], ipc_entry.shost.octet[3],
	       ipc_entry.shost.octet[4], ipc_entry.shost.octet[5]);
	printk("da %02x:%02x:%02x:%02x:%02x:%02x\n",
	       ipc_entry.dhost.octet[0], ipc_entry.dhost.octet[1],
	       ipc_entry.dhost.octet[2], ipc_entry.dhost.octet[3],
	       ipc_entry.dhost.octet[4], ipc_entry.dhost.octet[5]);
	printk("[%d] vid: %d action %x\n", hooknum, ipc_entry.vid, ipc_entry.action);
	if (manip != NULL)
		printk("manip_ip: %u.%u.%u.%u manip_port %u\n",
		       NIPQUAD(ipc_entry.nat.ip), ntohs(ipc_entry.nat.port));
	printk("txif: %s\n", ((struct net_device *)ipc_entry.txif)->name);
#endif

	ctf_ipc_add(kcih, &ipc_entry, !IPVERSION_IS_4(ipver));

#ifdef CTF_PPPOE
	if (skb->ctf_pppoe_cb[0] == 2) {
		ctf_ipc_t *ipct;
		ipct = ctf_ipc_lkup(kcih, &ipc_entry, ipver == 6);
		*(uint32 *)&skb->ctf_pppoe_cb[4] = (uint32)ipct;
		if (ipct != NULL)
			ctf_ipc_release(kcih, ipct);
	}
#endif

	/* Update the attributes flag to indicate a CTF conn */
	ct->ctf_flags |= (CTF_FLAGS_CACHED | (1 << dir));
}

int
ip_conntrack_ipct_delete(struct nf_conn *ct, int ct_timeout)
{
	ctf_ipc_t *ipct;
	struct nf_conntrack_tuple *orig, *repl;
	ctf_ipc_t orig_ipct, repl_ipct;
	int ipaddr_sz;
	bool v6;

	if (!CTF_ENAB(kcih))
		return (0);

	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;

	if ((orig->dst.protonum != IPPROTO_TCP) && (orig->dst.protonum != IPPROTO_UDP))
		return (0);

	repl = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;

#ifdef CONFIG_IPV6
	v6 = (orig->src.l3num == AF_INET6);
	ipaddr_sz = (v6) ? sizeof(struct in6_addr) : sizeof(struct in_addr);
#else
	v6 = FALSE;
	ipaddr_sz = sizeof(struct in_addr);
#endif /* CONFIG_IPV6 */

	memset(&orig_ipct, 0, sizeof(orig_ipct));
	memcpy(orig_ipct.tuple.sip, &orig->src.u3.ip, ipaddr_sz);
	memcpy(orig_ipct.tuple.dip, &orig->dst.u3.ip, ipaddr_sz);
	orig_ipct.tuple.proto = orig->dst.protonum;
	orig_ipct.tuple.sp = orig->src.u.tcp.port;
	orig_ipct.tuple.dp = orig->dst.u.tcp.port;

	memset(&repl_ipct, 0, sizeof(repl_ipct));
	memcpy(repl_ipct.tuple.sip, &repl->src.u3.ip, ipaddr_sz);
	memcpy(repl_ipct.tuple.dip, &repl->dst.u3.ip, ipaddr_sz);
	repl_ipct.tuple.proto = repl->dst.protonum;
	repl_ipct.tuple.sp = repl->src.u.tcp.port;
	repl_ipct.tuple.dp = repl->dst.u.tcp.port;

	/* If the refresh counter of the ipc entry is non-zero, it indicates
	 * that the packet transfer is active and we should not delete
	 * the conntrack entry.
	 */
	if (ct_timeout) {
		ipct = ctf_ipc_lkup(kcih, &orig_ipct, v6);

		/* Postpone the deletion of the ct entry if there are frames
		 * flowing in this direction.
		 */
		if (ipct != NULL) {
#ifdef BCMFA
			ctf_live(kcih, ipct, v6);
#endif
			if (ipct->live > 0) {
				ipct->live = 0;
				ctf_ipc_release(kcih, ipct);
				ct->timeout.expires = jiffies + ct->expire_jiffies;
				add_timer(&ct->timeout);
				return (-1);
			}
			ctf_ipc_release(kcih, ipct);
		}

		ipct = ctf_ipc_lkup(kcih, &repl_ipct, v6);

		if (ipct != NULL) {
#ifdef BCMFA
			ctf_live(kcih, ipct, v6);
#endif
			if (ipct->live > 0) {
				ipct->live = 0;
				ctf_ipc_release(kcih, ipct);
				ct->timeout.expires = jiffies + ct->expire_jiffies;
				add_timer(&ct->timeout);
				return (-1);
			}
			ctf_ipc_release(kcih, ipct);
		}
	}

	/* If there are no packets over this connection for the timeout period
	 * delete the entries.
	 */
	ctf_ipc_delete(kcih, &orig_ipct, v6);

	ctf_ipc_delete(kcih, &repl_ipct, v6);

#ifdef DEBUG
	printk("%s: Deleting the tuple %x %x %d %d %d\n",
	       __FUNCTION__, orig->src.u3.ip, orig->dst.u3.ip, orig->dst.protonum,
	       orig->src.u.tcp.port, orig->dst.u.tcp.port);
	printk("%s: Deleting the tuple %x %x %d %d %d\n",
	       __FUNCTION__, repl->dst.u3.ip, repl->src.u3.ip, repl->dst.protonum,
	       repl->dst.u.tcp.port, repl->src.u.tcp.port);
#endif

	return (0);
}
void
ip_conntrack_ipct_default_fwd_set(uint8 protocol, ctf_fwd_t fwd, uint8 userid)
{
	ctf_cfg_request_t req;
	ctf_fwd_t *f;
	uint8 *p;
	uint8 *uid;

	memset(&req, '\0', sizeof(req));
	req.command_id = CTFCFG_CMD_DEFAULT_FWD_SET;
	req.size = sizeof(ctf_fwd_t) + sizeof(uint8) + sizeof(uint8);
	f = (ctf_fwd_t *)req.arg;
	*f = fwd;
	p = (req.arg + sizeof(ctf_fwd_t));
	*p = protocol;
	uid = (req.arg + sizeof(ctf_fwd_t) + sizeof(uint8));
	*uid = userid;

	ctf_cfg_req_process(kcih, &req);
}
EXPORT_SYMBOL(ip_conntrack_ipct_default_fwd_set);

uint32
ip_conntrack_ipct_resume(struct sk_buff *skb, u_int32_t hooknum,
			 struct nf_conn *ct, enum ip_conntrack_info ci)
{
	struct iphdr *iph;
	struct tcphdr *tcph;
	struct nf_conn_help *help;
	uint8 ipver, protocol;
#ifdef CONFIG_IPV6
	struct ipv6hdr *ip6h = NULL;
#endif /* CONFIG_IPV6 */
	uint32 *ct_mark_p;

	ctf_cfg_request_t req;
	ctf_tuple_t tuple, *tp = NULL;

	if ((skb == NULL) || (ct == NULL))
		return 0;

	/* Check CTF enabled */
	if (!ip_conntrack_is_ipc_allowed(skb, hooknum))
		return 0;

	/* We only add cache entries for non-helper connections and at
	 * pre or post routing hooks.
	 */
	help = nfct_help(ct);
	if ((help && help->helper) || (ct->ctf_flags & CTF_FLAGS_EXCLUDED) ||
	    ((hooknum != NF_INET_PRE_ROUTING) && (hooknum != NF_INET_POST_ROUTING)))
		return 0;

	iph = ip_hdr(skb);
	ipver = iph->version;

	/* Support both IPv4 and IPv6 */
	if (ipver == 4) {
		tcph = ((struct tcphdr *)(((__u8 *)iph) + (iph->ihl << 2)));
		protocol = iph->protocol;
	}
#ifdef CONFIG_IPV6
	else if (ipver == 6) {
		ip6h = (struct ipv6hdr *)iph;
		tcph = (struct tcphdr *)ctf_ipc_lkup_l4proto(kcih, ip6h, &protocol);
		if (tcph == NULL)
			return 0;
	}
#endif /* CONFIG_IPV6 */
	else
		return 0;

	/* Only TCP and UDP are supported */
	if (protocol == IPPROTO_TCP) {
		/* Add ipc entries for connections in established state only */
		if ((ci != IP_CT_ESTABLISHED) && (ci != (IP_CT_ESTABLISHED + IP_CT_IS_REPLY)))
			return 0;

		if (ct->proto.tcp.state >= TCP_CONNTRACK_FIN_WAIT &&
		    ct->proto.tcp.state <= TCP_CONNTRACK_TIME_WAIT)
			return 0;
	}
	else if (protocol != IPPROTO_UDP)
		return 0;

	memset(&tuple, '\0', sizeof(tuple));
	if (IPVERSION_IS_4(ipver)) {
		memcpy(&tuple.src_addr, &iph->saddr, sizeof(uint32));
		memcpy(&tuple.dst_addr, &iph->daddr, sizeof(uint32));
		tuple.family = AF_INET;
#ifdef CONFIG_IPV6
	} else {
		memcpy(&tuple.src_addr, &ip6h->saddr, IPV6_ADDR_LEN);
		memcpy(&tuple.dst_addr, &ip6h->daddr, IPV6_ADDR_LEN);
		tuple.family = AF_INET6;
#endif /* CONFIG_IPV6 */
	}
	tuple.src_port = tcph->source;
	tuple.dst_port = tcph->dest;
	tuple.protocol = protocol;

#ifdef CONFIG_NF_CONNTRACK_MARK
	if (ct->mark != 0) {
		/* To update the mark */
		memset(&req, '\0', sizeof(req));
		req.command_id = CTFCFG_CMD_UPD_MARK;
		req.size = sizeof(ctf_tuple_t) + sizeof(uint32);
		tp = (ctf_tuple_t *)req.arg;
		*tp = tuple;
		ct_mark_p = (uint32 *)(req.arg + sizeof(ctf_tuple_t));
		*ct_mark_p = ct->mark;
		ctf_cfg_req_process(kcih, &req);

		/* To update the ipct txif */
		memset(&req, '\0', sizeof(req));
		req.command_id = CTFCFG_CMD_CHANGE_TXIF_TO_BR;
		req.size = sizeof(ctf_tuple_t);
		tp = (ctf_tuple_t *)req.arg;
		*tp = tuple;
		ctf_cfg_req_process(kcih, &req);
	}
#endif /* CONFIG_NF_CONNTRACK_MARK */

	/* To resume */
	memset(&req, '\0', sizeof(req));
	req.command_id = CTFCFG_CMD_RESUME;
	req.size = sizeof(ctf_tuple_t);
	tp = (ctf_tuple_t *)req.arg;
	*tp = tuple;
	ctf_cfg_req_process(kcih, &req);

	return req.status;
}
EXPORT_SYMBOL(ip_conntrack_ipct_resume);
#endif /* HNDCTF */
static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;

static u_int32_t BCMFASTPATH_HOST __hash_conntrack(const struct nf_conntrack_tuple *tuple,
				  u16 zone, unsigned int size, unsigned int rnd)
{
	unsigned int n;
	u_int32_t h;

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	h = jhash2((u32 *)tuple, n,
		   zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
				 tuple->dst.protonum));

	return ((u64)h * size) >> 32;
}

static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
				       const struct nf_conntrack_tuple *tuple)
{
	return __hash_conntrack(tuple, zone, net->ct.htable_size,
				nf_conntrack_hash_rnd);
}
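/*
 * Illustration only (not built): the "((u64)h * size) >> 32" in
 * __hash_conntrack() maps a 32-bit hash onto [0, size) without a
 * modulo: h / 2^32 is a fraction in [0, 1), so multiplying by the
 * table size scales it to a bucket index. For example:
 */
#if 0
static unsigned int bucket_of(u_int32_t h, unsigned int size)
{
	/* h = 0x80000000 (i.e. 0.5) with size = 4096 yields 2048 */
	return ((u64)h * size) >> 32;
}
#endif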
bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_l4proto *l4proto)
{
	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return false;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return l4proto->pkt_to_tuple(skb, dataoff, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num, struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int protoff;
	u_int8_t protonum;
	int ret;

	rcu_read_lock();

	l3proto = __nf_ct_l3proto_find(l3num);
	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
	if (ret != NF_ACCEPT) {
		rcu_read_unlock();
		return false;
	}

	l4proto = __nf_ct_l4proto_find(l3num, protonum);

	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
			      l3proto, l4proto);

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return false;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return l4proto->invert_tuple(inverse, orig);
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
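/*
 * Illustration only (not built): inverting a tuple mirrors the flow,
 * which is how return traffic is recognised (and what NAT relies on).
 * A sketch of the address/port mirroring that the l3proto/l4proto
 * invert_tuple() handlers perform (hypothetical helper):
 */
#if 0
static void invert_addr_port(struct nf_conntrack_tuple *inverse,
			     const struct nf_conntrack_tuple *orig)
{
	inverse->src.u3 = orig->dst.u3;		/* reply src <- orig dst */
	inverse->dst.u3 = orig->src.u3;		/* reply dst <- orig src */
	inverse->src.u.all = orig->dst.u.all;	/* ports swap the same way */
	inverse->dst.u.all = orig->src.u.all;
}
#endif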
static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_l4proto *l4proto;

	pr_debug("destroy_conntrack(%p)\n", ct);
	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
	NF_CT_ASSERT(!timer_pending(&ct->timeout));

#if 0 //Don't let conntrack delete the CTF entry
//#ifdef HNDCTF
	ip_conntrack_ipct_delete(ct, 0);
#endif /* HNDCTF */
	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to nf_conntrack_lock!!! -HW */
	rcu_read_lock();
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto && l4proto->destroy)
		l4proto->destroy(ct);

	rcu_read_unlock();

	spin_lock_bh(&nf_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	nf_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!nf_ct_is_confirmed(ct)) {
		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	}

	NF_CT_STAT_INC(net, delete);
	spin_unlock_bh(&nf_conntrack_lock);

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_helper_destroy(ct);
	spin_lock_bh(&nf_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(net, delete_list);
	clean_from_lists(ct);
	spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);

static void death_by_event(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;
	struct net *net = nf_ct_net(ct);

	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
		/* bad luck, let's retry again */
		ct->timeout.expires = jiffies +
			(random32() % net->ct.sysctl_events_retry_timeout);
		add_timer(&ct->timeout);
		return;
	}
	/* we've got the event delivered, now it's dying */
	set_bit(IPS_DYING_BIT, &ct->status);
	spin_lock(&nf_conntrack_lock);
	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&nf_conntrack_lock);
	nf_ct_put(ct);
}

void nf_ct_insert_dying_list(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	/* add this conntrack to the dying list */
	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &net->ct.dying);
	spin_unlock_bh(&nf_conntrack_lock);
	/* set a new timer to retry event delivery */
	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
	ct->timeout.expires = jiffies +
		(random32() % net->ct.sysctl_events_retry_timeout);
	add_timer(&ct->timeout);
}
EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;
#if 0 //Don't let conntrack delete the CTF entry
//#ifdef HNDCTF
	/* If a negative error is returned it means the entry hasn't
	 * timed out yet.
	 */
	if (ip_conntrack_ipct_delete(ct, jiffies >= ct->timeout.expires ? 1 : 0) != 0)
		return;
#endif /* HNDCTF */

	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
		/* destroy event was not delivered */
		nf_ct_delete_from_lists(ct);
		nf_ct_insert_dying_list(ct);
		return;
	}
	set_bit(IPS_DYING_BIT, &ct->status);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 * OR
 * - Caller must lock nf_conntrack_lock before calling this function
 */
struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST
__nf_conntrack_find(struct net *net, u16 zone,
		    const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int hash = hash_conntrack(net, zone, tuple);

	/* Disable BHs the entire time since we normally need to disable them
	 * at least once for the stats anyway.
	 */
	local_bh_disable();
begin:
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		if (nf_ct_tuple_equal(tuple, &h->tuple) &&
		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
			NF_CT_STAT_INC(net, found);
			local_bh_enable();
			return h;
		}
		NF_CT_STAT_INC(net, searched);
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC(net, search_restart);
		goto begin;
	}
	local_bh_enable();

	return NULL;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);
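/*
 * Illustration only (not built): with SLAB_DESTROY_BY_RCU an entry can
 * be freed and reused as a different conntrack while we walk a chain,
 * so a lockless traversal can end up spliced onto another bucket's
 * list. Each list is therefore terminated by a "nulls" marker that
 * encodes the bucket index (see INIT_HLIST_NULLS_HEAD() near the end
 * of this file); the restart test above checks exactly this invariant:
 */
#if 0
static bool walk_stayed_in_bucket(const struct hlist_nulls_node *end,
				  unsigned int hash)
{
	/* is_a_nulls() is true for a list terminator; the value stored
	 * in the terminator names the bucket that owns it. */
	return is_a_nulls(end) && get_nulls_value(end) == hash;
}
#endif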
/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST
nf_conntrack_find_get(struct net *net, u16 zone,
		      const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();
begin:
	h = __nf_conntrack_find(net, zone, tuple);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (unlikely(nf_ct_is_dying(ct) ||
			     !atomic_inc_not_zero(&ct->ct_general.use)))
			h = NULL;
		else {
			if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
				     nf_ct_zone(ct) != zone)) {
				nf_ct_put(ct);
				goto begin;
			}
		}
	}
	rcu_read_unlock();

	return h;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int repl_hash)
{
	struct net *net = nf_ct_net(ct);

	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &net->ct.hash[repl_hash]);
}

void nf_conntrack_hash_insert(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, repl_hash;
	u16 zone;

	zone = nf_ct_zone(ct);
	hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	__nf_conntrack_hash_insert(ct, hash, repl_hash);
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int hash, repl_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	u16 zone;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	pr_debug("Confirming conntrack %p\n", ct);

	spin_lock_bh(&nf_conntrack_lock);

	/* We have to check the DYING flag inside the lock to prevent
	   a race against nf_ct_get_next_corpse() possibly called from
	   user context, else we insert an already 'dead' hash, blocking
	   further use of that particular connection -JM */

	if (unlikely(nf_ct_is_dying(ct))) {
		spin_unlock_bh(&nf_conntrack_lock);
		return NF_ACCEPT;
	}

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      &h->tuple) &&
		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
			goto out;
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				      &h->tuple) &&
		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
			goto out;

	/* Remove from unconfirmed list */
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout.expires += jiffies;
	add_timer(&ct->timeout);
	atomic_inc(&ct->ct_general.use);
	set_bit(IPS_CONFIRMED_BIT, &ct->status);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, repl_hash);
	NF_CT_STAT_INC(net, insert);
	spin_unlock_bh(&nf_conntrack_lock);

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	NF_CT_STAT_INC(net, insert_failed);
	spin_unlock_bh(&nf_conntrack_lock);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;
	u16 zone = nf_ct_zone(ignored_conntrack);
	unsigned int hash = hash_conntrack(net, zone, tuple);

	/* Disable BHs the entire time since we need to disable them at
	 * least once for the stats anyway.
	 */
	rcu_read_lock_bh();
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (ct != ignored_conntrack &&
		    nf_ct_tuple_equal(tuple, &h->tuple) &&
		    nf_ct_zone(ct) == zone) {
			NF_CT_STAT_INC(net, found);
			rcu_read_unlock_bh();
			return 1;
		}
		NF_CT_STAT_INC(net, searched);
	}
	rcu_read_unlock_bh();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static noinline int early_drop(struct net *net, unsigned int hash)
{
	/* Use oldest entry, which is roughly LRU */
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct = NULL, *tmp;
	struct hlist_nulls_node *n;
	unsigned int i, cnt = 0;
	int dropped = 0;

	rcu_read_lock();
	for (i = 0; i < net->ct.htable_size; i++) {
		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
					       hnnode) {
			tmp = nf_ct_tuplehash_to_ctrack(h);
			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
				ct = tmp;
			cnt++;
		}

		if (ct != NULL) {
			if (likely(!nf_ct_is_dying(ct) &&
				   atomic_inc_not_zero(&ct->ct_general.use)))
				break;
			else
				ct = NULL;
		}

		if (cnt >= NF_CT_EVICTION_RANGE)
			break;

		hash = (hash + 1) % net->ct.htable_size;
	}
	rcu_read_unlock();

	if (!ct)
		return dropped;

#if 0 //Don't let conntrack delete the CTF entry
//#ifdef HNDCTF
	ip_conntrack_ipct_delete(ct, 0);
#endif /* HNDCTF */

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		NF_CT_STAT_INC_ATOMIC(net, early_drop);
	}
	nf_ct_put(ct);
	return dropped;
}

struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	struct nf_conn *ct;

	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
		get_random_bytes(&nf_conntrack_hash_rnd,
				 sizeof(nf_conntrack_hash_rnd));
		nf_conntrack_hash_rnd_initted = 1;
	}

	/* We don't want any race condition at early drop stage */
	atomic_inc(&net->ct.count);

	if (nf_conntrack_max &&
	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
		unsigned int hash = hash_conntrack(net, zone, orig);
		if (!early_drop(net, hash)) {
			atomic_dec(&net->ct.count);
			if (net_ratelimit())
				printk(KERN_WARNING
				       "nf_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_DESTROY_BY_RCU.
	 */
	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
	if (ct == NULL) {
		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
		atomic_dec(&net->ct.count);
		return ERR_PTR(-ENOMEM);
	}
	/*
	 * Leave ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
	 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
	 */
	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
	       sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
	/* Don't set timer yet: wait for confirmation */
	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
	write_pnet(&ct->ct_net, net);
#ifdef CONFIG_NF_CONNTRACK_ZONES
	if (zone) {
		struct nf_conntrack_zone *nf_ct_zone;

		nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
		if (!nf_ct_zone)
			goto out_free;
		nf_ct_zone->id = zone;
	}
#endif
	/*
	 * changes to lookup keys must be done before setting refcnt to 1
	 */
	smp_wmb();
	atomic_set(&ct->ct_general.use, 1);
	return ct;

#ifdef CONFIG_NF_CONNTRACK_ZONES
out_free:
	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
	return ERR_PTR(-ENOMEM);
#endif
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
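/*
 * Illustration only (not built): the memset in nf_conntrack_alloc()
 * deliberately zeroes only the tail of struct nf_conn, because the
 * hnnode.next pointers of a recycled entry may still be chased by a
 * concurrent RCU reader. The idiom - zero everything from a given
 * member onwards - in isolation:
 */
#if 0
struct example {
	int keep_a, keep_b;	/* may still be read via RCU; preserved */
	int wipe_c, wipe_d;	/* private to the new user; cleared */
};

static void zero_tail(struct example *e)
{
	memset(&e->wipe_c, 0,
	       sizeof(*e) - offsetof(struct example, wipe_c));
}
#endif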
void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_ext_destroy(ct);
	atomic_dec(&net->ct.count);
	nf_ct_ext_free(ct);
	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct nf_conntrack_l3proto *l3proto,
	       struct nf_conntrack_l4proto *l4proto,
	       struct sk_buff *skb,
	       unsigned int dataoff)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_ecache *ecache;
	struct nf_conntrack_expect *exp;
	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
	if (IS_ERR(ct)) {
		pr_debug("Can't allocate conntrack.\n");
		return (struct nf_conntrack_tuple_hash *)ct;
	}

	if (!l4proto->new(ct, skb, dataoff)) {
		nf_conntrack_free(ct);
		pr_debug("init conntrack: can't track with proto module\n");
		return NULL;
	}

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);

	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
			     ecache ? ecache->expmask : 0,
			     GFP_ATOMIC);

	spin_lock_bh(&nf_conntrack_lock);
	exp = nf_ct_find_expectation(net, zone, tuple);
	if (exp) {
		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
			 ct, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &ct->status);
		ct->master = exp->master;
		if (exp->helper) {
			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
			if (help)
				rcu_assign_pointer(help->helper, exp->helper);
		}

#ifdef CONFIG_NF_CONNTRACK_MARK
		ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
		ct->secmark = exp->master->secmark;
#endif
		nf_conntrack_get(&ct->master->ct_general);
		NF_CT_STAT_INC(net, expect_new);
	} else {
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
		NF_CT_STAT_INC(net, new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.unconfirmed);

	spin_unlock_bh(&nf_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int16_t l3num,
		  u_int8_t protonum,
		  struct nf_conntrack_l3proto *l3proto,
		  struct nf_conntrack_l4proto *l4proto,
		  int *set_reply,
		  enum ip_conntrack_info *ctinfo)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, l3num, protonum, &tuple, l3proto,
			     l4proto)) {
		pr_debug("resolve_normal_ct: Can't get tuple\n");
		return NULL;
	}

	/* look for tuple match */
	h = nf_conntrack_find_get(net, zone, &tuple);
	if (!h) {
		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
				   skb, dataoff);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: related packet for %p\n",
				 ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}
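/*
 * Illustration only (not built): resolve_normal_ct() derives the
 * ctinfo it stores in skb->nfctinfo from the match direction plus two
 * status bits. The mapping, flattened into one hypothetical helper:
 */
#if 0
static enum ip_conntrack_info ctinfo_of(int is_reply_dir, int seen_reply,
					int expected)
{
	if (is_reply_dir)
		return IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
	if (seen_reply)
		return IP_CT_ESTABLISHED;	/* two-way traffic seen */
	return expected ? IP_CT_RELATED : IP_CT_NEW;
}
#endif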
unsigned int BCMFASTPATH_HOST
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
		struct sk_buff *skb)
{
	struct nf_conn *ct, *tmpl = NULL;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int dataoff;
	u_int8_t protonum;
	int set_reply = 0;
	int ret;

	if (skb->nfct) {
		/* Previously seen (loopback or untracked)?  Ignore. */
		tmpl = (struct nf_conn *)skb->nfct;
		if (!nf_ct_is_template(tmpl)) {
			NF_CT_STAT_INC_ATOMIC(net, ignore);
			return NF_ACCEPT;
		}
		skb->nfct = NULL;
	}

	/* rcu_read_lock()ed by nf_hook_slow */
	l3proto = __nf_ct_l3proto_find(pf);
	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
				   &dataoff, &protonum);
	if (ret <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(net, error);
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		ret = -ret;
		goto out;
	}

	l4proto = __nf_ct_l4proto_find(pf, protonum);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells to the netfilter
	 * core what to do with the packet. */
	if (l4proto->error != NULL) {
		ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
				     pf, hooknum);
		if (ret <= 0) {
			NF_CT_STAT_INC_ATOMIC(net, error);
			NF_CT_STAT_INC_ATOMIC(net, invalid);
			ret = -ret;
			goto out;
		}
	}

	ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
			       l3proto, l4proto, &set_reply, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(net, drop);
		ret = NF_DROP;
		goto out;
	}

	NF_CT_ASSERT(skb->nfct);

	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(skb->nfct);
		skb->nfct = NULL;
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(net, drop);
		ret = -ret;
		goto out;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);
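/*
 * Illustration only (not built): l3proto->get_l4proto(),
 * l4proto->error() and l4proto->packet() return NF_ACCEPT (positive)
 * for packets to keep processing, and the *negated* netfilter verdict
 * otherwise; "ret = -ret" above recovers the verdict for the core.
 * Note that NF_DROP is 0, so "-NF_DROP" and the "ret <= 0" tests
 * coincide for drops. The convention in miniature:
 */
#if 0
static unsigned int verdict_of(int ret)
{
	/* ret > 0 means NF_ACCEPT ("keep tracking"); otherwise the
	 * handler encoded its verdict as the negative of ret. */
	return (ret <= 0) ? (unsigned int)-ret : NF_ACCEPT;
}
#endif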
bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	bool ret;

	rcu_read_lock();
	ret = nf_ct_invert_tuple(inverse, orig,
				 __nf_ct_l3proto_find(orig->src.l3num),
				 __nf_ct_l4proto_find(orig->src.l3num,
						      orig->dst.protonum));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
	NF_CT_ASSERT(skb);

	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (!nf_ct_is_confirmed(ct)) {
#ifdef HNDCTF
		ct->expire_jiffies = extra_jiffies;
#endif /* HNDCTF */
		ct->timeout.expires = extra_jiffies;
	} else {
		unsigned long newtime = jiffies + extra_jiffies;

		/* Only update the timeout if the new timeout is at least
		   HZ jiffies from the old timeout. Need del_timer for race
		   avoidance (may already be dying). */
		if (newtime - ct->timeout.expires >= HZ) {
#ifdef HNDCTF
			ct->expire_jiffies = extra_jiffies;
#endif /* HNDCTF */
			mod_timer_pending(&ct->timeout, newtime);
		}
	}

acct:
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
			spin_unlock_bh(&ct->lock);
		}
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool __nf_ct_kill_acct(struct nf_conn *ct,
		       enum ip_conntrack_info ctinfo,
		       const struct sk_buff *skb,
		       int do_acct)
{
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
			spin_unlock_bh(&ct->lock);
		}
	}

	if (del_timer(&ct->timeout)) {
		ct->timeout.function((unsigned long)ct);
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);

#ifdef CONFIG_NF_CONNTRACK_ZONES
static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
	.len	= sizeof(struct nf_conntrack_zone),
	.align	= __alignof__(struct nf_conntrack_zone),
	.id	= NF_CT_EXT_ZONE,
};
#endif

#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
	NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
		return -EINVAL;

	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

int nf_ct_port_nlattr_tuple_size(void)
{
	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif
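/*
 * Illustration only (not built): the two port helpers above are
 * inverses of each other, so a tuple's ports survive a round trip
 * through netlink attributes; both sides use big-endian u16
 * attributes. Sketch of the flow on each side of ctnetlink (variable
 * names hypothetical; tb[] as produced by attribute parsing):
 */
#if 0
	/* sender: tuple -> CTA_PROTO_SRC_PORT / CTA_PROTO_DST_PORT */
	nf_ct_port_tuple_to_nlattr(skb, &tuple);

	/* receiver: attributes -> tuple, validated by the policy above */
	nf_ct_port_nlattr_to_tuple(tb, &parsed);
	/* parsed.src.u.tcp.port == tuple.src.u.tcp.port now holds */
#endif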
/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	for (; *bucket < net->ct.htable_size; (*bucket)++) {
		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			if (iter(ct, data))
				goto found;
		}
	}
	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (iter(ct, data))
			set_bit(IPS_DYING_BIT, &ct->status);
	}
	spin_unlock_bh(&nf_conntrack_lock);
	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
	spin_unlock_bh(&nf_conntrack_lock);
	return ct;
}

void nf_ct_iterate_cleanup(struct net *net,
			   int (*iter)(struct nf_conn *i, void *data),
			   void *data)
{
	struct nf_conn *ct;
	unsigned int bucket = 0;

	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
#if 0 //Don't let conntrack delete the CTF entry
//#ifdef HNDCTF
		ip_conntrack_ipct_delete(ct, 0);
#endif /* HNDCTF */
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		nf_ct_put(ct);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);

struct __nf_ct_flush_report {
	u32 pid;
	int report;
};

static int kill_report(struct nf_conn *i, void *data)
{
	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;

	/* If we fail to deliver the event, death_by_timeout() will retry */
	if (nf_conntrack_event_report(IPCT_DESTROY, i,
				      fr->pid, fr->report) < 0)
		return 1;

	/* Avoid the delivery of the destroy event in death_by_timeout(). */
	set_bit(IPS_DYING_BIT, &i->status);
	return 1;
}

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
{
	if (vmalloced)
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct hlist_head) * size));
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);

void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
{
	struct __nf_ct_flush_report fr = {
		.pid	= pid,
		.report = report,
	};
	nf_ct_iterate_cleanup(net, kill_report, &fr);
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);

static void nf_ct_release_dying_list(struct net *net)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		/* never fails to remove them, no listeners at this point */
		nf_ct_kill(ct);
	}
	spin_unlock_bh(&nf_conntrack_lock);
}

static int untrack_refs(void)
{
	int cnt = 0, cpu;

	for_each_possible_cpu(cpu) {
		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);

		cnt += atomic_read(&ct->ct_general.use) - 1;
	}
	return cnt;
}

static void nf_conntrack_cleanup_init_net(void)
{
	while (untrack_refs() > 0)
		schedule();

	nf_conntrack_helper_fini();
	nf_conntrack_proto_fini();
#ifdef CONFIG_NF_CONNTRACK_ZONES
	nf_ct_extend_unregister(&nf_ct_zone_extend);
#endif
}

static void nf_conntrack_cleanup_net(struct net *net)
{
 i_see_dead_people:
	nf_ct_iterate_cleanup(net, kill_all, NULL);
	nf_ct_release_dying_list(net);
	if (atomic_read(&net->ct.count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     net->ct.htable_size);
	nf_conntrack_ecache_fini(net);
	nf_conntrack_acct_fini(net);
	nf_conntrack_expect_fini(net);
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
	kfree(net->ct.slabname);
	free_percpu(net->ct.stat);
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(struct net *net)
{
	if (net_eq(net, &init_net))
		rcu_assign_pointer(ip_ct_attach, NULL);

	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	nf_conntrack_cleanup_net(net);

	if (net_eq(net, &init_net)) {
		rcu_assign_pointer(nf_ct_destroy, NULL);
		nf_conntrack_cleanup_init_net();
	}
}

void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;
	size_t sz;

	*vmalloced = 0;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
	sz = nr_slots * sizeof(struct hlist_nulls_head);
	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
					get_order(sz));
	if (!hash) {
		*vmalloced = 1;
		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
		hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
				 PAGE_KERNEL);
	}

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
	int i, bucket, vmalloced, old_vmalloced;
	unsigned int hashsize, old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_htable_size)
		return param_set_uint(val, kp);

	hashsize = simple_strtoul(val, NULL, 0);
	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
	if (!hash)
		return -ENOMEM;

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though, since that requires taking the lock.
	 */
	spin_lock_bh(&nf_conntrack_lock);
	for (i = 0; i < init_net.ct.htable_size; i++) {
		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
			h = hlist_nulls_entry(init_net.ct.hash[i].first,
					struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
						  hashsize,
						  nf_conntrack_hash_rnd);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = init_net.ct.htable_size;
	old_vmalloced = init_net.ct.hash_vmalloc;
	old_hash = init_net.ct.hash;

	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
	init_net.ct.hash_vmalloc = vmalloced;
	init_net.ct.hash = hash;
	spin_unlock_bh(&nf_conntrack_lock);

	nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);

module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
		  &nf_conntrack_htable_size, 0600);
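
/* Usage note: the module_param_call() above exposes the bucket count as
 * /sys/module/nf_conntrack/parameters/hashsize (mode 0600), so the hash
 * table can be resized at runtime from the init namespace, e.g.:
 *
 *	echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * Reads return the current size via param_get_uint(); writes go through
 * nf_conntrack_set_hashsize(), which rehashes every entry under
 * nf_conntrack_lock before freeing the old table.
 */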
void nf_ct_untracked_status_or(unsigned long bits)
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu(nf_conntrack_untracked, cpu).status |= bits;
}
EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);

static int nf_conntrack_init_init_net(void)
{
	int max_factor = 8;
	int ret, cpu;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((totalram_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries. */
		max_factor = 4;
	}
	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
	       nf_conntrack_max);

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

#ifdef CONFIG_NF_CONNTRACK_ZONES
	ret = nf_ct_extend_register(&nf_ct_zone_extend);
	if (ret < 0)
		goto err_extend;
#endif
	/* Set up fake conntrack: to never be deleted, not in any hashes */
	for_each_possible_cpu(cpu) {
		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
		write_pnet(&ct->ct_net, &init_net);
		atomic_set(&ct->ct_general.use, 1);
	}
	/* - and make it look like a confirmed connection */
	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
	return 0;

#ifdef CONFIG_NF_CONNTRACK_ZONES
err_extend:
	nf_conntrack_helper_fini();
#endif
err_helper:
	nf_conntrack_proto_fini();
err_proto:
	return ret;
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)

static int nf_conntrack_init_net(struct net *net)
{
	int ret;

	atomic_set(&net->ct.count, 0);
	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat) {
		ret = -ENOMEM;
		goto err_stat;
	}

	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
	if (!net->ct.slabname) {
		ret = -ENOMEM;
		goto err_slabname;
	}

	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
							sizeof(struct nf_conn), 0,
							SLAB_DESTROY_BY_RCU, NULL);
	if (!net->ct.nf_conntrack_cachep) {
		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
		ret = -ENOMEM;
		goto err_cache;
	}

	net->ct.htable_size = nf_conntrack_htable_size;
	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
					     &net->ct.hash_vmalloc, 1);
	if (!net->ct.hash) {
		ret = -ENOMEM;
		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
		goto err_hash;
	}
	ret = nf_conntrack_expect_init(net);
	if (ret < 0)
		goto err_expect;
	ret = nf_conntrack_acct_init(net);
	if (ret < 0)
		goto err_acct;
	ret = nf_conntrack_ecache_init(net);
	if (ret < 0)
		goto err_ecache;

	return 0;

err_ecache:
	nf_conntrack_acct_fini(net);
err_acct:
	nf_conntrack_expect_fini(net);
err_expect:
	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     net->ct.htable_size);
err_hash:
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
	kfree(net->ct.slabname);
err_slabname:
	free_percpu(net->ct.stat);
err_stat:
	return ret;
}
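
/* Worked example for the sizing heuristic in nf_conntrack_init_init_net()
 * (assuming 4 KB pages and a 4-byte struct hlist_head, as on i386): a
 * 32 MB machine has totalram_pages = 8192, so (8192 << 12) / 16384 / 4
 * = 512 buckets, and with max_factor = 4 that gives nf_conntrack_max =
 * 2048 entries. Machines above 1 GB (totalram_pages > 262144 here) are
 * clamped to 16384 buckets.
 */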

s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
			enum ip_conntrack_dir dir,
			u32 seq);
EXPORT_SYMBOL_GPL(nf_ct_nat_offset);

int nf_conntrack_init(struct net *net)
{
	int ret;

	if (net_eq(net, &init_net)) {
		ret = nf_conntrack_init_init_net();
		if (ret < 0)
			goto out_init_net;
	}
	ret = nf_conntrack_init_net(net);
	if (ret < 0)
		goto out_net;

	if (net_eq(net, &init_net)) {
		/* For use by REJECT target */
		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
		rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);

		/* How to get NAT offsets */
		rcu_assign_pointer(nf_ct_nat_offset, NULL);
	}
	return 0;

out_net:
	if (net_eq(net, &init_net))
		nf_conntrack_cleanup_init_net();
out_init_net:
	return ret;
}
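
/* Illustrative sketch (not part of this file): nf_ct_nat_offset above is
 * a hook the NAT core is expected to fill in with rcu_assign_pointer(),
 * so that TCP window tracking can ask how far NAT sequence mangling has
 * shifted a given sequence number. A provider would be shaped roughly
 * like this -- both function names below are hypothetical:
 */
#if 0
static s16 example_get_offset(const struct nf_conn *ct,
			      enum ip_conntrack_dir dir,
			      u32 seq)
{
	/* Return the cumulative byte offset in effect at "seq". */
	return 0;
}

static void example_register_nat_offset(void)
{
	rcu_assign_pointer(nf_ct_nat_offset, example_get_offset);
}
#endif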