/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
 * Public Licence.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 */

#ifdef MODULE
#define __NO_VERSION__
#endif
#include <linux/version.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/brlock.h>
#include <net/checksum.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION "2.1"

/* Debug output is compiled out by default; redefine to printk to enable. */
#define DEBUGP(format, args...)

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
static int ip_conntrack_max = 0;
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
                              u_int8_t protocol)
{
        return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        p = LIST_FIND(&protocol_list, proto_cmpfn,
                      struct ip_conntrack_protocol *, protocol);
        if (!p)
                p = &ip_conntrack_generic_protocol;

        return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        READ_LOCK(&ip_conntrack_lock);
        p = __ip_ct_find_proto(protocol);
        READ_UNLOCK(&ip_conntrack_lock);
        return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        IP_NF_ASSERT(ct->infos[0].master);
        /* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
        nf_conntrack_put(&ct->infos[0]);
}

static inline u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
        /* ntohl because more differences in low bits. */
        /* To ensure that halves of the same connection don't hash
           clash, we add the source per-proto again. */
        return (ntohl(tuple->src.ip + tuple->dst.ip
                      + tuple->src.u.all + tuple->dst.u.all
                      + tuple->dst.protonum)
                + ntohs(tuple->src.u.all))
                % ip_conntrack_htable_size;
}
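/* Illustration (sketch only, not compiled): the first sum above is the
 * same for both directions of a connection, since inverting a tuple
 * merely swaps src and dst; only the extra ntohs(src.u.all) term breaks
 * the symmetry.  For a hypothetical TCP connection A:1025 -> B:80:
 *
 *	orig : src=A:1025, dst=B:80   -> sum + 1025
 *	reply: src=B:80,   dst=A:1025 -> sum + 80
 *
 * so the ORIGINAL and REPLY tuplehashes normally land in different
 * buckets, and both must be unlinked in clean_from_lists() below. */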
inline int
get_tuple(const struct iphdr *iph, size_t len,
          struct ip_conntrack_tuple *tuple,
          struct ip_conntrack_protocol *protocol)
{
        int ret;

        /* Should never happen: fragments are gathered before tracking. */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }
        /* Guarantee 8 protocol bytes: if more wanted, use len param */
        else if (iph->ihl * 4 + 8 > len)
                return 0;

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;

        tuple->src.u.all = tuple->dst.u.all = 0;

        ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
                                     len - 4*iph->ihl,
                                     tuple);
        return ret;
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
             const struct ip_conntrack_tuple *orig,
             const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;

        inverse->src.u.all = inverse->dst.u.all = 0;

        return protocol->invert_tuple(inverse, orig);
}


/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
                             const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
        DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
        IP_NF_ASSERT(atomic_read(&exp->use));
        IP_NF_ASSERT(!timer_pending(&exp->timeout));

        kfree(exp);
}


inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        IP_NF_ASSERT(exp);

        if (atomic_dec_and_test(&exp->use)) {
                /* usage count dropped to zero */
                destroy_expect(exp);
        }
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                         struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        READ_LOCK(&ip_conntrack_lock);
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        exp = __ip_ct_expect_find(tuple);
        if (exp)
                atomic_inc(&exp->use);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
        READ_UNLOCK(&ip_conntrack_lock);

        return exp;
}
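/* Callers of the _find_get functions own a reference until they drop it
 * again.  A minimal usage sketch (assuming `tuple' has been filled in
 * by the caller, e.g. via get_tuple() above):
 *
 *	struct ip_conntrack_expect *exp;
 *
 *	exp = ip_conntrack_expect_find_get(&tuple);
 *	if (exp) {
 *		...	(inspect exp->tuple, exp->expectant, ...)
 *		ip_conntrack_expect_put(exp);	(may free exp)
 *	}
 *
 * The same get/put pairing applies to ip_conntrack_find_get() and
 * ip_conntrack_put() further down. */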
/* remove one specific expectation from all lists and drop refcount,
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
        DEBUGP("unexpect_related(%p)\n", expect);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        /* we're not allowed to unexpect a confirmed expectation! */
        IP_NF_ASSERT(!expect->sibling);

        /* delete from global and local lists */
        list_del(&expect->list);
        list_del(&expect->expected_list);

        /* decrement expect-count of master conntrack */
        if (expect->expectant)
                expect->expectant->expecting--;

        ip_conntrack_expect_put(expect);
}

/* remove one specific expectation from all lists, drop refcount
 * and expire timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
        IP_NF_ASSERT(expect->expectant);
        /* if we are supposed to have a timer, but we can't delete
         * it: race condition.  __unexpect_related will
         * be called by the timeout function */
        if (expect->expectant->helper
            && expect->expectant->helper->timeout
            && !del_timer(&expect->timeout))
                return;

        __unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct)
{
        struct list_head *exp_entry, *next;
        struct ip_conntrack_expect *exp;

        DEBUGP("remove_expectations(%p)\n", ct);

        for (exp_entry = ct->sibling_list.next;
             exp_entry != &ct->sibling_list; exp_entry = next) {
                next = exp_entry->next;
                exp = list_entry(exp_entry, struct ip_conntrack_expect,
                                 expected_list);

                /* we skip established expectations, as we want to delete
                 * the un-established ones only */
                if (exp->sibling) {
                        DEBUGP("remove_expectations: skipping established %p of %p\n",
                               exp->sibling, ct);
                        continue;
                }

                IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
                IP_NF_ASSERT(exp->expectant == ct);

                /* delete expectation from global and private lists */
                unexpect_related(exp);
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
        /* Remove from both hash lists: must not NULL out next ptrs,
           otherwise we'll look unconfirmed.  Fortunately, LIST_DELETE
           doesn't do this. --RR */
        LIST_DELETE(&ip_conntrack_hash
                    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
                    &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash
                    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)],
                    &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all un-established, pending expectations */
        remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        if (ct->master && master_ct(ct))
                ip_conntrack_put(master_ct(ct));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Delete our master expectation */
        if (ct->master) {
                /* can't call __unexpect_related here,
                 * since it would screw up expect_list */
                list_del(&ct->master->expected_list);
                kfree(ct->master);
        }
        WRITE_UNLOCK(&ip_conntrack_lock);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return i->ctrack != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)],
                      conntrack_tuple_cmp,
                      struct ip_conntrack_tuple_hash *,
                      tuple, ignored_conntrack);
        return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack *ct
                = (struct ip_conntrack *)nfct->master;

        /* ctinfo is the index of the nfct inside the conntrack */
        *ctinfo = nfct - ct->infos;
        IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
        return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        if (skb->nfct)
                return __ip_conntrack_get(skb->nfct, ctinfo);
        return NULL;
}
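/* A minimal sketch of how a netfilter module downstream of conntrack
 * would typically use this (hypothetical hook body, not compiled here):
 *
 *	enum ip_conntrack_info ctinfo;
 *	struct ip_conntrack *ct = ip_conntrack_get(skb, &ctinfo);
 *
 *	if (ct && ctinfo == IP_CT_ESTABLISHED)
 *		...	(packet belongs to a seen-both-ways connection)
 *
 * No extra reference is taken here: the reference lives in skb->nfct
 * and is dropped when the skb is destroyed. */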
/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        WRITE_UNLOCK(&ip_conntrack_lock);
        return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
                 enum ip_conntrack_info *ctinfo,
                 unsigned int hooknum)
{
        const struct iphdr *iph;
        struct icmphdr *hdr;
        struct ip_conntrack_tuple innertuple, origtuple;
        struct iphdr *inner;
        size_t datalen;
        struct ip_conntrack_protocol *innerproto;
        struct ip_conntrack_tuple_hash *h;

        /* Note: iph is not assigned yet, so assert on skb->nh.iph. */
        IP_NF_ASSERT(skb->nh.iph->protocol == IPPROTO_ICMP);
        IP_NF_ASSERT(skb->nfct == NULL);

        iph = skb->nh.iph;
        hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
        inner = (struct iphdr *)(hdr + 1);
        datalen = skb->len - iph->ihl*4 - sizeof(*hdr);

        if (skb->len < iph->ihl * 4 + sizeof(*hdr) + sizeof(*iph)) {
                DEBUGP("icmp_error_track: too short\n");
                return NULL;
        }

        if (hdr->type != ICMP_DEST_UNREACH
            && hdr->type != ICMP_SOURCE_QUENCH
            && hdr->type != ICMP_TIME_EXCEEDED
            && hdr->type != ICMP_PARAMETERPROB
            && hdr->type != ICMP_REDIRECT)
                return NULL;

        /* Ignore ICMPs containing fragments (shouldn't happen) */
        if (inner->frag_off & htons(IP_OFFSET)) {
                DEBUGP("icmp_error_track: fragment of proto %u\n",
                       inner->protocol);
                return NULL;
        }

        /* Ignore it if the checksum's bogus. */
        if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
                DEBUGP("icmp_error_track: bad csum\n");
                return NULL;
        }

        innerproto = ip_ct_find_proto(inner->protocol);
        /* Are they talking about one of our connections? */
        if (inner->ihl * 4 + 8 > datalen
            || !get_tuple(inner, datalen, &origtuple, innerproto)) {
                DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
                       inner->protocol, inner->ihl, 8,
                       datalen);
                return NULL;
        }

        /* Ordinarily, we'd expect the inverted tupleproto, but it's
           been preserved inside the ICMP. */
        if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: Can't invert tuple\n");
                return NULL;
        }

        *ctinfo = IP_CT_RELATED;

        h = ip_conntrack_find_get(&innertuple, NULL);
        if (!h) {
                /* Locally generated ICMPs will match inverted if they
                   haven't been SNAT'ed yet */
                if (hooknum == NF_IP_LOCAL_OUT)
                        h = ip_conntrack_find_get(&origtuple, NULL);

                if (!h) {
                        DEBUGP("icmp_error_track: no match\n");
                        return NULL;
                }
                /* Reverse direction from that found */
                if (DIRECTION(h) != IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        } else {
                if (DIRECTION(h) == IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        }

        /* Update skb to refer to this connection */
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(i->ctrack->status & IPS_ASSURED);
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}
/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash, repl_hash;
        struct ip_conntrack_expect *expected;
        int i;
        static unsigned int drop_next = 0;

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max &&
            atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from random chain, or else from the
                   chain about to put into (in case they're trying to
                   bomb one hash chain). */
                unsigned int next = (drop_next++)%ip_conntrack_htable_size;

                if (!early_drop(&ip_conntrack_hash[next])
                    && !early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }
        repl_hash = hash_conntrack(&repl_tuple);

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
        for (i=0; i < IP_CT_NUMBER; i++)
                conntrack->infos[i].master = &conntrack->ct_general;

        if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        INIT_LIST_HEAD(&conntrack->sibling_list);

        /* Mark clearly that it's not in the hash table. */
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Need finding and deleting of expected ONLY if we win race */
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                             struct ip_conntrack_expect *, tuple);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

        /* Look up the conntrack helper for master connections only */
        if (!expected)
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

        /* If the expectation is dying, then this is a loser. */
        if (expected
            && expected->expectant->helper
            && expected->expectant->helper->timeout
            && ! del_timer(&expected->timeout))
                expected = NULL;

        /* If master is not in hash table yet (ie. packet hasn't left
           this machine yet), how can other end know about expected?
           Hence these are not the droids you are looking for (if
           master ct never got confirmed, we'd hold a reference to it
           and weird things would happen to future packets). */
        if (expected && is_confirmed(expected->expectant)) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                       conntrack, expected);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                IP_NF_ASSERT(master_ct(conntrack));
                conntrack->status = IPS_EXPECTED;
                conntrack->master = expected;
                expected->sibling = conntrack;
                LIST_DELETE(&ip_conntrack_expect_list, expected);
                INIT_LIST_HEAD(&expected->list);
                expected->expectant->expecting--;
                nf_conntrack_get(&master_ct(conntrack)->infos[0]);
        }
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (expected && is_confirmed(expected->expectant) && expected->expectfn)
                expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (h->ctrack->status & IPS_SEEN_REPLY) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (h->ctrack->status & IPS_EXPECTED) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */

        /* Previously seen (loopback)?  Ignore.  Do this before
           fragment check. */
        if ((*pskb)->nfct)
                return NF_ACCEPT;

        /* Gather fragments. */
        if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
                *pskb = ip_ct_gather_frags(*pskb);
                if (!*pskb)
                        return NF_STOLEN;
        }

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be an icmp error... */
        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
            && icmp_error_track(*pskb, &ctinfo, hooknum))
                return NF_ACCEPT;

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
                                     &ctinfo)))
                /* Not valid part of a connection */
                return NF_ACCEPT;

        if (IS_ERR(ct))
                /* Too stressed to deal. */
                return NF_DROP;

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
        if (ret == -1) {
                /* Invalid */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                return NF_ACCEPT;
        }

        if (ret != NF_DROP && ct->helper) {
                ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
                                       ct, ctinfo);
                if (ret == -1) {
                        /* Invalid */
                        nf_conntrack_put((*pskb)->nfct);
                        (*pskb)->nfct = NULL;
                        return NF_ACCEPT;
                }
        }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}
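/* ip_conntrack_in is wired into netfilter by the companion standalone
 * module; a minimal sketch of such a registration (ops variable name
 * hypothetical, field order as in 2.4's struct nf_hook_ops):
 *
 *	static struct nf_hook_ops ip_conntrack_in_ops = {
 *		{ NULL, NULL }, ip_conntrack_in, PF_INET,
 *		NF_IP_PRE_ROUTING, NF_IP_PRI_CONNTRACK
 *	};
 *
 *	...
 *	nf_register_hook(&ip_conntrack_in_ops);
 *
 * Running at NF_IP_PRI_CONNTRACK ensures every packet carries its
 * conntrack reference before the NAT and filter hooks see it. */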
int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return invert_tuple(inverse, orig,
                            ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
                                const struct ip_conntrack_tuple *tuple,
                                const struct ip_conntrack_tuple *mask)
{
        DEBUGP("resent_expect\n");
        DEBUGP("   tuple: "); DUMP_TUPLE(&i->tuple);
        DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
        DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
        return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
                 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
                && ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
                               const struct ip_conntrack_tuple *tuple,
                               const struct ip_conntrack_tuple *mask)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { i->mask.src.ip & mask->src.ip,
                      { i->mask.src.u.all & mask->src.u.all } },
                    { i->mask.dst.ip & mask->dst.ip,
                      { i->mask.dst.u.all & mask->dst.u.all },
                      i->mask.dst.protonum & mask->dst.protonum } };

        return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
        WRITE_LOCK(&ip_conntrack_lock);
        unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *expect = (void *) ul_expect;

        DEBUGP("expectation %p timed out\n", expect);
        WRITE_LOCK(&ip_conntrack_lock);
        __unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack *related_to,
                                struct ip_conntrack_expect *expect)
{
        struct ip_conntrack_expect *old, *new;
        int ret = 0;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Because of the write lock, no reader can walk the lists,
         * so there is no need to use the tuple lock too */

        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);

        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple,
                        &expect->mask);
        if (old) {
                /* Helper private data may contain offsets but no pointers
                   pointing into the payload - otherwise we would have to
                   copy the data filled out by the helper over the old one */
                DEBUGP("expect_related: resent packet\n");
                if (related_to->helper &&
                    related_to->helper->timeout) {
                        if (!del_timer(&old->timeout)) {
                                /* expectation is dying.  Fall through */
                                old = NULL;
                        } else {
                                old->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                                add_timer(&old->timeout);
                        }
                }

                if (old) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        return -EEXIST;
                }
        } else if (related_to->helper &&
                   related_to->helper->max_expected &&
                   related_to->expecting >= related_to->helper->max_expected) {
                struct list_head *cur_item;
                /* old == NULL */
                if (!(related_to->helper->flags &
                      IP_CT_HELPER_F_REUSE_EXPECT)) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: max number of expected "
                                       "connections %i of %s reached for "
                                       "%u.%u.%u.%u->%u.%u.%u.%u\n",
                                       related_to->helper->max_expected,
                                       related_to->helper->name,
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
                        return -EPERM;
                }
                DEBUGP("ip_conntrack: max number of expected "
                       "connections %i of %s reached for "
                       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
                       related_to->helper->max_expected,
                       related_to->helper->name,
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

                /* choose the oldest expectation to evict */
                list_for_each(cur_item, &related_to->sibling_list) {
                        struct ip_conntrack_expect *cur;

                        cur = list_entry(cur_item,
                                         struct ip_conntrack_expect,
                                         expected_list);
                        if (cur->sibling == NULL) {
                                old = cur;
                                break;
                        }
                }

                /* (!old) cannot happen, since related_to->expecting is the
                 * number of unconfirmed expects */
                IP_NF_ASSERT(old);

                /* newnat14 does not reuse the real allocated memory
                 * structures but rather unexpects the old and
                 * allocates a new.  unexpect_related will decrement
                 * related_to->expecting.
                 */
                unexpect_related(old);
                ret = -EPERM;
        } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                             struct ip_conntrack_expect *, &expect->tuple,
                             &expect->mask)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: busy!\n");
                return -EBUSY;
        }

        new = (struct ip_conntrack_expect *)
                kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
        if (!new) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: OOM allocating expect\n");
                return -ENOMEM;
        }

        /* Zero out the new structure, then fill it out with the data */
        DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
        memset(new, 0, sizeof(*expect));
        INIT_LIST_HEAD(&new->list);
        INIT_LIST_HEAD(&new->expected_list);
        memcpy(new, expect, sizeof(*expect));
        new->expectant = related_to;
        new->sibling = NULL;
        /* increase usage count.  This sucks.  The memset above overwrites
         * the old usage count [if still present] and we increase to one.
         * Only works because everything is done under ip_conntrack_lock */
        atomic_inc(&new->use);

        /* add to expected list for this connection */
        list_add(&new->expected_list, &related_to->sibling_list);
        /* add to global list of expectations */
        list_prepend(&ip_conntrack_expect_list, &new->list);
        /* add and start timer if required */
        if (related_to->helper &&
            related_to->helper->timeout) {
                init_timer(&new->timeout);
                new->timeout.data = (unsigned long)new;
                new->timeout.function = expectation_timed_out;
                new->timeout.expires = jiffies +
                        related_to->helper->timeout * HZ;
                add_timer(&new->timeout);
        }
        related_to->expecting++;

        WRITE_UNLOCK(&ip_conntrack_lock);

        return ret;
}

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
                               struct ip_conntrack_tuple *newtuple)
{
        int ret;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
        DEBUGP("change_expect:\n");
        DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
        DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
        DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
                if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple,
                                 &expect->mask)) {
                        /* Force NAT to find an unused tuple */
                        ret = -1;
                } else {
                        memcpy(&expect->ct_tuple, &expect->tuple,
                               sizeof(expect->tuple));
                        memcpy(&expect->tuple, newtuple,
                               sizeof(expect->tuple));
                        ret = 0;
                }
        } else {
                /* Resent packet */
                DEBUGP("change expect: resent packet\n");
                if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
                        ret = 0;
                } else {
                        /* Force NAT to choose again the same port */
                        ret = -1;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

        return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                             const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        if (__ip_conntrack_find(newreply, conntrack)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                return 0;
        }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master)
                conntrack->helper = LIST_FIND(&helpers, helper_cmp,
                                              struct ip_conntrack_helper *,
                                              newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        MOD_INC_USE_COUNT;

        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}
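/* A minimal sketch of a conntrack helper module using this API
 * (hypothetical "foo" protocol on TCP port 1234; only fields this file
 * actually consults are shown, not compiled here):
 *
 *	static int foo_help(const struct iphdr *iph, size_t len,
 *			    struct ip_conntrack *ct,
 *			    enum ip_conntrack_info ctinfo)
 *	{
 *		struct ip_conntrack_expect exp;
 *
 *		memset(&exp, 0, sizeof(exp));
 *		...	(fill exp.tuple/exp.mask with the related
 *			 connection parsed out of the payload)
 *		ip_conntrack_expect_related(ct, &exp);
 *		return NF_ACCEPT;
 *	}
 *
 *	static struct ip_conntrack_helper foo;
 *	...
 *	foo.name = "foo";
 *	foo.tuple.dst.protonum = IPPROTO_TCP;
 *	foo.tuple.src.u.tcp.port = htons(1234);
 *	foo.mask.dst.protonum = 0xFFFF;
 *	foo.mask.src.u.tcp.port = 0xFFFF;
 *	foo.max_expected = 1;
 *	foo.help = foo_help;
 *	ip_conntrack_helper_register(&foo);
 *
 * helper_cmp() matches tuple/mask against the REPLY tuple, so the
 * helper keys on the server's port in the reply direction.  Passing a
 * stack expect is fine here because expect_related copies it into a
 * kmalloc'ed structure. */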
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (i->ctrack->helper == me) {
                /* Get rid of any expectations */
                remove_expectations(i->ctrack);
                /* And *then* set helper to NULL */
                i->ctrack->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expectations, set helpers to NULL. */
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        br_write_lock_bh(BR_NETPROTO_LOCK);
        br_write_unlock_bh(BR_NETPROTO_LOCK);

        MOD_DEC_USE_COUNT;
}

/* Refresh conntrack for this many jiffies. */
void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct))
                ct->timeout.expires = extra_jiffies;
        else {
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
        }
        WRITE_UNLOCK(&ip_conntrack_lock);
}
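/* Typical caller: a protocol's packet() handler bumps the timeout on
 * every packet it accepts.  A minimal sketch (FOO_TIMEOUT is a
 * hypothetical per-protocol constant, not compiled here):
 *
 *	#define FOO_TIMEOUT (30 * HZ)
 *
 *	static int foo_packet(struct ip_conntrack *ct, struct iphdr *iph,
 *			      size_t len, enum ip_conntrack_info ctinfo)
 *	{
 *		ip_ct_refresh(ct, FOO_TIMEOUT);
 *		return NF_ACCEPT;
 *	}
 *
 * Note the unconfirmed case above: before __ip_conntrack_confirm()
 * runs, expires holds a relative value, which confirm converts to an
 * absolute time by adding jiffies. */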
/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        } else if (skb_is_nonlinear(skb)
                   && skb_linearize(skb, GFP_ATOMIC) != 0) {
                kfree_skb(skb);
                if (sk)
                        sock_put(sk);
                return NULL;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* This ICMP is in reverse direction to the packet which
           caused it */
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach new skbuff, and increment count */
        nskb->nfct = &ct->infos[ctinfo];
        atomic_inc(&ct->ct_general.use);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
        int (*kill)(const struct ip_conntrack *i, void *data),
        void *data)
{
        return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
                void *data)
{
        struct ip_conntrack_tuple_hash *h = NULL;
        unsigned int i;

        READ_LOCK(&ip_conntrack_lock);
        for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
                h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
                              struct ip_conntrack_tuple_hash *, kill, data);
        }
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
                        void *data)
{
        struct ip_conntrack_tuple_hash *h;

        /* This is order n^2, by the way. */
        while ((h = get_next_corpse(kill, data)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&h->ctrack->timeout))
                        death_by_timeout((unsigned long)h->ctrack);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(h->ctrack);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void *user, int *len)
{
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple = { { sk->rcv_saddr, { sk->sport } },
                                            { sk->daddr, { sk->dport },
                                              IPPROTO_TCP } };

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->prot->name, "TCP") != 0) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;

                sin.sin_family = AF_INET;
                sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(h->ctrack);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst
= { { NULL, NULL }, PF_INET,
    0, 0, NULL, /* Setsockopts */
    SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
    0, NULL };
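/* Userspace side of the above, as a transparent proxy would use it
 * after accept()ing a REDIRECTed connection (sketch only; SOL_IP and
 * SO_ORIGINAL_DST come from the usual userspace headers):
 *
 *	struct sockaddr_in dst;
 *	socklen_t dstlen = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &dstlen) == 0)
 *		...	(dst.sin_addr/dst.sin_port hold the address the
 *			 client originally tried to reach, pre-NAT)
 */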
#define NET_IP_CONNTRACK_MAX 2089
#define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max"

#ifdef CONFIG_SYSCTL
static struct ctl_table_header *ip_conntrack_sysctl_header;

static ctl_table ip_conntrack_table[] = {
        { NET_IP_CONNTRACK_MAX, NET_IP_CONNTRACK_MAX_NAME, &ip_conntrack_max,
          sizeof(ip_conntrack_max), 0644, NULL, proc_dointvec },
        { 0 }
};

static ctl_table ip_conntrack_dir_table[] = {
        {NET_IPV4, "ipv4", NULL, 0, 0555, ip_conntrack_table, 0, 0, 0, 0, 0},
        { 0 }
};

static ctl_table ip_conntrack_root_table[] = {
        {CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0},
        { 0 }
};
#endif /*CONFIG_SYSCTL*/

static int kill_all(const struct ip_conntrack *i, void *data)
{
        return 1;
}
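/* kill_all is the bluntest possible predicate for
 * ip_ct_selective_cleanup(); a module wanting to flush only the
 * connections of one host might use something like this instead
 * (hypothetical, sketch only):
 *
 *	static int kill_by_ip(const struct ip_conntrack *i, void *data)
 *	{
 *		u_int32_t ip = *(u_int32_t *)data;
 *
 *		return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == ip
 *		    || i->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == ip;
 *	}
 *
 *	...
 *	ip_ct_selective_cleanup(kill_by_ip, &ip);
 */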
/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
#ifdef CONFIG_SYSCTL
        unregister_sysctl_table(ip_conntrack_sysctl_header);
#endif
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        br_write_lock_bh(BR_NETPROTO_LOCK);
        br_write_unlock_bh(BR_NETPROTO_LOCK);

 i_see_dead_people:
        ip_ct_selective_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        vfree(ip_conntrack_hash);
        nf_unregister_sockopt(&so_getorigdst);
}

/* Bucket count, settable at load time (e.g. `insmod ip_conntrack
   hashsize=4096'); defaults to the memory-based guess below. */
static int hashsize = 0;
MODULE_PARM(hashsize, "i");

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %d bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                    * ip_conntrack_htable_size);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
        /* Don't NEED lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        /* Sew in builtin protocols. */
        list_append(&protocol_list, &ip_conntrack_protocol_tcp);
        list_append(&protocol_list, &ip_conntrack_protocol_udp);
        list_append(&protocol_list, &ip_conntrack_protocol_icmp);
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

/* This is fucking braindead.  There is NO WAY of doing this without
   the CONFIG_SYSCTL unless you don't want to detect errors.
   Grrr... --RR */
#ifdef CONFIG_SYSCTL
        ip_conntrack_sysctl_header
                = register_sysctl_table(ip_conntrack_root_table, 0);
        if (ip_conntrack_sysctl_header == NULL) {
                goto err_free_ct_cachep;
        }
#endif /*CONFIG_SYSCTL*/

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;
        return ret;

err_free_ct_cachep:
        kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
        vfree(ip_conntrack_hash);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}