1/* Connection state tracking for netfilter.  This is separated from,
2   but required by, the NAT layer; it can also be used by an iptables
3   extension. */
4
5/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
6 * Public Licence.
7 *
8 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
9 * 	- new API and handling of conntrack/nat helpers
10 * 	- now capable of multiple expectations for one master
11 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
12 * 	- add usage/reference counts to ip_conntrack_expect
13 *	- export ip_conntrack[_expect]_{find_get,put} functions
14 * */
15
16#ifdef MODULE
17#define __NO_VERSION__
18#endif
19#include <linux/version.h>
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/ip.h>
23#include <linux/netfilter.h>
24#include <linux/netfilter_ipv4.h>
25#include <linux/module.h>
26#include <linux/skbuff.h>
27#include <linux/proc_fs.h>
28#include <linux/vmalloc.h>
29#include <linux/brlock.h>
30#include <net/checksum.h>
31#include <linux/stddef.h>
32#include <linux/sysctl.h>
33#include <linux/slab.h>
34/* For ERR_PTR().  Yeah, I know... --RR */
35#include <linux/fs.h>
36
/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
39#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
40#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
41
42#include <linux/netfilter_ipv4/ip_conntrack.h>
43#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
44#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
45#include <linux/netfilter_ipv4/ip_conntrack_core.h>
46#include <linux/netfilter_ipv4/listhelp.h>
47
48#define IP_CONNTRACK_VERSION	"2.1"
49
50#define DEBUGP(format, args...)
51
52DECLARE_RWLOCK(ip_conntrack_lock);
53DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
54
55void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
56LIST_HEAD(ip_conntrack_expect_list);
57LIST_HEAD(protocol_list);
58static LIST_HEAD(helpers);
59unsigned int ip_conntrack_htable_size = 0;
60static int ip_conntrack_max = 0;
61static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
62struct list_head *ip_conntrack_hash;
63static kmem_cache_t *ip_conntrack_cachep;
64
65extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
66
67static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
68			      u_int8_t protocol)
69{
70	return protocol == curr->proto;
71}
72
73struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
74{
75	struct ip_conntrack_protocol *p;
76
77	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
78	p = LIST_FIND(&protocol_list, proto_cmpfn,
79		      struct ip_conntrack_protocol *, protocol);
80	if (!p)
81		p = &ip_conntrack_generic_protocol;
82
83	return p;
84}
85
86struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
87{
88	struct ip_conntrack_protocol *p;
89
90	READ_LOCK(&ip_conntrack_lock);
91	p = __ip_ct_find_proto(protocol);
92	READ_UNLOCK(&ip_conntrack_lock);
93	return p;
94}
95
96inline void
97ip_conntrack_put(struct ip_conntrack *ct)
98{
99	IP_NF_ASSERT(ct);
100	IP_NF_ASSERT(ct->infos[0].master);
101	/* nf_conntrack_put wants to go via an info struct, so feed it
102           one at random. */
103	nf_conntrack_put(&ct->infos[0]);
104}
105
106static inline u_int32_t
107hash_conntrack(const struct ip_conntrack_tuple *tuple)
108{
109	/* ntohl because more differences in low bits. */
110	/* To ensure that halves of the same connection don't hash
111	   clash, we add the source per-proto again. */
112	return (ntohl(tuple->src.ip + tuple->dst.ip
113		     + tuple->src.u.all + tuple->dst.u.all
114		     + tuple->dst.protonum)
115		+ ntohs(tuple->src.u.all))
116		% ip_conntrack_htable_size;
117}
118
119inline int
120get_tuple(const struct iphdr *iph, size_t len,
121	  struct ip_conntrack_tuple *tuple,
122	  struct ip_conntrack_protocol *protocol)
123{
124	int ret;
125
	/* Should never happen */
127	if (iph->frag_off & htons(IP_OFFSET)) {
128		printk("ip_conntrack_core: Frag of proto %u.\n",
129		       iph->protocol);
130		return 0;
131	}
132	/* Guarantee 8 protocol bytes: if more wanted, use len param */
133	else if (iph->ihl * 4 + 8 > len)
134		return 0;
135
136	tuple->src.ip = iph->saddr;
137	tuple->dst.ip = iph->daddr;
138	tuple->dst.protonum = iph->protocol;
139
140	tuple->src.u.all = tuple->dst.u.all = 0;
141
142	ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
143				     len - 4*iph->ihl,
144				     tuple);
145	return ret;
146}
147
148static int
149invert_tuple(struct ip_conntrack_tuple *inverse,
150	     const struct ip_conntrack_tuple *orig,
151	     const struct ip_conntrack_protocol *protocol)
152{
153	inverse->src.ip = orig->dst.ip;
154	inverse->dst.ip = orig->src.ip;
155	inverse->dst.protonum = orig->dst.protonum;
156
157	inverse->src.u.all = inverse->dst.u.all = 0;
158
159	return protocol->invert_tuple(inverse, orig);
160}
161
162
163/* ip_conntrack_expect helper functions */
164
165/* Compare tuple parts depending on mask. */
166static inline int expect_cmp(const struct ip_conntrack_expect *i,
167			     const struct ip_conntrack_tuple *tuple)
168{
169	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
170	return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
171}
172
173static void
174destroy_expect(struct ip_conntrack_expect *exp)
175{
176	DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
177	IP_NF_ASSERT(atomic_read(&exp->use));
178	IP_NF_ASSERT(!timer_pending(&exp->timeout));
179
180	kfree(exp);
181}
182
183
184inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
185{
186	IP_NF_ASSERT(exp);
187
188	if (atomic_dec_and_test(&exp->use)) {
189		/* usage count dropped to zero */
190		destroy_expect(exp);
191	}
192}
193
194static inline struct ip_conntrack_expect *
195__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
196{
197	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
198	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
199	return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
200			 struct ip_conntrack_expect *, tuple);
201}
202
/* Find an expectation corresponding to a tuple. */
204struct ip_conntrack_expect *
205ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
206{
207	struct ip_conntrack_expect *exp;
208
209	READ_LOCK(&ip_conntrack_lock);
210	READ_LOCK(&ip_conntrack_expect_tuple_lock);
211	exp = __ip_ct_expect_find(tuple);
212	if (exp)
213		atomic_inc(&exp->use);
214	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
215	READ_UNLOCK(&ip_conntrack_lock);
216
217	return exp;
218}
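
/* A minimal usage sketch (hypothetical caller): the reference taken by
 * ip_conntrack_expect_find_get() must be dropped again with
 * ip_conntrack_expect_put() once the caller is done with the expectation:
 *
 *	struct ip_conntrack_expect *exp;
 *
 *	exp = ip_conntrack_expect_find_get(&tuple);
 *	if (exp) {
 *		... inspect exp->tuple, exp->mask ...
 *		ip_conntrack_expect_put(exp);
 *	}
 */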
219
220/* remove one specific expectation from all lists and drop refcount,
221 * does _NOT_ delete the timer. */
222static void __unexpect_related(struct ip_conntrack_expect *expect)
223{
224	DEBUGP("unexpect_related(%p)\n", expect);
225	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
226
227	/* we're not allowed to unexpect a confirmed expectation! */
228	IP_NF_ASSERT(!expect->sibling);
229
230	/* delete from global and local lists */
231	list_del(&expect->list);
232	list_del(&expect->expected_list);
233
234	/* decrement expect-count of master conntrack */
235	if (expect->expectant)
236		expect->expectant->expecting--;
237
238	ip_conntrack_expect_put(expect);
239}
240
/* remove one specific expectation from all lists, drop the refcount
 * and expire the timer.
 * This function must _NOT_ be called for confirmed expects! */
244static void unexpect_related(struct ip_conntrack_expect *expect)
245{
246	IP_NF_ASSERT(expect->expectant);
	/* If we are supposed to have a timer, but we can't delete
	 * it, we lost a race: __unexpect_related will be called by
	 * the timeout function instead. */
250	if (expect->expectant->helper
251	    && expect->expectant->helper->timeout
252	    && !del_timer(&expect->timeout))
253		return;
254
255	__unexpect_related(expect);
256}
257
258/* delete all unconfirmed expectations for this conntrack */
259static void remove_expectations(struct ip_conntrack *ct)
260{
261	struct list_head *exp_entry, *next;
262	struct ip_conntrack_expect *exp;
263
264	DEBUGP("remove_expectations(%p)\n", ct);
265
266	for (exp_entry = ct->sibling_list.next;
267	     exp_entry != &ct->sibling_list; exp_entry = next) {
268		next = exp_entry->next;
269		exp = list_entry(exp_entry, struct ip_conntrack_expect,
270				 expected_list);
271
272		/* we skip established expectations, as we want to delete
273		 * the un-established ones only */
274		if (exp->sibling) {
275			DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
276			continue;
277		}
278
279		IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
280		IP_NF_ASSERT(exp->expectant == ct);
281
282		/* delete expectation from global and private lists */
283		unexpect_related(exp);
284	}
285}
286
287static void
288clean_from_lists(struct ip_conntrack *ct)
289{
290	DEBUGP("clean_from_lists(%p)\n", ct);
291	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
292	/* Remove from both hash lists: must not NULL out next ptrs,
293           otherwise we'll look unconfirmed.  Fortunately, LIST_DELETE
294           doesn't do this. --RR */
295	LIST_DELETE(&ip_conntrack_hash
296		    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
297		    &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
298	LIST_DELETE(&ip_conntrack_hash
299		    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)],
300		    &ct->tuplehash[IP_CT_DIR_REPLY]);
301
302	/* Destroy all un-established, pending expectations */
303	remove_expectations(ct);
304}
305
306static void
307destroy_conntrack(struct nf_conntrack *nfct)
308{
309	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
310	struct ip_conntrack_protocol *proto;
311
312	DEBUGP("destroy_conntrack(%p)\n", ct);
313	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
314	IP_NF_ASSERT(!timer_pending(&ct->timeout));
315
316	if (ct->master && master_ct(ct))
317		ip_conntrack_put(master_ct(ct));
318
319	/* To make sure we don't get any weird locking issues here:
320	 * destroy_conntrack() MUST NOT be called with a write lock
321	 * to ip_conntrack_lock!!! -HW */
322	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
323	if (proto && proto->destroy)
324		proto->destroy(ct);
325
326	if (ip_conntrack_destroyed)
327		ip_conntrack_destroyed(ct);
328
329	WRITE_LOCK(&ip_conntrack_lock);
330	/* Delete our master expectation */
331	if (ct->master) {
332		/* can't call __unexpect_related here,
333		 * since it would screw up expect_list */
334		list_del(&ct->master->expected_list);
335		kfree(ct->master);
336	}
337	WRITE_UNLOCK(&ip_conntrack_lock);
338
339	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
340	kmem_cache_free(ip_conntrack_cachep, ct);
341	atomic_dec(&ip_conntrack_count);
342}
343
344static void death_by_timeout(unsigned long ul_conntrack)
345{
346	struct ip_conntrack *ct = (void *)ul_conntrack;
347
348	WRITE_LOCK(&ip_conntrack_lock);
349	clean_from_lists(ct);
350	WRITE_UNLOCK(&ip_conntrack_lock);
351	ip_conntrack_put(ct);
352}
353
354static inline int
355conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
356		    const struct ip_conntrack_tuple *tuple,
357		    const struct ip_conntrack *ignored_conntrack)
358{
359	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
360	return i->ctrack != ignored_conntrack
361		&& ip_ct_tuple_equal(tuple, &i->tuple);
362}
363
364static struct ip_conntrack_tuple_hash *
365__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
366		    const struct ip_conntrack *ignored_conntrack)
367{
368	struct ip_conntrack_tuple_hash *h;
369
370	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
371	h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)],
372		      conntrack_tuple_cmp,
373		      struct ip_conntrack_tuple_hash *,
374		      tuple, ignored_conntrack);
375	return h;
376}
377
378/* Find a connection corresponding to a tuple. */
379struct ip_conntrack_tuple_hash *
380ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
381		      const struct ip_conntrack *ignored_conntrack)
382{
383	struct ip_conntrack_tuple_hash *h;
384
385	READ_LOCK(&ip_conntrack_lock);
386	h = __ip_conntrack_find(tuple, ignored_conntrack);
387	if (h)
388		atomic_inc(&h->ctrack->ct_general.use);
389	READ_UNLOCK(&ip_conntrack_lock);
390
391	return h;
392}
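
/* A minimal usage sketch (hypothetical caller): the reference taken by
 * ip_conntrack_find_get() must be released with ip_conntrack_put(), as
 * getorigdst() below does:
 *
 *	struct ip_conntrack_tuple_hash *h;
 *
 *	h = ip_conntrack_find_get(&tuple, NULL);
 *	if (h) {
 *		... use h->ctrack ...
 *		ip_conntrack_put(h->ctrack);
 *	}
 */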
393
394static inline struct ip_conntrack *
395__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
396{
397	struct ip_conntrack *ct
398		= (struct ip_conntrack *)nfct->master;
399
400	/* ctinfo is the index of the nfct inside the conntrack */
401	*ctinfo = nfct - ct->infos;
402	IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
403	return ct;
404}
405
406/* Return conntrack and conntrack_info given skb->nfct->master */
407struct ip_conntrack *
408ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
409{
410	if (skb->nfct)
411		return __ip_conntrack_get(skb->nfct, ctinfo);
412	return NULL;
413}
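
/* A minimal sketch of how a hook or match might use this (the caller is
 * hypothetical; only the calls shown exist in this file):
 *
 *	enum ip_conntrack_info ctinfo;
 *	struct ip_conntrack *ct = ip_conntrack_get(skb, &ctinfo);
 *
 *	if (ct && ctinfo == IP_CT_ESTABLISHED)
 *		... packet belongs to an established connection ...
 *
 * No extra reference is taken here; skb->nfct already holds one for the
 * lifetime of the skb.
 */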
414
415/* Confirm a connection given skb->nfct; places it in hash table */
416int
417__ip_conntrack_confirm(struct nf_ct_info *nfct)
418{
419	unsigned int hash, repl_hash;
420	struct ip_conntrack *ct;
421	enum ip_conntrack_info ctinfo;
422
423	ct = __ip_conntrack_get(nfct, &ctinfo);
424
425	/* ipt_REJECT uses ip_conntrack_attach to attach related
426	   ICMP/TCP RST packets in other direction.  Actual packet
427	   which created connection will be IP_CT_NEW or for an
428	   expected connection, IP_CT_RELATED. */
429	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
430		return NF_ACCEPT;
431
432	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
433	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
434
435	/* We're not in hash table, and we refuse to set up related
436	   connections for unconfirmed conns.  But packet copies and
437	   REJECT will give spurious warnings here. */
438	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
439
	/* No external references means no one else could have
           confirmed us. */
442	IP_NF_ASSERT(!is_confirmed(ct));
443	DEBUGP("Confirming conntrack %p\n", ct);
444
445	WRITE_LOCK(&ip_conntrack_lock);
446	/* See if there's one in the list already, including reverse:
447           NAT could have grabbed it without realizing, since we're
448           not in the hash.  If there is, we lost race. */
449	if (!LIST_FIND(&ip_conntrack_hash[hash],
450		       conntrack_tuple_cmp,
451		       struct ip_conntrack_tuple_hash *,
452		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
453	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
454			  conntrack_tuple_cmp,
455			  struct ip_conntrack_tuple_hash *,
456			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
457		list_prepend(&ip_conntrack_hash[hash],
458			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
459		list_prepend(&ip_conntrack_hash[repl_hash],
460			     &ct->tuplehash[IP_CT_DIR_REPLY]);
461		/* Timer relative to confirmation time, not original
462		   setting time, otherwise we'd get timer wrap in
463		   weird delay cases. */
464		ct->timeout.expires += jiffies;
465		add_timer(&ct->timeout);
466		atomic_inc(&ct->ct_general.use);
467		WRITE_UNLOCK(&ip_conntrack_lock);
468		return NF_ACCEPT;
469	}
470
471	WRITE_UNLOCK(&ip_conntrack_lock);
472	return NF_DROP;
473}
474
/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
477int
478ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
479			 const struct ip_conntrack *ignored_conntrack)
480{
481	struct ip_conntrack_tuple_hash *h;
482
483	READ_LOCK(&ip_conntrack_lock);
484	h = __ip_conntrack_find(tuple, ignored_conntrack);
485	READ_UNLOCK(&ip_conntrack_lock);
486
487	return h != NULL;
488}
489
490/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
491struct ip_conntrack *
492icmp_error_track(struct sk_buff *skb,
493		 enum ip_conntrack_info *ctinfo,
494		 unsigned int hooknum)
495{
496	const struct iphdr *iph;
497	struct icmphdr *hdr;
498	struct ip_conntrack_tuple innertuple, origtuple;
499	struct iphdr *inner;
500	size_t datalen;
501	struct ip_conntrack_protocol *innerproto;
502	struct ip_conntrack_tuple_hash *h;
503
	/* Assign iph before the asserts below dereference it. */
	iph = skb->nh.iph;

	IP_NF_ASSERT(iph->protocol == IPPROTO_ICMP);
	IP_NF_ASSERT(skb->nfct == NULL);

	hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
	inner = (struct iphdr *)(hdr + 1);
	datalen = skb->len - iph->ihl*4 - sizeof(*hdr);
511
512	if (skb->len < iph->ihl * 4 + sizeof(*hdr) + sizeof(*iph)) {
513		DEBUGP("icmp_error_track: too short\n");
514		return NULL;
515	}
516
517	if (hdr->type != ICMP_DEST_UNREACH
518	    && hdr->type != ICMP_SOURCE_QUENCH
519	    && hdr->type != ICMP_TIME_EXCEEDED
520	    && hdr->type != ICMP_PARAMETERPROB
521	    && hdr->type != ICMP_REDIRECT)
522		return NULL;
523
524	/* Ignore ICMP's containing fragments (shouldn't happen) */
525	if (inner->frag_off & htons(IP_OFFSET)) {
526		DEBUGP("icmp_error_track: fragment of proto %u\n",
527		       inner->protocol);
528		return NULL;
529	}
530
531	/* Ignore it if the checksum's bogus. */
532	if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
533		DEBUGP("icmp_error_track: bad csum\n");
534		return NULL;
535	}
536
537	innerproto = ip_ct_find_proto(inner->protocol);
538	/* Are they talking about one of our connections? */
539	if (inner->ihl * 4 + 8 > datalen
540	    || !get_tuple(inner, datalen, &origtuple, innerproto)) {
541		DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
542		       inner->protocol, inner->ihl, 8,
543		       datalen);
544		return NULL;
545	}
546
547	/* Ordinarily, we'd expect the inverted tupleproto, but it's
548	   been preserved inside the ICMP. */
549	if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
550		DEBUGP("icmp_error_track: Can't invert tuple\n");
551		return NULL;
552	}
553
554	*ctinfo = IP_CT_RELATED;
555
556	h = ip_conntrack_find_get(&innertuple, NULL);
557	if (!h) {
558		/* Locally generated ICMPs will match inverted if they
559		   haven't been SNAT'ed yet */
560		if (hooknum == NF_IP_LOCAL_OUT)
561			h = ip_conntrack_find_get(&origtuple, NULL);
562
563		if (!h) {
564			DEBUGP("icmp_error_track: no match\n");
565			return NULL;
566		}
567		/* Reverse direction from that found */
568		if (DIRECTION(h) != IP_CT_DIR_REPLY)
569			*ctinfo += IP_CT_IS_REPLY;
570	} else {
571		if (DIRECTION(h) == IP_CT_DIR_REPLY)
572			*ctinfo += IP_CT_IS_REPLY;
573	}
574
575	/* Update skb to refer to this connection */
576	skb->nfct = &h->ctrack->infos[*ctinfo];
577	return h->ctrack;
578}
579
580/* There's a small race here where we may free a just-assured
581   connection.  Too bad: we're in trouble anyway. */
582static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
583{
584	return !(i->ctrack->status & IPS_ASSURED);
585}
586
587static int early_drop(struct list_head *chain)
588{
	/* Ideally we'd traverse backwards to get the oldest entry
	   (roughly LRU); LIST_FIND walks forwards, so we settle for
	   the first unreplied entry in the chain. */
590	struct ip_conntrack_tuple_hash *h;
591	int dropped = 0;
592
593	READ_LOCK(&ip_conntrack_lock);
594	h = LIST_FIND(chain, unreplied, struct ip_conntrack_tuple_hash *);
595	if (h)
596		atomic_inc(&h->ctrack->ct_general.use);
597	READ_UNLOCK(&ip_conntrack_lock);
598
599	if (!h)
600		return dropped;
601
602	if (del_timer(&h->ctrack->timeout)) {
603		death_by_timeout((unsigned long)h->ctrack);
604		dropped = 1;
605	}
606	ip_conntrack_put(h->ctrack);
607	return dropped;
608}
609
610static inline int helper_cmp(const struct ip_conntrack_helper *i,
611			     const struct ip_conntrack_tuple *rtuple)
612{
613	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
614}
615
616struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
617{
618	return LIST_FIND(&helpers, helper_cmp,
619			 struct ip_conntrack_helper *,
620			 tuple);
621}
622
/* Allocate a new conntrack: we return ERR_PTR(-ENOMEM) if we failed
   due to stress; a NULL return means the packet really is
   unclassifiable. */
625static struct ip_conntrack_tuple_hash *
626init_conntrack(const struct ip_conntrack_tuple *tuple,
627	       struct ip_conntrack_protocol *protocol,
628	       struct sk_buff *skb)
629{
630	struct ip_conntrack *conntrack;
631	struct ip_conntrack_tuple repl_tuple;
632	size_t hash, repl_hash;
633	struct ip_conntrack_expect *expected;
634	int i;
635	static unsigned int drop_next = 0;
636
637	hash = hash_conntrack(tuple);
638
639	if (ip_conntrack_max &&
640	    atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
		/* Try dropping from a random chain, or else from the
                   chain we're about to put this one into (in case
                   they're trying to bomb one hash chain). */
644		unsigned int next = (drop_next++)%ip_conntrack_htable_size;
645
646		if (!early_drop(&ip_conntrack_hash[next])
647		    && !early_drop(&ip_conntrack_hash[hash])) {
648			if (net_ratelimit())
649				printk(KERN_WARNING
650				       "ip_conntrack: table full, dropping"
651				       " packet.\n");
652			return ERR_PTR(-ENOMEM);
653		}
654	}
655
656	if (!invert_tuple(&repl_tuple, tuple, protocol)) {
657		DEBUGP("Can't invert tuple.\n");
658		return NULL;
659	}
660	repl_hash = hash_conntrack(&repl_tuple);
661
662	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
663	if (!conntrack) {
664		DEBUGP("Can't allocate conntrack.\n");
665		return ERR_PTR(-ENOMEM);
666	}
667
668	memset(conntrack, 0, sizeof(*conntrack));
669	atomic_set(&conntrack->ct_general.use, 1);
670	conntrack->ct_general.destroy = destroy_conntrack;
671	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
672	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
673	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
674	conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
675	for (i=0; i < IP_CT_NUMBER; i++)
676		conntrack->infos[i].master = &conntrack->ct_general;
677
678	if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
679		kmem_cache_free(ip_conntrack_cachep, conntrack);
680		return NULL;
681	}
682	/* Don't set timer yet: wait for confirmation */
683	init_timer(&conntrack->timeout);
684	conntrack->timeout.data = (unsigned long)conntrack;
685	conntrack->timeout.function = death_by_timeout;
686
687	INIT_LIST_HEAD(&conntrack->sibling_list);
688
689	/* Mark clearly that it's not in the hash table. */
690	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL;
691
692	WRITE_LOCK(&ip_conntrack_lock);
	/* Finding and deleting the expectation is only needed if we win the race */
694	READ_LOCK(&ip_conntrack_expect_tuple_lock);
695	expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
696			     struct ip_conntrack_expect *, tuple);
697	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
698
699	/* Look up the conntrack helper for master connections only */
700	if (!expected)
701		conntrack->helper = ip_ct_find_helper(&repl_tuple);
702
	/* If the expectation is dying, then this is a loser. */
704	if (expected
705	    && expected->expectant->helper
706	    && expected->expectant->helper->timeout
707	    && ! del_timer(&expected->timeout))
708		expected = NULL;
709
710	/* If master is not in hash table yet (ie. packet hasn't left
711	   this machine yet), how can other end know about expected?
712	   Hence these are not the droids you are looking for (if
713	   master ct never got confirmed, we'd hold a reference to it
714	   and weird things would happen to future packets). */
715	if (expected && is_confirmed(expected->expectant)) {
716		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
717			conntrack, expected);
718		/* Welcome, Mr. Bond.  We've been expecting you... */
719		IP_NF_ASSERT(master_ct(conntrack));
720		conntrack->status = IPS_EXPECTED;
721		conntrack->master = expected;
722		expected->sibling = conntrack;
723		LIST_DELETE(&ip_conntrack_expect_list, expected);
724		INIT_LIST_HEAD(&expected->list);
725		expected->expectant->expecting--;
726		nf_conntrack_get(&master_ct(conntrack)->infos[0]);
727	}
728	atomic_inc(&ip_conntrack_count);
729	WRITE_UNLOCK(&ip_conntrack_lock);
730
731	if (expected && is_confirmed(expected->expectant) && expected->expectfn)
732		expected->expectfn(conntrack);
733	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
734}
735
736/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
737static inline struct ip_conntrack *
738resolve_normal_ct(struct sk_buff *skb,
739		  struct ip_conntrack_protocol *proto,
740		  int *set_reply,
741		  unsigned int hooknum,
742		  enum ip_conntrack_info *ctinfo)
743{
744	struct ip_conntrack_tuple tuple;
745	struct ip_conntrack_tuple_hash *h;
746
747	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
748
749	if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto))
750		return NULL;
751
752	/* look for tuple match */
753	h = ip_conntrack_find_get(&tuple, NULL);
754	if (!h) {
755		h = init_conntrack(&tuple, proto, skb);
756		if (!h)
757			return NULL;
758		if (IS_ERR(h))
759			return (void *)h;
760	}
761
762	/* It exists; we have (non-exclusive) reference. */
763	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
764		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Caller should set the reply bit if this packet is OK */
766		*set_reply = 1;
767	} else {
768		/* Once we've had two way comms, always ESTABLISHED. */
769		if (h->ctrack->status & IPS_SEEN_REPLY) {
770			DEBUGP("ip_conntrack_in: normal packet for %p\n",
771			       h->ctrack);
772		        *ctinfo = IP_CT_ESTABLISHED;
773		} else if (h->ctrack->status & IPS_EXPECTED) {
774			DEBUGP("ip_conntrack_in: related packet for %p\n",
775			       h->ctrack);
776			*ctinfo = IP_CT_RELATED;
777		} else {
778			DEBUGP("ip_conntrack_in: new packet for %p\n",
779			       h->ctrack);
780			*ctinfo = IP_CT_NEW;
781		}
782		*set_reply = 0;
783	}
784	skb->nfct = &h->ctrack->infos[*ctinfo];
785	return h->ctrack;
786}
787
788/* Netfilter hook itself. */
789unsigned int ip_conntrack_in(unsigned int hooknum,
790			     struct sk_buff **pskb,
791			     const struct net_device *in,
792			     const struct net_device *out,
793			     int (*okfn)(struct sk_buff *))
794{
795	struct ip_conntrack *ct;
796	enum ip_conntrack_info ctinfo;
797	struct ip_conntrack_protocol *proto;
798	int set_reply;
799	int ret;
800
801	(*pskb)->nfcache |= NFC_UNKNOWN;
802
/* We don't bother ignoring broadcasts here: doing so wouldn't cover
   locally-generated broadcast, so it's not worth it. */
804
805	/* Previously seen (loopback)?  Ignore.  Do this before
806           fragment check. */
807	if ((*pskb)->nfct)
808		return NF_ACCEPT;
809
810	/* Gather fragments. */
811	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
812		*pskb = ip_ct_gather_frags(*pskb);
813		if (!*pskb)
814			return NF_STOLEN;
815	}
816
817	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
818
819	/* It may be an icmp error... */
820	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
821	    && icmp_error_track(*pskb, &ctinfo, hooknum))
822		return NF_ACCEPT;
823
824	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
825		/* Not valid part of a connection */
826		return NF_ACCEPT;
827
828	if (IS_ERR(ct))
829		/* Too stressed to deal. */
830		return NF_DROP;
831
832	IP_NF_ASSERT((*pskb)->nfct);
833
834	ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
835	if (ret == -1) {
836		/* Invalid */
837		nf_conntrack_put((*pskb)->nfct);
838		(*pskb)->nfct = NULL;
839		return NF_ACCEPT;
840	}
841
842	if (ret != NF_DROP && ct->helper) {
843		ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
844				       ct, ctinfo);
845		if (ret == -1) {
846			/* Invalid */
847			nf_conntrack_put((*pskb)->nfct);
848			(*pskb)->nfct = NULL;
849			return NF_ACCEPT;
850		}
851	}
852	if (set_reply)
853		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
854
855	return ret;
856}
857
858int invert_tuplepr(struct ip_conntrack_tuple *inverse,
859		   const struct ip_conntrack_tuple *orig)
860{
861	return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
862}
863
864static inline int resent_expect(const struct ip_conntrack_expect *i,
865			        const struct ip_conntrack_tuple *tuple,
866			        const struct ip_conntrack_tuple *mask)
867{
868	DEBUGP("resent_expect\n");
869	DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
870	DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
871	DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
872	return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
873	         || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
874		&& ip_ct_tuple_equal(&i->mask, mask));
875}
876
877/* Would two expected things clash? */
878static inline int expect_clash(const struct ip_conntrack_expect *i,
879			       const struct ip_conntrack_tuple *tuple,
880			       const struct ip_conntrack_tuple *mask)
881{
	/* If the parts covered by the intersection of the masks are
           equal, the two expectations clash */
884	struct ip_conntrack_tuple intersect_mask
885		= { { i->mask.src.ip & mask->src.ip,
886		      { i->mask.src.u.all & mask->src.u.all } },
887		    { i->mask.dst.ip & mask->dst.ip,
888		      { i->mask.dst.u.all & mask->dst.u.all },
889		      i->mask.dst.protonum & mask->dst.protonum } };
890
891	return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
892}
893
894inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
895{
896	WRITE_LOCK(&ip_conntrack_lock);
897	unexpect_related(expect);
898	WRITE_UNLOCK(&ip_conntrack_lock);
899}
900
901static void expectation_timed_out(unsigned long ul_expect)
902{
903	struct ip_conntrack_expect *expect = (void *) ul_expect;
904
905	DEBUGP("expectation %p timed out\n", expect);
906	WRITE_LOCK(&ip_conntrack_lock);
907	__unexpect_related(expect);
908	WRITE_UNLOCK(&ip_conntrack_lock);
909}
910
911/* Add a related connection. */
912int ip_conntrack_expect_related(struct ip_conntrack *related_to,
913				struct ip_conntrack_expect *expect)
914{
915	struct ip_conntrack_expect *old, *new;
916	int ret = 0;
917
918	WRITE_LOCK(&ip_conntrack_lock);
919	/* Because of the write lock, no reader can walk the lists,
920	 * so there is no need to use the tuple lock too */
921
922	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
923	DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
924	DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);
925
926	old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
927		        struct ip_conntrack_expect *, &expect->tuple,
928			&expect->mask);
929	if (old) {
		/* Helper private data may contain offsets but no pointers
		   pointing into the payload - otherwise we would have to copy
		   the data filled out by the helper over the old one */
933		DEBUGP("expect_related: resent packet\n");
934		if (related_to->helper &&
935		    related_to->helper->timeout) {
936			if (!del_timer(&old->timeout)) {
937				/* expectation is dying. Fall through */
938				old = NULL;
939			} else {
940				old->timeout.expires = jiffies +
941					related_to->helper->timeout * HZ;
942				add_timer(&old->timeout);
943			}
944		}
945
946		if (old) {
947			WRITE_UNLOCK(&ip_conntrack_lock);
948			return -EEXIST;
949		}
950	} else if (related_to->helper &&
951		   related_to->helper->max_expected &&
952		   related_to->expecting >= related_to->helper->max_expected) {
953		struct list_head *cur_item;
954		/* old == NULL */
955		if (!(related_to->helper->flags &
956		      IP_CT_HELPER_F_REUSE_EXPECT)) {
957			WRITE_UNLOCK(&ip_conntrack_lock);
958 		    	if (net_ratelimit())
959 			    	printk(KERN_WARNING
960				       "ip_conntrack: max number of expected "
961				       "connections %i of %s reached for "
962				       "%u.%u.%u.%u->%u.%u.%u.%u\n",
963				       related_to->helper->max_expected,
964				       related_to->helper->name,
965 		    	       	       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
966 		    	       	       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
967			return -EPERM;
968		}
969		DEBUGP("ip_conntrack: max number of expected "
970		       "connections %i of %s reached for "
971		       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
972 		       related_to->helper->max_expected,
973		       related_to->helper->name,
974		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
975		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
976
		/* choose the oldest expectation to evict */
978		list_for_each(cur_item, &related_to->sibling_list) {
979			struct ip_conntrack_expect *cur;
980
981			cur = list_entry(cur_item,
982					 struct ip_conntrack_expect,
983					 expected_list);
984			if (cur->sibling == NULL) {
985				old = cur;
986				break;
987			}
988		}
989
990		/* (!old) cannot happen, since related_to->expecting is the
991		 * number of unconfirmed expects */
992		IP_NF_ASSERT(old);
993
		/* newnat14 does not reuse the allocated memory
		 * structures but rather unexpects the old entry and
		 * allocates a new one.  unexpect_related will decrement
		 * related_to->expecting.
		 */
999		unexpect_related(old);
1000		ret = -EPERM;
1001	} else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1002			     struct ip_conntrack_expect *, &expect->tuple,
1003			     &expect->mask)) {
1004		WRITE_UNLOCK(&ip_conntrack_lock);
1005		DEBUGP("expect_related: busy!\n");
1006		return -EBUSY;
1007	}
1008
1009	new = (struct ip_conntrack_expect *)
1010	      kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
1011	if (!new) {
1012		WRITE_UNLOCK(&ip_conntrack_lock);
		DEBUGP("expect_related: OOM allocating expect\n");
1014		return -ENOMEM;
1015	}
1016
	/* Zero out the new structure, then fill it in with the data */
1018	DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
1019	memset(new, 0, sizeof(*expect));
1020	INIT_LIST_HEAD(&new->list);
1021	INIT_LIST_HEAD(&new->expected_list);
1022	memcpy(new, expect, sizeof(*expect));
1023	new->expectant = related_to;
1024	new->sibling = NULL;
1025	/* increase usage count. This sucks. The memset above overwrites
1026	 * old usage count [if still present] and we increase to one.  Only
1027	 * works because everything is done under ip_conntrack_lock() */
1028	atomic_inc(&new->use);
1029
1030	/* add to expected list for this connection */
1031	list_add(&new->expected_list, &related_to->sibling_list);
1032	/* add to global list of expectations */
1033	list_prepend(&ip_conntrack_expect_list, &new->list);
1034	/* add and start timer if required */
1035	if (related_to->helper &&
1036	    related_to->helper->timeout) {
1037		init_timer(&new->timeout);
1038		new->timeout.data = (unsigned long)new;
1039		new->timeout.function = expectation_timed_out;
1040		new->timeout.expires = jiffies +
1041					related_to->helper->timeout * HZ;
1042		add_timer(&new->timeout);
1043	}
1044	related_to->expecting++;
1045
1046	WRITE_UNLOCK(&ip_conntrack_lock);
1047
1048	return ret;
1049}
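
/* A hedged sketch of how a conntrack helper typically adds an expectation
 * from its help() callback.  The helper, data_port and the chosen field
 * values are hypothetical; the structure is copied into a freshly
 * allocated expect above, so a stack-allocated template is fine:
 *
 *	struct ip_conntrack_expect exp;
 *
 *	memset(&exp, 0, sizeof(exp));
 *	exp.tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
 *	exp.tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
 *	exp.tuple.dst.u.tcp.port = htons(data_port);
 *	exp.tuple.dst.protonum = IPPROTO_TCP;
 *	exp.mask.src.ip = 0xFFFFFFFF;
 *	exp.mask.dst.ip = 0xFFFFFFFF;
 *	exp.mask.dst.u.tcp.port = 0xFFFF;
 *	exp.mask.dst.protonum = 0xFFFF;
 *	exp.expectfn = NULL;
 *
 *	ip_conntrack_expect_related(ct, &exp);
 */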
1050
1051/* Change tuple in an existing expectation */
1052int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
1053			       struct ip_conntrack_tuple *newtuple)
1054{
1055	int ret;
1056
1057	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
1058	WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
1059	DEBUGP("change_expect:\n");
1060	DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
1061	DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
1062	DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
1063	if (expect->ct_tuple.dst.protonum == 0) {
1064		/* Never seen before */
1065		DEBUGP("change expect: never seen before\n");
1066		if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
1067		    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1068			         struct ip_conntrack_expect *, newtuple, &expect->mask)) {
1069			/* Force NAT to find an unused tuple */
1070			ret = -1;
1071		} else {
1072			memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1073			memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1074			ret = 0;
1075		}
1076	} else {
1077		/* Resent packet */
1078		DEBUGP("change expect: resent packet\n");
1079		if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1080			ret = 0;
1081		} else {
			/* Force NAT to choose the same port again */
1083			ret = -1;
1084		}
1085	}
1086	WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1087
1088	return ret;
1089}
1090
1091/* Alter reply tuple (maybe alter helper).  If it's already taken,
1092   return 0 and don't do alteration. */
1093int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1094			     const struct ip_conntrack_tuple *newreply)
1095{
1096	WRITE_LOCK(&ip_conntrack_lock);
1097	if (__ip_conntrack_find(newreply, conntrack)) {
1098		WRITE_UNLOCK(&ip_conntrack_lock);
1099		return 0;
1100	}
1101	/* Should be unconfirmed, so not in hash table yet */
1102	IP_NF_ASSERT(!is_confirmed(conntrack));
1103
1104	DEBUGP("Altering reply tuple of %p to ", conntrack);
1105	DUMP_TUPLE(newreply);
1106
1107	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1108	if (!conntrack->master)
1109		conntrack->helper = LIST_FIND(&helpers, helper_cmp,
1110					      struct ip_conntrack_helper *,
1111					      newreply);
1112	WRITE_UNLOCK(&ip_conntrack_lock);
1113
1114	return 1;
1115}
1116
1117int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1118{
1119	MOD_INC_USE_COUNT;
1120
1121	WRITE_LOCK(&ip_conntrack_lock);
1122	list_prepend(&helpers, me);
1123	WRITE_UNLOCK(&ip_conntrack_lock);
1124
1125	return 0;
1126}
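
/* A hedged sketch of how a helper module registers itself (the helper and
 * its port are hypothetical; only fields referenced in this file are
 * shown).  The tuple/mask pair is matched against a connection's reply
 * tuple, see ip_ct_find_helper() and init_conntrack() above:
 *
 *	static struct ip_conntrack_helper my_helper;
 *
 *	my_helper.name = "my_proto";
 *	my_helper.max_expected = 1;
 *	my_helper.timeout = 5 * 60;
 *	my_helper.tuple.src.u.tcp.port = htons(MY_PORT);
 *	my_helper.tuple.dst.protonum = IPPROTO_TCP;
 *	my_helper.mask.src.u.tcp.port = 0xFFFF;
 *	my_helper.mask.dst.protonum = 0xFFFF;
 *	my_helper.help = my_help;
 *
 *	ip_conntrack_helper_register(&my_helper);
 *
 * and unregisters with ip_conntrack_helper_unregister(&my_helper) on
 * cleanup.
 */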
1127
1128static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1129			 const struct ip_conntrack_helper *me)
1130{
1131	if (i->ctrack->helper == me) {
1132		/* Get rid of any expected. */
1133		remove_expectations(i->ctrack);
1134		/* And *then* set helper to NULL */
1135		i->ctrack->helper = NULL;
1136	}
1137	return 0;
1138}
1139
1140void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1141{
1142	unsigned int i;
1143
1144	/* Need write lock here, to delete helper. */
1145	WRITE_LOCK(&ip_conntrack_lock);
1146	LIST_DELETE(&helpers, me);
1147
1148	/* Get rid of expecteds, set helpers to NULL. */
1149	for (i = 0; i < ip_conntrack_htable_size; i++)
1150		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1151			    struct ip_conntrack_tuple_hash *, me);
1152	WRITE_UNLOCK(&ip_conntrack_lock);
1153
	/* Someone could still be looking at the helper in a bh. */
1155	br_write_lock_bh(BR_NETPROTO_LOCK);
1156	br_write_unlock_bh(BR_NETPROTO_LOCK);
1157
1158	MOD_DEC_USE_COUNT;
1159}
1160
1161/* Refresh conntrack for this many jiffies. */
1162void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
1163{
1164	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1165
1166	WRITE_LOCK(&ip_conntrack_lock);
1167	/* If not in hash table, timer will not be active yet */
1168	if (!is_confirmed(ct))
1169		ct->timeout.expires = extra_jiffies;
1170	else {
1171		/* Need del_timer for race avoidance (may already be dying). */
1172		if (del_timer(&ct->timeout)) {
1173			ct->timeout.expires = jiffies + extra_jiffies;
1174			add_timer(&ct->timeout);
1175		}
1176	}
1177	WRITE_UNLOCK(&ip_conntrack_lock);
1178}
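
/* A minimal usage sketch: a protocol module's packet() handler typically
 * calls this on every packet to push the connection's death timer further
 * into the future (MY_PROTO_TIMEOUT is a hypothetical per-protocol
 * timeout in seconds):
 *
 *	ip_ct_refresh(ct, MY_PROTO_TIMEOUT * HZ);
 */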
1179
1180/* Returns new sk_buff, or NULL */
1181struct sk_buff *
1182ip_ct_gather_frags(struct sk_buff *skb)
1183{
1184	struct sock *sk = skb->sk;
1185#ifdef CONFIG_NETFILTER_DEBUG
1186	unsigned int olddebug = skb->nf_debug;
1187#endif
1188	if (sk) {
1189		sock_hold(sk);
1190		skb_orphan(skb);
1191	}
1192
1193	local_bh_disable();
1194	skb = ip_defrag(skb);
1195	local_bh_enable();
1196
1197	if (!skb) {
1198		if (sk) sock_put(sk);
1199		return skb;
1200	} else if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
1201		kfree_skb(skb);
1202		if (sk) sock_put(sk);
1203		return NULL;
1204	}
1205
1206	if (sk) {
1207		skb_set_owner_w(skb, sk);
1208		sock_put(sk);
1209	}
1210
1211	ip_send_check(skb->nh.iph);
1212	skb->nfcache |= NFC_ALTERED;
1213#ifdef CONFIG_NETFILTER_DEBUG
1214	/* Packet path as if nothing had happened. */
1215	skb->nf_debug = olddebug;
1216#endif
1217	return skb;
1218}
1219
1220/* Used by ipt_REJECT. */
1221static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
1222{
1223	struct ip_conntrack *ct;
1224	enum ip_conntrack_info ctinfo;
1225
1226	ct = __ip_conntrack_get(nfct, &ctinfo);
1227
1228	/* This ICMP is in reverse direction to the packet which
1229           caused it */
1230	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1231		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1232	else
1233		ctinfo = IP_CT_RELATED;
1234
1235	/* Attach new skbuff, and increment count */
1236	nskb->nfct = &ct->infos[ctinfo];
1237	atomic_inc(&ct->ct_general.use);
1238}
1239
1240static inline int
1241do_kill(const struct ip_conntrack_tuple_hash *i,
1242	int (*kill)(const struct ip_conntrack *i, void *data),
1243	void *data)
1244{
1245	return kill(i->ctrack, data);
1246}
1247
1248/* Bring out ya dead! */
1249static struct ip_conntrack_tuple_hash *
1250get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1251		void *data)
1252{
1253	struct ip_conntrack_tuple_hash *h = NULL;
1254	unsigned int i;
1255
1256	READ_LOCK(&ip_conntrack_lock);
1257	for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
1258		h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
1259			      struct ip_conntrack_tuple_hash *, kill, data);
1260	}
1261	if (h)
1262		atomic_inc(&h->ctrack->ct_general.use);
1263	READ_UNLOCK(&ip_conntrack_lock);
1264
1265	return h;
1266}
1267
1268void
1269ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1270			void *data)
1271{
1272	struct ip_conntrack_tuple_hash *h;
1273
1274	/* This is order n^2, by the way. */
1275	while ((h = get_next_corpse(kill, data)) != NULL) {
		/* Time to push up daisies... */
1277		if (del_timer(&h->ctrack->timeout))
1278			death_by_timeout((unsigned long)h->ctrack);
1279		/* ... else the timer will get him soon. */
1280
1281		ip_conntrack_put(h->ctrack);
1282	}
1283}
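
/* A hedged example of a kill predicate (hypothetical; this file itself
 * only uses the kill_all() predicate below): evict every conntrack whose
 * original direction uses a given protocol.
 *
 *	static int kill_proto(const struct ip_conntrack *i, void *data)
 *	{
 *		return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
 *			== *(u_int8_t *)data;
 *	}
 *
 *	u_int8_t proto = IPPROTO_UDP;
 *	ip_ct_selective_cleanup(kill_proto, &proto);
 */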
1284
1285/* Fast function for those who don't want to parse /proc (and I don't
1286   blame them). */
1287/* Reversing the socket's dst/src point of view gives us the reply
1288   mapping. */
1289static int
1290getorigdst(struct sock *sk, int optval, void *user, int *len)
1291{
1292	struct ip_conntrack_tuple_hash *h;
1293	struct ip_conntrack_tuple tuple = { { sk->rcv_saddr, { sk->sport } },
1294					    { sk->daddr, { sk->dport },
1295					      IPPROTO_TCP } };
1296
1297	/* We only do TCP at the moment: is there a better way? */
1298	if (strcmp(sk->prot->name, "TCP") != 0) {
1299		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1300		return -ENOPROTOOPT;
1301	}
1302
1303	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1304		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1305		       *len, sizeof(struct sockaddr_in));
1306		return -EINVAL;
1307	}
1308
1309	h = ip_conntrack_find_get(&tuple, NULL);
1310	if (h) {
1311		struct sockaddr_in sin;
1312
1313		sin.sin_family = AF_INET;
1314		sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1315			.tuple.dst.u.tcp.port;
1316		sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1317			.tuple.dst.ip;
1318
1319		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1320		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1321		ip_conntrack_put(h->ctrack);
1322		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1323			return -EFAULT;
1324		else
1325			return 0;
1326	}
1327	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1328	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1329	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1330	return -ENOENT;
1331}
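
/* A hedged sketch of the userspace side: a transparent proxy that has
 * accepted a REDIRECTed connection can recover the pre-NAT destination
 * with this socket option (fd is a connected TCP socket):
 *
 *	struct sockaddr_in orig;
 *	socklen_t len = sizeof(orig);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0)
 *		... orig.sin_addr / orig.sin_port hold the original dst ...
 *
 * Only TCP sockets are handled, as checked above.
 */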
1332
1333static struct nf_sockopt_ops so_getorigdst
1334= { { NULL, NULL }, PF_INET,
1335    0, 0, NULL, /* Setsockopts */
1336    SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
1337    0, NULL };
1338
1339#define NET_IP_CONNTRACK_MAX 2089
1340#define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max"
1341
1342#ifdef CONFIG_SYSCTL
1343static struct ctl_table_header *ip_conntrack_sysctl_header;
1344
1345static ctl_table ip_conntrack_table[] = {
1346	{ NET_IP_CONNTRACK_MAX, NET_IP_CONNTRACK_MAX_NAME, &ip_conntrack_max,
1347		sizeof(ip_conntrack_max), 0644,  NULL, proc_dointvec },
1348 	{ 0 }
1349};
1350
1351static ctl_table ip_conntrack_dir_table[] = {
1352	{NET_IPV4, "ipv4", NULL, 0, 0555, ip_conntrack_table, 0, 0, 0, 0, 0},
1353	{ 0 }
1354};
1355
1356static ctl_table ip_conntrack_root_table[] = {
1357	{CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0},
1358	{ 0 }
1359};
1360#endif /*CONFIG_SYSCTL*/
1361
1362static int kill_all(const struct ip_conntrack *i, void *data)
1363{
1364	return 1;
1365}
1366
1367/* Mishearing the voices in his head, our hero wonders how he's
1368   supposed to kill the mall. */
1369void ip_conntrack_cleanup(void)
1370{
1371#ifdef CONFIG_SYSCTL
1372	unregister_sysctl_table(ip_conntrack_sysctl_header);
1373#endif
1374	ip_ct_attach = NULL;
1375	/* This makes sure all current packets have passed through
1376           netfilter framework.  Roll on, two-stage module
1377           delete... */
1378	br_write_lock_bh(BR_NETPROTO_LOCK);
1379	br_write_unlock_bh(BR_NETPROTO_LOCK);
1380
1381 i_see_dead_people:
1382	ip_ct_selective_cleanup(kill_all, NULL);
1383	if (atomic_read(&ip_conntrack_count) != 0) {
1384		schedule();
1385		goto i_see_dead_people;
1386	}
1387
1388	kmem_cache_destroy(ip_conntrack_cachep);
1389	vfree(ip_conntrack_hash);
1390	nf_unregister_sockopt(&so_getorigdst);
1391}
1392
1393static int hashsize = 0;
1394MODULE_PARM(hashsize, "i");
1395
1396int __init ip_conntrack_init(void)
1397{
1398	unsigned int i;
1399	int ret;
1400
1401	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1402	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1403 	if (hashsize) {
1404 		ip_conntrack_htable_size = hashsize;
1405 	} else {
1406		ip_conntrack_htable_size
1407			= (((num_physpages << PAGE_SHIFT) / 16384)
1408			   / sizeof(struct list_head));
		if (num_physpages >= (1024 * 1024 * 1024 / PAGE_SIZE))
1410			ip_conntrack_htable_size = 8192;
1411		if (ip_conntrack_htable_size < 16)
1412			ip_conntrack_htable_size = 16;
1413	}
1414	ip_conntrack_max = 8 * ip_conntrack_htable_size;
1415
1416	printk("ip_conntrack version %s (%u buckets, %d max)"
1417	       " - %d bytes per conntrack\n", IP_CONNTRACK_VERSION,
1418	       ip_conntrack_htable_size, ip_conntrack_max,
1419	       sizeof(struct ip_conntrack));
1420
1421	ret = nf_register_sockopt(&so_getorigdst);
1422	if (ret != 0) {
1423		printk(KERN_ERR "Unable to register netfilter socket option\n");
1424		return ret;
1425	}
1426
1427	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1428				    * ip_conntrack_htable_size);
1429	if (!ip_conntrack_hash) {
1430		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1431		goto err_unreg_sockopt;
1432	}
1433
1434	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1435	                                        sizeof(struct ip_conntrack), 0,
1436	                                        SLAB_HWCACHE_ALIGN, NULL, NULL);
1437	if (!ip_conntrack_cachep) {
1438		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1439		goto err_free_hash;
1440	}
1441	/* Don't NEED lock here, but good form anyway. */
1442	WRITE_LOCK(&ip_conntrack_lock);
1443	/* Sew in builtin protocols. */
1444	list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1445	list_append(&protocol_list, &ip_conntrack_protocol_udp);
1446	list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1447	WRITE_UNLOCK(&ip_conntrack_lock);
1448
1449	for (i = 0; i < ip_conntrack_htable_size; i++)
1450		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1451
1452/* This is fucking braindead.  There is NO WAY of doing this without
1453   the CONFIG_SYSCTL unless you don't want to detect errors.
1454   Grrr... --RR */
1455#ifdef CONFIG_SYSCTL
1456	ip_conntrack_sysctl_header
1457		= register_sysctl_table(ip_conntrack_root_table, 0);
1458	if (ip_conntrack_sysctl_header == NULL) {
1459		goto err_free_ct_cachep;
1460	}
1461#endif /*CONFIG_SYSCTL*/
1462
1463	/* For use by ipt_REJECT */
1464	ip_ct_attach = ip_conntrack_attach;
1465	return ret;
1466
1467err_free_ct_cachep:
1468	kmem_cache_destroy(ip_conntrack_cachep);
1469err_free_hash:
1470	vfree(ip_conntrack_hash);
1471err_unreg_sockopt:
1472	nf_unregister_sockopt(&so_getorigdst);
1473
1474	return -ENOMEM;
1475}
1476