/* NAT for netfilter; shared with compatibility layer. */

/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
   Public Licence. */
#ifdef MODULE
#define __NO_VERSION__
#endif
#include <linux/version.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/brlock.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */

#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define DEBUGP(format, args...)

DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
static struct list_head *byipsproto;
LIST_HEAD(protos);
LIST_HEAD(helpers);

extern struct ip_nat_protocol unknown_nat_protocol;

/* We keep extra hashes for each conntrack, for fast searching. */
static inline size_t
hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
{
	/* Modified src and dst, to ensure we don't create two
	   identical streams. */
	return (src + dst + proto) % ip_nat_htable_size;
}

static inline size_t
hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
{
	/* Original src, to ensure we map it consistently if poss. */
	return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
}
/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
	struct ip_nat_info *info = &conn->nat.info;

	if (!info->initialized)
		return;

	IP_NF_ASSERT(info->bysource.conntrack);
	IP_NF_ASSERT(info->byipsproto.conntrack);

	WRITE_LOCK(&ip_nat_lock);
	LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
					  .tuple.src,
					  conn->tuplehash[IP_CT_DIR_ORIGINAL]
					  .tuple.dst.protonum)],
		    &info->bysource);

	LIST_DELETE(&byipsproto
		    [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
				      .tuple.src.ip,
				      conn->tuplehash[IP_CT_DIR_REPLY]
				      .tuple.dst.ip,
				      conn->tuplehash[IP_CT_DIR_REPLY]
				      .tuple.dst.protonum)],
		    &info->byipsproto);
	WRITE_UNLOCK(&ip_nat_lock);
}

/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
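/* This is the incremental checksum update trick (cf. RFC 1624):
 * folding the one's complement of the old value and the new value,
 * seeded with the inverted old checksum, yields the updated checksum
 * without touching the rest of the packet. */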
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
	u_int32_t diffs[] = { oldvalinv, newval };
	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
				      oldcheck^0xFFFF));
}

static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
{
	return i->protonum == proto;
}

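/* Look up the NAT protocol handler for this protocol number; falls
   back to unknown_nat_protocol if none is registered.  Caller must
   hold ip_nat_lock for reading. */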
struct ip_nat_protocol *
find_nat_proto(u_int16_t protonum)
{
	struct ip_nat_protocol *i;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
	if (!i)
		i = &unknown_nat_protocol;
	return i;
}

/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
		  const struct ip_conntrack *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	   incoming ones.  NAT means they don't have a fixed mapping,
	   so we invert the tuple and look for the incoming reply.

	   We could keep a separate hash if this proves too slow. */
	struct ip_conntrack_tuple reply;

	invert_tuplepr(&reply, tuple);
	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}

/* Does tuple + the source manip come within the range mr? */
static int
in_range(const struct ip_conntrack_tuple *tuple,
	 const struct ip_conntrack_manip *manip,
	 const struct ip_nat_multi_range *mr)
{
	struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
	unsigned int i;
	struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };

	for (i = 0; i < mr->rangesize; i++) {
		/* If we are allowed to map IPs, then we must be in the
		   range specified, otherwise we must be unchanged. */
		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
			    || (ntohl(newtuple.src.ip)
				> ntohl(mr->range[i].max_ip)))
				continue;
		} else {
			if (newtuple.src.ip != tuple->src.ip)
				continue;
		}

		if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		    && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
				       &mr->range[i].min, &mr->range[i].max))
			return 1;
	}
	return 0;
}

static inline int
src_cmp(const struct ip_nat_hash *i,
	const struct ip_conntrack_tuple *tuple,
	const struct ip_nat_multi_range *mr)
{
	return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
		== tuple->dst.protonum
		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
		== tuple->src.ip
		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
		== tuple->src.u.all
		&& in_range(tuple,
			    &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			    .tuple.src,
			    mr));
}

/* Only called for SRC manip */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
		     const struct ip_nat_multi_range *mr)
{
	unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
	struct ip_nat_hash *i;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
	if (i)
		return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
	else
		return NULL;
}

#ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
	struct rtable *rt;

	if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
		       NIPQUAD(var_ip));
		return 0;
	}

	*other_ipp = rt->rt_src;
	ip_rt_put(rt);
	return 1;
}
#endif

/* Simple way to iterate through all. */
static inline int fake_cmp(const struct ip_nat_hash *i,
			   u_int32_t src, u_int32_t dst, u_int16_t protonum,
			   unsigned int *score,
			   const struct ip_conntrack *conntrack)
{
	/* Compare backwards: we're dealing with OUTGOING tuples, and
	   inside the conntrack is the REPLY tuple.  Don't count this
	   conntrack. */
	if (i->conntrack != conntrack
	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
	    && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
		== protonum))
		(*score)++;
	return 0;
}

static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
	   const struct ip_conntrack *conntrack)
{
	unsigned int score = 0;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
		  fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
		  conntrack);

	return score;
}

/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have.  */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
		    const struct ip_nat_multi_range *mr,
		    const struct ip_conntrack *conntrack,
		    unsigned int hooknum)
{
	unsigned int i;
	struct {
		const struct ip_nat_range *range;
		unsigned int score;
		struct ip_conntrack_tuple tuple;
	} best = { NULL,  0xFFFFFFFF };
	u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
	static unsigned int randomness = 0;

	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
		var_ipp = &tuple->src.ip;
		saved_ip = tuple->dst.ip;
		other_ipp = &tuple->dst.ip;
	} else {
		var_ipp = &tuple->dst.ip;
		saved_ip = tuple->src.ip;
		other_ipp = &tuple->src.ip;
	}
	/* Don't do do_extra_mangle unless necessary (overrides
	   explicit socket bindings, for example) */
	orig_dstip = tuple->dst.ip;

	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++) {
		/* Host order */
		u_int32_t minip, maxip, j;

		/* Don't do ranges which are already eliminated. */
		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
			continue;
		}

		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			minip = ntohl(mr->range[i].min_ip);
			maxip = ntohl(mr->range[i].max_ip);
		} else
			minip = maxip = ntohl(*var_ipp);

		randomness++;
		for (j = 0; j < maxip - minip + 1; j++) {
			unsigned int score;

			*var_ipp = htonl(minip + (randomness + j)
					 % (maxip - minip + 1));

			/* Reset the other ip in case it was mangled by
			 * do_extra_mangle last time. */
			*other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (hooknum == NF_IP_LOCAL_OUT
			    && *var_ipp != orig_dstip
			    && !do_extra_mangle(*var_ipp, other_ipp)) {
				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
				       i, NIPQUAD(*var_ipp));
				/* Can't route?  This whole range part is
				 * probably screwed, but keep trying
				 * anyway. */
				continue;
			}
#endif

			/* Count how many others map onto this. */
			score = count_maps(tuple->src.ip, tuple->dst.ip,
					   tuple->dst.protonum, conntrack);
			if (score < best.score) {
				/* Optimization: doesn't get any better than
				   this. */
				if (score == 0)
					return (struct ip_nat_range *)
						&mr->range[i];

				best.score = score;
				best.tuple = *tuple;
				best.range = &mr->range[i];
			}
		}
	}
	*tuple = best.tuple;

	/* Discard const. */
	return (struct ip_nat_range *)best.range;
}

/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
			 const struct ip_nat_multi_range *mr,
			 const struct ip_conntrack *conntrack,
			 unsigned int hooknum)
{
	if (mr->rangesize != 1
	    || (mr->range[0].flags & IP_NAT_RANGE_FULL)
	    || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
		&& mr->range[0].min_ip != mr->range[0].max_ip))
		return find_best_ips_proto(tuple, mr, conntrack, hooknum);

	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
			tuple->src.ip = mr->range[0].min_ip;
		else {
			/* Only do extra mangle when required (breaks
			   socket binding) */
#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (tuple->dst.ip != mr->range[0].min_ip
			    && hooknum == NF_IP_LOCAL_OUT
			    && !do_extra_mangle(mr->range[0].min_ip,
						&tuple->src.ip))
				return NULL;
#endif
			tuple->dst.ip = mr->range[0].min_ip;
		}
	}

	/* Discard const. */
	return (struct ip_nat_range *)&mr->range[0];
}

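/* Pick a unique tuple for this connection within the given range:
   reuse an existing source mapping where possible, otherwise take the
   least-used IP and let the protocol choose a unique per-proto part.
   Returns 1 on success, 0 if the whole range is exhausted. */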
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
		 const struct ip_conntrack_tuple *orig_tuple,
		 const struct ip_nat_multi_range *mrr,
		 struct ip_conntrack *conntrack,
		 unsigned int hooknum)
{
	struct ip_nat_protocol *proto
		= find_nat_proto(orig_tuple->dst.protonum);
	struct ip_nat_range *rptr;
	unsigned int i;
	int ret;

	/* We temporarily use flags for marking full parts, but we
	   always clean up afterwards */
	struct ip_nat_multi_range *mr = (void *)mrr;

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	   and that same mapping gives a unique tuple within the given
	   range, use that.

	   This is only required for source (ie. NAT/masq) mappings.
	   So far, we don't do local source mappings, so multiple
	   manips are not an issue.  */
	if (hooknum == NF_IP_POST_ROUTING) {
		struct ip_conntrack_manip *manip;

		manip = find_appropriate_src(orig_tuple, mr);
		if (manip) {
			/* Apply same source manipulation. */
			*tuple = ((struct ip_conntrack_tuple)
				  { *manip, orig_tuple->dst });
			DEBUGP("get_unique_tuple: Found current src map\n");
			return 1;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given
	   range.
	*/
	*tuple = *orig_tuple;
	while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
	       != NULL) {
		DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
		/* 3) The per-protocol part of the manip is made to
		   map into the range to make a unique tuple. */

		/* Only bother mapping if it's not already in range
		   and unique */
		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
					&rptr->min, &rptr->max))
		    && !ip_nat_used_tuple(tuple, conntrack)) {
			ret = 1;
			goto clear_fulls;
		} else {
			if (proto->unique_tuple(tuple, rptr,
						HOOK2MANIP(hooknum),
						conntrack)) {
				/* Must be unique. */
				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
								conntrack));
				ret = 1;
				goto clear_fulls;
			} else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
				/* Try implicit source NAT; protocol
				   may be able to play with ports to
				   make it unique. */
				struct ip_nat_range r
					= { IP_NAT_RANGE_MAP_IPS,
					    tuple->src.ip, tuple->src.ip,
					    { 0 }, { 0 } };
				DEBUGP("Trying implicit mapping\n");
				if (proto->unique_tuple(tuple, &r,
							IP_NAT_MANIP_SRC,
							conntrack)) {
					/* Must be unique. */
					IP_NF_ASSERT(!ip_nat_used_tuple
						     (tuple, conntrack));
					ret = 1;
					goto clear_fulls;
				}
			}
			DEBUGP("Protocol can't get unique tuple %u.\n",
			       hooknum);
		}

		/* Eliminate that from range, and try again. */
		rptr->flags |= IP_NAT_RANGE_FULL;
		*tuple = *orig_tuple;
	}

	ret = 0;

 clear_fulls:
	/* Clear full flags. */
	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++)
		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

	return ret;
}

static inline int
helper_cmp(const struct ip_nat_helper *helper,
	   const struct ip_conntrack_tuple *tuple)
{
	return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
}

/* Where to manip the reply packets (will be reverse manip). */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};

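/* Set up NAT bindings for a new connection: choose a unique tuple,
   alter the conntrack's reply tuple to match, and record the manips
   to apply at each hook.  Called with ip_nat_lock held for writing;
   returns NF_ACCEPT, or NF_DROP if no mapping is possible. */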
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
		  const struct ip_nat_multi_range *mr,
		  unsigned int hooknum)
{
	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
	struct ip_conntrack_tuple orig_tp;
	struct ip_nat_info *info = &conntrack->nat.info;

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
		     || hooknum == NF_IP_POST_ROUTING
		     || hooknum == NF_IP_LOCAL_OUT);
	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

	/* What we've got will look like inverse of reply. Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp =
	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	invert_tuplepr(&orig_tp,
		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);


	do {
		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
				      hooknum)) {
			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
			       conntrack);
			return NF_DROP;
		}


		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
		   the original (A/B/C/D') and the mangled one (E/F/G/H').

		   We're only allowed to work with the SRC per-proto
		   part, so we create inverses of both to start, then
		   derive the other fields we need.  */

		/* Reply connection: simply invert the new tuple
		   (G/H/E/F') */
		invert_tuplepr(&reply, &new_tuple);

		/* Alter conntrack table so it recognizes replies.
		   If we fail this race (reply tuple now used), repeat. */
	} while (!ip_conntrack_alter_reply(conntrack, &reply));

	/* Create inverse of original: C/D/A/B' */
	invert_tuplepr(&inv_tuple, &orig_tp);

	/* Has source changed? */
	if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_SRC, new_tuple.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a destination manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_DST, orig_tp.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* Has destination changed? */
	if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a destination manip */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_DST, reply.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_SRC, inv_tuple.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* If there's a helper, assign it; based on new tuple. */
	if (!conntrack->master)
		info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
					 &reply);

	/* It's done. */
	info->initialized |= (1 << HOOK2MANIP(hooknum));
	return NF_ACCEPT;
}

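/* Re-hash a conntrack that is already in the NAT tables after its
   tuples have changed: unlink both hash entries and re-add them under
   the recomputed hash values.  Caller holds ip_nat_lock for writing. */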
void replace_in_hashes(struct ip_conntrack *conntrack,
		       struct ip_nat_info *info)
{
	/* Source has changed, so replace in hashes. */
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	IP_NF_ASSERT(info->bysource.conntrack == conntrack);
	MUST_BE_WRITE_LOCKED(&ip_nat_lock);

	list_del(&info->bysource.list);
	list_del(&info->byipsproto.list);

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

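/* Add a newly NATed conntrack to the bysource and byipsproto hash
   tables.  Caller must hold ip_nat_lock for writing. */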
void place_in_hashes(struct ip_conntrack *conntrack,
		     struct ip_nat_info *info)
{
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	IP_NF_ASSERT(!info->bysource.conntrack);

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	info->byipsproto.conntrack = conntrack;
	info->bysource.conntrack = conntrack;

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

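/* Rewrite one header according to a single manip: the protocol
   handler fixes up its own fields first, then the IP address and the
   IP header checksum are updated incrementally. */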
static void
manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
	  const struct ip_conntrack_manip *manip,
	  enum ip_nat_manip_type maniptype,
	  __u32 *nfcache)
{
	*nfcache |= NFC_ALTERED;
	find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);

	if (maniptype == IP_NAT_MANIP_SRC) {
		iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
						iph->check);
		iph->saddr = manip->ip;
	} else {
		iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
						iph->check);
		iph->daddr = manip->ip;
	}
}

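/* Does this expectation apply to the packet at hand?  Defer to the
   conntrack protocol's exp_matches_pkt() if it provides one;
   otherwise any pending expectation matches. */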
static inline int exp_for_packet(struct ip_conntrack_expect *exp,
			         struct sk_buff **pskb)
{
	struct ip_conntrack_protocol *proto;
	int ret = 1;

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	proto = __ip_ct_find_proto((*pskb)->nh.iph->protocol);
	if (proto->exp_matches_pkt)
		ret = proto->exp_matches_pkt(exp, pskb);

	return ret;
}

/* Do packet manipulations according to binding. */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	int is_tcp = (*pskb)->nh.iph->protocol == IPPROTO_TCP;

	/* Need nat lock to protect against modification, but neither
	   conntrack (referenced) nor helper (deleted with
	   synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		/* raw socket (tcpdump) may have clone of incoming
		   skb: don't disturb it --RR */
		if (skb_cloned(*pskb) && !(*pskb)->sk) {
			struct sk_buff *nskb = skb_copy(*pskb, GFP_ATOMIC);
			if (!nskb) {
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
			kfree_skb(*pskb);
			*pskb = nskb;
		}

		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			manip_pkt((*pskb)->nh.iph->protocol,
				  (*pskb)->nh.iph,
				  (*pskb)->len,
				  &info->manips[i].manip,
				  info->manips[i].maniptype,
				  &(*pskb)->nfcache);
		}
	}
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);

	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		list_for_each(cur_item, &ct->sibling_list) {
			exp = list_entry(cur_item, struct ip_conntrack_expect,
					 expected_list);

			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			if (exp_for_packet(exp, pskb)) {
				DEBUGP("calling nat helper (exp=%p) for packet\n",
					exp);
				ret = helper->help(ct, exp, info, ctinfo,
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
			}
		}
		/* Helper might want to manip the packet even when there is no expectation */
		if (!exp && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo,
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);

		/* Adjust sequence number only once per packet
		 * (helper is called at all hooks) */
		if (is_tcp && (hooknum == NF_IP_POST_ROUTING
			       || hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in a l4-proto specific function,
			 * and call this function here. */
			ip_nat_seq_adjust(*pskb, ct, ctinfo);
		}

		return ret;

	} else
		return NF_ACCEPT;

	/* not reached */
}

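/* An ICMP error carries the offending packet inside its payload.
   Translate both headers: the embedded packet gets the opposite
   manip (it was never direction-reversed), the outer IP header gets
   the normal one, and the ICMP checksum is recomputed afterwards. */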
unsigned int
icmp_reply_translation(struct sk_buff *skb,
		       struct ip_conntrack *conntrack,
		       unsigned int hooknum,
		       int dir)
{
	struct iphdr *iph = skb->nh.iph;
	struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
	struct iphdr *inner = (struct iphdr *)(hdr + 1);
	size_t datalen = skb->len - ((void *)inner - (void *)iph);
	unsigned int i;
	struct ip_nat_info *info = &conntrack->nat.info;

	IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
	/* Must be RELATED */
	IP_NF_ASSERT(skb->nfct - (struct ip_conntrack *)skb->nfct->master
		     == IP_CT_RELATED
		     || skb->nfct - (struct ip_conntrack *)skb->nfct->master
		     == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (hdr->type == ICMP_REDIRECT) {
		/* Don't care about races here. */
		if (info->initialized
		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
		    || info->num_manips != 0)
			return NF_DROP;
	}

	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
	       skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
	/* Note: May not be from a NAT'd host, but probably safest to
	   do translation always as if it came from the host itself
	   (even though a "host unreachable" coming from the host
	   itself is a bit weird).

	   More explanation: some people use NAT for anonymizing.
	   Also, CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */

	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
		       "ORIG" : "REPLY", info->manips[i].hooknum);

		if (info->manips[i].direction != dir)
			continue;

		/* Mapping the inner packet is just like a normal
		   packet, except it was never src/dst reversed, so
		   where we would normally apply a dst manip, we apply
		   a src, and vice versa. */
		if (info->manips[i].hooknum == opposite_hook[hooknum]) {
			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "DST" : "SRC",
			       NIPQUAD(info->manips[i].manip.ip),
			       ntohs(info->manips[i].manip.u.udp.port));
			manip_pkt(inner->protocol, inner,
				  skb->len - ((void *)inner - (void *)iph),
				  &info->manips[i].manip,
				  !info->manips[i].maniptype,
				  &skb->nfcache);
		/* Outer packet needs to have IP header NATed like
		   it's a reply. */
		} else if (info->manips[i].hooknum == hooknum) {
			/* Use mapping to map outer packet: 0 gives no
			   per-proto mapping */
			DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip));
			manip_pkt(0, iph, skb->len,
				  &info->manips[i].manip,
				  info->manips[i].maniptype,
				  &skb->nfcache);
		}
	}
	READ_UNLOCK(&ip_nat_lock);

	/* Since we mangled inside ICMP packet, recalculate its
	   checksum from scratch.  (Hence the handling of incorrect
	   checksums in conntrack, so we don't accidentally fix one.)  */
	hdr->checksum = 0;
	hdr->checksum = ip_compute_csum((unsigned char *)hdr,
					sizeof(*hdr) + datalen);

	return NF_ACCEPT;
}

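/* Module init: allocate the bysource/byipsproto hash tables (sized
   like the conntrack table), register the built-in TCP/UDP/ICMP
   protocol handlers, and hook our conntrack destruction callback. */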
int __init ip_nat_init(void)
{
	size_t i;

	/* Leave them the same for the moment. */
	ip_nat_htable_size = ip_conntrack_htable_size;

	/* One vmalloc for both hash tables */
	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
	if (!bysource) {
		return -ENOMEM;
	}
	byipsproto = bysource + ip_nat_htable_size;

	/* Sew in builtin protocols. */
	WRITE_LOCK(&ip_nat_lock);
	list_append(&protos, &ip_nat_protocol_tcp);
	list_append(&protos, &ip_nat_protocol_udp);
	list_append(&protos, &ip_nat_protocol_icmp);
	WRITE_UNLOCK(&ip_nat_lock);

	for (i = 0; i < ip_nat_htable_size; i++) {
		INIT_LIST_HEAD(&bysource[i]);
		INIT_LIST_HEAD(&byipsproto[i]);
	}

	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

	return 0;
}

/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(const struct ip_conntrack *i, void *data)
{
	memset((void *)&i->nat, 0, sizeof(i->nat));
	return 0;
}

/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
	ip_ct_selective_cleanup(&clean_nat, NULL);
	ip_conntrack_destroyed = NULL;
	vfree(bysource);
}