/* NAT for netfilter; shared with compatibility layer. */

/* (c) 1999 Paul `Rusty' Russell.  Licensed under the GNU General
   Public Licence. */
#ifdef MODULE
#define __NO_VERSION__
#endif
#include <linux/version.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/brlock.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */

#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define DEBUGP(format, args...)

DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
static struct list_head *byipsproto;
LIST_HEAD(protos);
LIST_HEAD(helpers);

extern struct ip_nat_protocol unknown_nat_protocol;

/* We keep extra hashes for each conntrack, for fast searching. */
static inline size_t
hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
{
        /* Modified src and dst, to ensure we don't create two
           identical streams. */
        return (src + dst + proto) % ip_nat_htable_size;
}

static inline size_t
hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
{
        /* Original src, to ensure we map it consistently if possible. */
        return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
}
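
/* Illustrative sketch, not part of the build: how the two hashes
   above are meant to be consulted.  bysource answers "have we already
   mapped this original source?" (see find_appropriate_src() below);
   byipsproto indexes mappings by outgoing src/dst/proto triple (see
   count_maps() below).  The helper here is hypothetical. */
#if 0
static struct list_head *
example_src_chain(const struct ip_conntrack_tuple *t)
{
        /* Chain of conntracks that share t's original source. */
        return &bysource[hash_by_src(&t->src, t->dst.protonum)];
}
#endif
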
/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
        struct ip_nat_info *info = &conn->nat.info;

        if (!info->initialized)
                return;

        IP_NF_ASSERT(info->bysource.conntrack);
        IP_NF_ASSERT(info->byipsproto.conntrack);

        WRITE_LOCK(&ip_nat_lock);
        LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
                                          .tuple.src,
                                          conn->tuplehash[IP_CT_DIR_ORIGINAL]
                                          .tuple.dst.protonum)],
                    &info->bysource);

        LIST_DELETE(&byipsproto
                    [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
                                      .tuple.src.ip,
                                      conn->tuplehash[IP_CT_DIR_REPLY]
                                      .tuple.dst.ip,
                                      conn->tuplehash[IP_CT_DIR_REPLY]
                                      .tuple.dst.protonum)],
                    &info->byipsproto);
        WRITE_UNLOCK(&ip_nat_lock);
}

/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (e.g. ICMP dest
 * unreachables). */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
        u_int32_t diffs[] = { oldvalinv, newval };
        return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
                                      oldcheck^0xFFFF));
}
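
/* Illustrative sketch, not part of the build: ip_nat_cheat_check()
   is an incremental checksum update (in the style of RFC 1624): feed
   it the one's complement of the old 32-bit value, the new value, and
   the old checksum.  This is exactly the pattern manip_pkt() below
   uses to fix the IP header checksum after rewriting an address; the
   hypothetical helper just isolates it. */
#if 0
static void example_rewrite_saddr(struct iphdr *iph, u_int32_t new_ip)
{
        iph->check = ip_nat_cheat_check(~iph->saddr, new_ip, iph->check);
        iph->saddr = new_ip;
}
#endif
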
static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
{
        return i->protonum == proto;
}

struct ip_nat_protocol *
find_nat_proto(u_int16_t protonum)
{
        struct ip_nat_protocol *i;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
        if (!i)
                i = &unknown_nat_protocol;
        return i;
}

/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
                  const struct ip_conntrack *ignored_conntrack)
{
        /* Conntrack doesn't keep track of outgoing tuples; only
           incoming ones.  NAT means they don't have a fixed mapping,
           so we invert the tuple and look for the incoming reply.

           We could keep a separate hash if this proves too slow. */
        struct ip_conntrack_tuple reply;

        invert_tuplepr(&reply, tuple);
        return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}

/* Does the tuple, with the given source manip, fall within the range mr? */
static int
in_range(const struct ip_conntrack_tuple *tuple,
         const struct ip_conntrack_manip *manip,
         const struct ip_nat_multi_range *mr)
{
        struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
        unsigned int i;
        struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };

        for (i = 0; i < mr->rangesize; i++) {
                /* If we are allowed to map IPs, then we must be in the
                   range specified, otherwise we must be unchanged. */
                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
                            || (ntohl(newtuple.src.ip)
                                > ntohl(mr->range[i].max_ip)))
                                continue;
                } else {
                        if (newtuple.src.ip != tuple->src.ip)
                                continue;
                }

                /* If no per-proto range was given, any proto part is
                   acceptable; otherwise ask the protocol. */
                if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                    || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
                                       &mr->range[i].min, &mr->range[i].max))
                        return 1;
        }
        return 0;
}

static inline int
src_cmp(const struct ip_nat_hash *i,
        const struct ip_conntrack_tuple *tuple,
        const struct ip_nat_multi_range *mr)
{
        return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
                == tuple->dst.protonum
                && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
                == tuple->src.ip
                && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
                == tuple->src.u.all
                && in_range(tuple,
                            &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                            .tuple.src,
                            mr));
}

/* Only called for SRC manip */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
                     const struct ip_nat_multi_range *mr)
{
        unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
        struct ip_nat_hash *i;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
        if (i)
                return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
        else
                return NULL;
}

#ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
        struct rtable *rt;

        if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
                DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
                       NIPQUAD(var_ip));
                return 0;
        }

        *other_ipp = rt->rt_src;
        ip_rt_put(rt);
        return 1;
}
#endif
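
/* Illustrative example, not part of the build: when a locally
   generated packet is DNAT'ed at LOCAL_OUT to another host, its
   source address (typically a loopback address) must be rewritten
   too, or replies would never come back.  do_extra_mangle() picks the
   source the routing table would use for the new destination.  The
   function and variable names here are hypothetical. */
#if 0
static void example_local_dnat(struct ip_conntrack_tuple *tuple,
                               u_int32_t new_dst_ip)
{
        u_int32_t new_src;

        /* e.g. 127.0.0.1 -> 127.0.0.1 DNAT'ed to 192.168.1.10: the
           source must become the address of the outgoing interface. */
        if (do_extra_mangle(new_dst_ip, &new_src))
                tuple->src.ip = new_src;
        tuple->dst.ip = new_dst_ip;
}
#endif
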
/* Simple way to iterate through all. */
static inline int fake_cmp(const struct ip_nat_hash *i,
                           u_int32_t src, u_int32_t dst, u_int16_t protonum,
                           unsigned int *score,
                           const struct ip_conntrack *conntrack)
{
        /* Compare backwards: we're dealing with OUTGOING tuples, and
           inside the conntrack is the REPLY tuple.  Don't count this
           conntrack. */
        if (i->conntrack != conntrack
            && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
            && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
            && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
                == protonum))
                (*score)++;
        return 0;
}

static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
           const struct ip_conntrack *conntrack)
{
        unsigned int score = 0;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
                  fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
                  conntrack);

        return score;
}

/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (e.g. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have. */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
                    const struct ip_nat_multi_range *mr,
                    const struct ip_conntrack *conntrack,
                    unsigned int hooknum)
{
        unsigned int i;
        struct {
                const struct ip_nat_range *range;
                unsigned int score;
                struct ip_conntrack_tuple tuple;
        } best = { NULL, 0xFFFFFFFF };
        u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
        static unsigned int randomness = 0;

        if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
                var_ipp = &tuple->src.ip;
                saved_ip = tuple->dst.ip;
                other_ipp = &tuple->dst.ip;
        } else {
                var_ipp = &tuple->dst.ip;
                saved_ip = tuple->src.ip;
                other_ipp = &tuple->src.ip;
        }
        /* Don't call do_extra_mangle unless necessary (it overrides
           explicit socket bindings, for example). */
        orig_dstip = tuple->dst.ip;

        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++) {
                /* Host order */
                u_int32_t minip, maxip, j;

                /* Don't do ranges which are already eliminated. */
                if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
                        continue;
                }

                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        minip = ntohl(mr->range[i].min_ip);
                        maxip = ntohl(mr->range[i].max_ip);
                } else
                        minip = maxip = ntohl(*var_ipp);

                randomness++;
                for (j = 0; j < maxip - minip + 1; j++) {
                        unsigned int score;

                        *var_ipp = htonl(minip + (randomness + j)
                                         % (maxip - minip + 1));

                        /* Reset the other ip in case it was mangled by
                         * do_extra_mangle last time. */
                        *other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
                        if (hooknum == NF_IP_LOCAL_OUT
                            && *var_ipp != orig_dstip
                            && !do_extra_mangle(*var_ipp, other_ipp)) {
                                DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
                                       i, NIPQUAD(*var_ipp));
                                /* Can't route?  This whole range part is
                                 * probably screwed, but keep trying
                                 * anyway. */
                                continue;
                        }
#endif

                        /* Count how many others map onto this. */
                        score = count_maps(tuple->src.ip, tuple->dst.ip,
                                           tuple->dst.protonum, conntrack);
                        if (score < best.score) {
                                /* Optimization: doesn't get any better
                                   than this. */
                                if (score == 0)
                                        return (struct ip_nat_range *)
                                                &mr->range[i];

                                best.score = score;
                                best.tuple = *tuple;
                                best.range = &mr->range[i];
                        }
                }
        }
        *tuple = best.tuple;

        /* Discard const. */
        return (struct ip_nat_range *)best.range;
}

/* Fast version doesn't iterate through hash chains, but only handles
   the common case of a single IP address (null NAT, masquerade). */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
                         const struct ip_nat_multi_range *mr,
                         const struct ip_conntrack *conntrack,
                         unsigned int hooknum)
{
        if (mr->rangesize != 1
            || (mr->range[0].flags & IP_NAT_RANGE_FULL)
            || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
                && mr->range[0].min_ip != mr->range[0].max_ip))
                return find_best_ips_proto(tuple, mr, conntrack, hooknum);

        if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
                if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
                        tuple->src.ip = mr->range[0].min_ip;
                else {
                        /* Only do extra mangle when required (breaks
                           socket binding). */
#ifdef CONFIG_IP_NF_NAT_LOCAL
                        if (tuple->dst.ip != mr->range[0].min_ip
                            && hooknum == NF_IP_LOCAL_OUT
                            && !do_extra_mangle(mr->range[0].min_ip,
                                                &tuple->src.ip))
                                return NULL;
#endif
                        tuple->dst.ip = mr->range[0].min_ip;
                }
        }

        /* Discard const. */
        return (struct ip_nat_range *)&mr->range[0];
}
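
/* Illustrative sketch, not part of the build: the shape of a
   multi-range as the selection code above sees it.  A hypothetical
   SNAT range covering 1.2.3.4-1.2.3.5, all ports; with two addresses
   it falls through to the slow path, which picks whichever address
   currently carries the fewest src/dst/proto mappings. */
#if 0
static struct ip_nat_multi_range example_mr = {
        1, { { IP_NAT_RANGE_MAP_IPS,
               __constant_htonl(0x01020304),    /* 1.2.3.4 */
               __constant_htonl(0x01020305),    /* 1.2.3.5 */
               { 0 }, { 0 } } }
};
#endif
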
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
                 const struct ip_conntrack_tuple *orig_tuple,
                 const struct ip_nat_multi_range *mrr,
                 struct ip_conntrack *conntrack,
                 unsigned int hooknum)
{
        struct ip_nat_protocol *proto
                = find_nat_proto(orig_tuple->dst.protonum);
        struct ip_nat_range *rptr;
        unsigned int i;
        int ret;

        /* We temporarily use flags for marking full parts, but we
           always clean up afterwards. */
        struct ip_nat_multi_range *mr = (void *)mrr;

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
           and that same mapping gives a unique tuple within the given
           range, use that.

           This is only required for source (i.e. NAT/masq) mappings.
           So far, we don't do local source mappings, so multiple
           manips are not an issue. */
        if (hooknum == NF_IP_POST_ROUTING) {
                struct ip_conntrack_manip *manip;

                manip = find_appropriate_src(orig_tuple, mr);
                if (manip) {
                        /* Apply the same source manipulation. */
                        *tuple = ((struct ip_conntrack_tuple)
                                  { *manip, orig_tuple->dst });
                        DEBUGP("get_unique_tuple: Found current src map\n");
                        return 1;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given
           range. */
        *tuple = *orig_tuple;
        while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
               != NULL) {
                DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
                /* 3) The per-protocol part of the manip is made to
                   map into the range to make a unique tuple. */

                /* Only bother mapping if it's not already in range
                   and unique. */
                if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                     || proto->in_range(tuple, HOOK2MANIP(hooknum),
                                        &rptr->min, &rptr->max))
                    && !ip_nat_used_tuple(tuple, conntrack)) {
                        ret = 1;
                        goto clear_fulls;
                } else {
                        if (proto->unique_tuple(tuple, rptr,
                                                HOOK2MANIP(hooknum),
                                                conntrack)) {
                                /* Must be unique. */
                                IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
                                                                conntrack));
                                ret = 1;
                                goto clear_fulls;
                        } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
                                /* Try implicit source NAT; protocol
                                   may be able to play with ports to
                                   make it unique. */
                                struct ip_nat_range r
                                        = { IP_NAT_RANGE_MAP_IPS,
                                            tuple->src.ip, tuple->src.ip,
                                            { 0 }, { 0 } };
                                DEBUGP("Trying implicit mapping\n");
                                if (proto->unique_tuple(tuple, &r,
                                                        IP_NAT_MANIP_SRC,
                                                        conntrack)) {
                                        /* Must be unique. */
                                        IP_NF_ASSERT(!ip_nat_used_tuple
                                                     (tuple, conntrack));
                                        ret = 1;
                                        goto clear_fulls;
                                }
                        }
                        DEBUGP("Protocol can't get unique tuple %u.\n",
                               hooknum);
                }

                /* Eliminate that from the range, and try again. */
                rptr->flags |= IP_NAT_RANGE_FULL;
                *tuple = *orig_tuple;
        }

        ret = 0;

 clear_fulls:
        /* Clear full flags. */
        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++)
                mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

        return ret;
}
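
/* Worked example of get_unique_tuple() above (all values are
   illustrative assumptions): masquerading 10.0.0.5:1024 -> 1.2.3.4:80
   at POST_ROUTING onto 5.6.7.8:
     1) if 10.0.0.5 was already mapped (say to 5.6.7.8:61000), reuse
        that source manip;
     2) otherwise pick the least-used IP in the range (here 5.6.7.8);
     3) if the resulting tuple is taken, let the TCP protocol module
        pick a source port that makes 5.6.7.8:port -> 1.2.3.4:80
        unique.  Only when every range is exhausted do we return 0. */
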
static inline int
helper_cmp(const struct ip_nat_helper *helper,
           const struct ip_conntrack_tuple *tuple)
{
        return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
}

/* Where to manip the reply packets (will be reverse manip). */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};

unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
                  const struct ip_nat_multi_range *mr,
                  unsigned int hooknum)
{
        struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
        struct ip_conntrack_tuple orig_tp;
        struct ip_nat_info *info = &conntrack->nat.info;

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
                     || hooknum == NF_IP_POST_ROUTING
                     || hooknum == NF_IP_LOCAL_OUT);
        IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

        /* What we've got will look like the inverse of the reply.
           Normally this is what is in the conntrack, except for prior
           manipulations (future optimization: if num_manips == 0,
           orig_tp = conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple). */
        invert_tuplepr(&orig_tp,
                       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

        do {
                if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
                                      hooknum)) {
                        DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
                               conntrack);
                        return NF_DROP;
                }

                /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
                   the original (A/B/C/D') and the mangled one (E/F/G/H').

                   We're only allowed to work with the SRC per-proto
                   part, so we create inverses of both to start, then
                   derive the other fields we need. */

                /* Reply connection: simply invert the new tuple
                   (G/H/E/F') */
                invert_tuplepr(&reply, &new_tuple);

                /* Alter conntrack table so it recognizes replies.
                   If we lose this race (reply tuple now used), repeat. */
        } while (!ip_conntrack_alter_reply(conntrack, &reply));

        /* Create inverse of original: C/D/A/B' */
        invert_tuplepr(&inv_tuple, &orig_tp);

        /* Has the source changed? */
        if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_SRC, new_tuple.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a destination manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_DST, orig_tp.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* Has the destination changed? */
        if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a destination manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_DST, reply.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_SRC, inv_tuple.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* If there's a helper, assign it, based on the new tuple. */
        if (!conntrack->master)
                info->helper = LIST_FIND(&helpers, helper_cmp,
                                         struct ip_nat_helper *, &reply);

        /* It's done. */
        info->initialized |= (1 << HOOK2MANIP(hooknum));
        return NF_ACCEPT;
}
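
/* Worked example of the tuple algebra above (illustrative values):
   source NAT of 10.0.0.1:1024 -> 1.2.3.4:80 onto 5.6.7.8:
     orig_tp   (A/B/C/D'): 10.0.0.1:1024 -> 1.2.3.4:80
     new_tuple (E/F/G/H'): 5.6.7.8:61000 -> 1.2.3.4:80
     reply     (G/H/E/F'): 1.2.3.4:80    -> 5.6.7.8:61000
     inv_tuple (C/D/A/B'): 1.2.3.4:80    -> 10.0.0.1:1024
   The ORIGINAL-direction manip rewrites the source to E/F at this
   hook; the REPLY-direction manip rewrites the destination back to
   A/B (orig_tp.src) at the opposite hook. */
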
void replace_in_hashes(struct ip_conntrack *conntrack,
                       struct ip_nat_info *info)
{
        /* Source has changed, so replace in hashes. */
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place the packet as seen OUTGOING in the byipsproto hash
           (i.e. reverse dst and src of the reply packet). */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        IP_NF_ASSERT(info->bysource.conntrack == conntrack);
        MUST_BE_WRITE_LOCKED(&ip_nat_lock);

        list_del(&info->bysource.list);
        list_del(&info->byipsproto.list);

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

void place_in_hashes(struct ip_conntrack *conntrack,
                     struct ip_nat_info *info)
{
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place the packet as seen OUTGOING in the byipsproto hash
           (i.e. reverse dst and src of the reply packet). */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        IP_NF_ASSERT(!info->bysource.conntrack);

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        info->byipsproto.conntrack = conntrack;
        info->bysource.conntrack = conntrack;

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

static void
manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
          const struct ip_conntrack_manip *manip,
          enum ip_nat_manip_type maniptype,
          __u32 *nfcache)
{
        *nfcache |= NFC_ALTERED;
        find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);

        if (maniptype == IP_NAT_MANIP_SRC) {
                iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
                                                iph->check);
                iph->saddr = manip->ip;
        } else {
                iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
                                                iph->check);
                iph->daddr = manip->ip;
        }
}
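
/* Illustrative sketch, not part of the build: a single manip applied
   by hand.  The per-proto handler rewrites the port and its checksum
   first; manip_pkt() then rewrites the address and incrementally
   fixes the IP header checksum via ip_nat_cheat_check().  The manip
   value here is a hypothetical SNAT target. */
#if 0
static void example_snat_packet(struct sk_buff *skb)
{
        struct ip_conntrack_manip m;

        m.ip = __constant_htonl(0x05060708);    /* 5.6.7.8 */
        m.u.tcp.port = __constant_htons(61000);
        manip_pkt(IPPROTO_TCP, skb->nh.iph, skb->len,
                  &m, IP_NAT_MANIP_SRC, &skb->nfcache);
}
#endif
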
static inline int exp_for_packet(struct ip_conntrack_expect *exp,
                                 struct sk_buff **pskb)
{
        struct ip_conntrack_protocol *proto;
        int ret = 1;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        proto = __ip_ct_find_proto((*pskb)->nh.iph->protocol);
        if (proto->exp_matches_pkt)
                ret = proto->exp_matches_pkt(exp, pskb);

        return ret;
}

/* Do packet manipulations according to binding. */
unsigned int
do_bindings(struct ip_conntrack *ct,
            enum ip_conntrack_info ctinfo,
            struct ip_nat_info *info,
            unsigned int hooknum,
            struct sk_buff **pskb)
{
        unsigned int i;
        struct ip_nat_helper *helper;
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        int is_tcp = (*pskb)->nh.iph->protocol == IPPROTO_TCP;

        /* Need the nat lock to protect against modification, but
           neither the conntrack (referenced) nor the helper (deleted
           with synchronize_bh()) can vanish. */
        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                /* raw socket (tcpdump) may have clone of incoming
                   skb: don't disturb it --RR */
                if (skb_cloned(*pskb) && !(*pskb)->sk) {
                        struct sk_buff *nskb = skb_copy(*pskb, GFP_ATOMIC);
                        if (!nskb) {
                                READ_UNLOCK(&ip_nat_lock);
                                return NF_DROP;
                        }
                        kfree_skb(*pskb);
                        *pskb = nskb;
                }

                if (info->manips[i].direction == dir
                    && info->manips[i].hooknum == hooknum) {
                        DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
                               *pskb,
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "SRC" : "DST",
                               NIPQUAD(info->manips[i].manip.ip),
                               ntohs(info->manips[i].manip.u.all));
                        manip_pkt((*pskb)->nh.iph->protocol,
                                  (*pskb)->nh.iph,
                                  (*pskb)->len,
                                  &info->manips[i].manip,
                                  info->manips[i].maniptype,
                                  &(*pskb)->nfcache);
                }
        }
        helper = info->helper;
        READ_UNLOCK(&ip_nat_lock);

        if (helper) {
                struct ip_conntrack_expect *exp = NULL;
                struct list_head *cur_item;
                int ret = NF_ACCEPT;

                DEBUGP("do_bindings: helper exists for (%p)\n", ct);

                /* Always defragged for helpers */
                IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
                               & htons(IP_MF|IP_OFFSET)));

                /* Have to grab read lock before sibling_list traversal */
                READ_LOCK(&ip_conntrack_lock);
                list_for_each(cur_item, &ct->sibling_list) {
                        exp = list_entry(cur_item, struct ip_conntrack_expect,
                                         expected_list);

                        /* If this expectation is already established, skip */
                        if (exp->sibling)
                                continue;

                        if (exp_for_packet(exp, pskb)) {
                                DEBUGP("calling nat helper (exp=%p) for packet\n",
                                       exp);
                                ret = helper->help(ct, exp, info, ctinfo,
                                                   hooknum, pskb);
                                if (ret != NF_ACCEPT) {
                                        READ_UNLOCK(&ip_conntrack_lock);
                                        return ret;
                                }
                        }
                }
                /* Helper might want to manip the packet even when there
                   is no expectation */
                if (!exp && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
                        DEBUGP("calling nat helper for packet without expectation\n");
                        ret = helper->help(ct, NULL, info, ctinfo,
                                           hooknum, pskb);
                        if (ret != NF_ACCEPT) {
                                READ_UNLOCK(&ip_conntrack_lock);
                                return ret;
                        }
                }
                READ_UNLOCK(&ip_conntrack_lock);

                /* Adjust sequence number only once per packet
                 * (helper is called at all hooks) */
                if (is_tcp && (hooknum == NF_IP_POST_ROUTING
                               || hooknum == NF_IP_LOCAL_IN)) {
                        DEBUGP("ip_nat_core: adjusting sequence number\n");
                        /* future: put this in a l4-proto specific function,
                         * and call this function here. */
                        ip_nat_seq_adjust(*pskb, ct, ctinfo);
                }

                return ret;

        } else
                return NF_ACCEPT;

        /* not reached */
}
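
/* Illustrative sketch, not part of the build: the shape of a typical
   caller.  The real netfilter hooks live in ip_nat_standalone.c; this
   hypothetical hook body only shows the division of labour:
   ip_nat_setup_info() runs once per connection to decide the binding,
   do_bindings() then mangles every packet of that connection. */
#if 0
static unsigned int example_hook(struct ip_conntrack *ct,
                                 enum ip_conntrack_info ctinfo,
                                 unsigned int hooknum,
                                 struct sk_buff **pskb)
{
        struct ip_nat_info *info = &ct->nat.info;

        return do_bindings(ct, ctinfo, info, hooknum, pskb);
}
#endif
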
unsigned int
icmp_reply_translation(struct sk_buff *skb,
                       struct ip_conntrack *conntrack,
                       unsigned int hooknum,
                       int dir)
{
        struct iphdr *iph = skb->nh.iph;
        struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
        struct iphdr *inner = (struct iphdr *)(hdr + 1);
        size_t datalen = skb->len - ((void *)inner - (void *)iph);
        unsigned int i;
        struct ip_nat_info *info = &conntrack->nat.info;

        IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
        /* Must be RELATED */
        IP_NF_ASSERT(skb->nfct
                     - ((struct ip_conntrack *)skb->nfct->master)->infos
                     == IP_CT_RELATED
                     || skb->nfct
                     - ((struct ip_conntrack *)skb->nfct->master)->infos
                     == IP_CT_RELATED+IP_CT_IS_REPLY);

        /* Redirects on non-null nats must be dropped, else they'll
           start talking to each other without our translation, and be
           confused... --RR */
        if (hdr->type == ICMP_REDIRECT) {
                /* Don't care about races here. */
                if (info->initialized
                    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
                    || info->num_manips != 0)
                        return NF_DROP;
        }

        DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
               skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
        /* Note: May not be from a NAT'd host, but probably safest to
           do translation always as if it came from the host itself
           (even though a "host unreachable" coming from the host
           itself is a bit weird).

           More explanation: some people use NAT for anonymizing.
           Also, CERT recommends dropping all packets from private IP
           addresses (although ICMP errors from internal links with
           such addresses are not too uncommon, as Alan Cox points
           out). */

        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
                       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
                       "ORIG" : "REPLY", info->manips[i].hooknum);

                if (info->manips[i].direction != dir)
                        continue;

                /* Mapping the inner packet is just like a normal
                   packet, except it was never src/dst reversed, so
                   where we would normally apply a dst manip, we apply
                   a src, and vice versa. */
                if (info->manips[i].hooknum == opposite_hook[hooknum]) {
                        DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "DST" : "SRC",
                               NIPQUAD(info->manips[i].manip.ip),
                               ntohs(info->manips[i].manip.u.udp.port));
                        manip_pkt(inner->protocol, inner,
                                  skb->len - ((void *)inner - (void *)iph),
                                  &info->manips[i].manip,
                                  !info->manips[i].maniptype,
                                  &skb->nfcache);
                /* Outer packet needs to have its IP header NATed like
                   it's a reply. */
                } else if (info->manips[i].hooknum == hooknum) {
                        /* Use mapping to map outer packet: 0 gives no
                           per-proto mapping */
                        DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "SRC" : "DST",
                               NIPQUAD(info->manips[i].manip.ip));
                        manip_pkt(0, iph, skb->len,
                                  &info->manips[i].manip,
                                  info->manips[i].maniptype,
                                  &skb->nfcache);
                }
        }
        READ_UNLOCK(&ip_nat_lock);

        /* Since we mangled inside the ICMP packet, recalculate its
           checksum from scratch.  (Hence the handling of incorrect
           checksums in conntrack, so we don't accidentally fix one.) */
        hdr->checksum = 0;
        hdr->checksum = ip_compute_csum((unsigned char *)hdr,
                                        sizeof(*hdr) + datalen);

        return NF_ACCEPT;
}
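
/* Worked example of the reversal above (illustrative values): host
   10.0.0.1 is SNAT'ed to 5.6.7.8, and an intermediate router sends
   back "host unreachable".  The error arrives addressed to 5.6.7.8
   and embeds the offending packet 5.6.7.8 -> 1.2.3.4.  The outer
   header gets the normal reply treatment (dst 5.6.7.8 -> 10.0.0.1),
   while the embedded packet, which was never direction-reversed, has
   its *source* rewritten back (5.6.7.8 -> 10.0.0.1) on the other
   traversal hook; hence the !maniptype flip when manip_pkt() is
   called on the inner header. */
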
int __init ip_nat_init(void)
{
        size_t i;

        /* Leave them the same for the moment. */
        ip_nat_htable_size = ip_conntrack_htable_size;

        /* One vmalloc for both hash tables */
        bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
        if (!bysource) {
                return -ENOMEM;
        }
        byipsproto = bysource + ip_nat_htable_size;

        /* Sew in builtin protocols. */
        WRITE_LOCK(&ip_nat_lock);
        list_append(&protos, &ip_nat_protocol_tcp);
        list_append(&protos, &ip_nat_protocol_udp);
        list_append(&protos, &ip_nat_protocol_icmp);
        WRITE_UNLOCK(&ip_nat_lock);

        for (i = 0; i < ip_nat_htable_size; i++) {
                INIT_LIST_HEAD(&bysource[i]);
                INIT_LIST_HEAD(&byipsproto[i]);
        }

        IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
        ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

        return 0;
}

/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(const struct ip_conntrack *i, void *data)
{
        memset((void *)&i->nat, 0, sizeof(i->nat));
        return 0;
}

/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
        ip_ct_selective_cleanup(&clean_nat, NULL);
        ip_conntrack_destroyed = NULL;
        vfree(bysource);
}
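
/* Illustrative sketch, not part of the build: the expected pairing of
   the two entry points above.  The real caller is
   ip_nat_standalone.c:init_or_cleanup(), per the comment on
   ip_nat_cleanup(); this hypothetical version shows only the
   init/cleanup discipline. */
#if 0
static int init_or_cleanup(int init)
{
        int ret;

        if (!init)
                goto cleanup;

        ret = ip_nat_init();
        if (ret < 0)
                return ret;
        return 0;

 cleanup:
        ip_nat_cleanup();
        return 0;
}
#endif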