ip_state.c revision 57096
1/* 2 * Copyright (C) 1995-1998 by Darren Reed. 3 * 4 * Redistribution and use in source and binary forms are permitted 5 * provided that this notice is preserved and due credit is given 6 * to the original author and the contributors. 7 */ 8#if !defined(lint) 9static const char sccsid[] = "@(#)ip_state.c 1.8 6/5/96 (C) 1993-1995 Darren Reed"; 10static const char rcsid[] = "@(#)$Id: ip_state.c,v 2.3.2.18 2000/01/27 08:51:30 darrenr Exp $"; 11#endif 12 13#include <sys/errno.h> 14#include <sys/types.h> 15#include <sys/param.h> 16#include <sys/file.h> 17#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ 18 defined(_KERNEL) 19# include "opt_ipfilter_log.h" 20#endif 21#if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__) 22# include <stdio.h> 23# include <stdlib.h> 24# include <string.h> 25#else 26# ifdef linux 27# include <linux/kernel.h> 28# include <linux/module.h> 29# endif 30#endif 31#if defined(_KERNEL) && (__FreeBSD_version >= 220000) 32# include <sys/filio.h> 33# include <sys/fcntl.h> 34# if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM) 35# include "opt_ipfilter.h" 36# endif 37#else 38# include <sys/ioctl.h> 39#endif 40#include <sys/time.h> 41#include <sys/uio.h> 42#ifndef linux 43# include <sys/protosw.h> 44#endif 45#include <sys/socket.h> 46#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) 47# include <sys/systm.h> 48#endif 49#if !defined(__SVR4) && !defined(__svr4__) 50# ifndef linux 51# include <sys/mbuf.h> 52# endif 53#else 54# include <sys/filio.h> 55# include <sys/byteorder.h> 56# ifdef _KERNEL 57# include <sys/dditypes.h> 58# endif 59# include <sys/stream.h> 60# include <sys/kmem.h> 61#endif 62 63#include <net/if.h> 64#ifdef sun 65# include <net/af.h> 66#endif 67#include <net/route.h> 68#include <netinet/in.h> 69#include <netinet/in_systm.h> 70#include <netinet/ip.h> 71#include <netinet/tcp.h> 72#ifndef linux 73# include <netinet/ip_var.h> 74# include <netinet/tcp_fsm.h> 75#endif 76#include <netinet/udp.h> 77#include <netinet/ip_icmp.h> 78#include "netinet/ip_compat.h" 79#include <netinet/tcpip.h> 80#include "netinet/ip_fil.h" 81#include "netinet/ip_nat.h" 82#include "netinet/ip_frag.h" 83#include "netinet/ip_proxy.h" 84#include "netinet/ip_state.h" 85#if (__FreeBSD_version >= 300000) 86# include <sys/malloc.h> 87# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) 88# include <sys/libkern.h> 89# include <sys/systm.h> 90# endif 91#endif 92 93#ifndef MIN 94# define MIN(a,b) (((a)<(b))?(a):(b)) 95#endif 96 97#define TCP_CLOSE (TH_FIN|TH_RST) 98 99ipstate_t **ips_table = NULL; 100int ips_num = 0; 101ips_stat_t ips_stats; 102#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) 103extern KRWLOCK_T ipf_state, ipf_mutex; 104extern kmutex_t ipf_rw; 105#endif 106 107static int fr_matchsrcdst __P((ipstate_t *, struct in_addr, struct in_addr, 108 fr_info_t *, tcphdr_t *)); 109static frentry_t *fr_checkicmpmatchingstate __P((ip_t *, fr_info_t *)); 110static int fr_state_flush __P((int)); 111static ips_stat_t *fr_statetstats __P((void)); 112static void fr_delstate __P((ipstate_t *)); 113 114 115#define FIVE_DAYS (2 * 5 * 86400) /* 5 days: half closed session */ 116 117#define TCP_MSL 240 /* 2 minutes */ 118u_long fr_tcpidletimeout = FIVE_DAYS, 119 fr_tcpclosewait = 2 * TCP_MSL, 120 fr_tcplastack = 2 * TCP_MSL, 121 fr_tcptimeout = 2 * TCP_MSL, 122 fr_tcpclosed = 1, 123 fr_udptimeout = 240, 124 fr_icmptimeout = 120; 125int fr_statemax = IPSTATE_MAX, 126 fr_statesize = IPSTATE_SIZE; 127int fr_state_doflush = 0; 128 129 130int fr_stateinit() 131{ 132 KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *)); 133 if (ips_table != NULL) 134 bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *)); 135 else 136 return -1; 137 return 0; 138} 139 140 141static ips_stat_t *fr_statetstats() 142{ 143 ips_stats.iss_active = ips_num; 144 ips_stats.iss_table = ips_table; 145 return &ips_stats; 146} 147 148 149/* 150 * flush state tables. two actions currently defined: 151 * which == 0 : flush all state table entries 152 * which == 1 : flush TCP connections which have started to close but are 153 * stuck for some reason. 154 */ 155static int fr_state_flush(which) 156int which; 157{ 158 register int i; 159 register ipstate_t *is, **isp; 160#if defined(_KERNEL) && !SOLARIS 161 int s; 162#endif 163 int delete, removed = 0; 164 165 SPL_NET(s); 166 WRITE_ENTER(&ipf_state); 167 for (i = fr_statesize - 1; i >= 0; i--) 168 for (isp = &ips_table[i]; (is = *isp); ) { 169 delete = 0; 170 171 switch (which) 172 { 173 case 0 : 174 delete = 1; 175 break; 176 case 1 : 177 if (is->is_p != IPPROTO_TCP) 178 break; 179 if ((is->is_state[0] != TCPS_ESTABLISHED) || 180 (is->is_state[1] != TCPS_ESTABLISHED)) 181 delete = 1; 182 break; 183 } 184 185 if (delete) { 186 *isp = is->is_next; 187 if (is->is_p == IPPROTO_TCP) 188 ips_stats.iss_fin++; 189 else 190 ips_stats.iss_expire++; 191 if (ips_table[i] == NULL) 192 ips_stats.iss_inuse--; 193#ifdef IPFILTER_LOG 194 ipstate_log(is, ISL_FLUSH); 195#endif 196 fr_delstate(is); 197 ips_num--; 198 removed++; 199 } else 200 isp = &is->is_next; 201 } 202 RWLOCK_EXIT(&ipf_state); 203 SPL_X(s); 204 return removed; 205} 206 207 208int fr_state_ioctl(data, cmd, mode) 209caddr_t data; 210#if defined(__NetBSD__) || defined(__OpenBSD__) 211u_long cmd; 212#else 213int cmd; 214#endif 215int mode; 216{ 217 int arg, ret, error = 0; 218 219 switch (cmd) 220 { 221 case SIOCIPFFL : 222 IRCOPY(data, (caddr_t)&arg, sizeof(arg)); 223 if (arg == 0 || arg == 1) { 224 ret = fr_state_flush(arg); 225 IWCOPY((caddr_t)&ret, data, sizeof(ret)); 226 } else 227 error = EINVAL; 228 break; 229#ifdef IPFILTER_LOG 230 case SIOCIPFFB : 231 if (!(mode & FWRITE)) 232 error = EPERM; 233 else 234 *(int *)data = ipflog_clear(IPL_LOGSTATE); 235 break; 236#endif 237 case SIOCGIPST : 238 IWCOPY((caddr_t)fr_statetstats(), data, sizeof(ips_stat_t)); 239 break; 240 case FIONREAD : 241#ifdef IPFILTER_LOG 242 IWCOPY((caddr_t)&iplused[IPL_LOGSTATE], (caddr_t)data, 243 sizeof(iplused[IPL_LOGSTATE])); 244#endif 245 break; 246 default : 247 error = EINVAL; 248 break; 249 } 250 return error; 251} 252 253 254/* 255 * Create a new ipstate structure and hang it off the hash table. 256 */ 257ipstate_t *fr_addstate(ip, fin, flags) 258ip_t *ip; 259fr_info_t *fin; 260u_int flags; 261{ 262 register ipstate_t *is; 263 register u_int hv; 264 ipstate_t ips; 265 u_int pass; 266 267 if ((ip->ip_off & IP_OFFMASK) || (fin->fin_fi.fi_fl & FI_SHORT)) 268 return NULL; 269 if (ips_num == fr_statemax) { 270 ips_stats.iss_max++; 271 fr_state_doflush = 1; 272 return NULL; 273 } 274 is = &ips; 275 bzero((char *)is, sizeof(*is)); 276 ips.is_age = 1; 277 ips.is_state[0] = 0; 278 ips.is_state[1] = 0; 279 /* 280 * Copy and calculate... 281 */ 282 hv = (is->is_p = ip->ip_p); 283 hv += (is->is_src.s_addr = ip->ip_src.s_addr); 284 hv += (is->is_dst.s_addr = ip->ip_dst.s_addr); 285 286 switch (ip->ip_p) 287 { 288 case IPPROTO_ICMP : 289 { 290 struct icmp *ic = (struct icmp *)fin->fin_dp; 291 292 switch (ic->icmp_type) 293 { 294 case ICMP_ECHO : 295 is->is_icmp.ics_type = ICMP_ECHOREPLY; /* XXX */ 296 hv += (is->is_icmp.ics_id = ic->icmp_id); 297 hv += (is->is_icmp.ics_seq = ic->icmp_seq); 298 break; 299 case ICMP_TSTAMP : 300 case ICMP_IREQ : 301 case ICMP_MASKREQ : 302 is->is_icmp.ics_type = ic->icmp_type + 1; 303 break; 304 default : 305 return NULL; 306 } 307 ATOMIC_INC(ips_stats.iss_icmp); 308 is->is_age = fr_icmptimeout; 309 break; 310 } 311 case IPPROTO_TCP : 312 { 313 register tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; 314 315 /* 316 * The endian of the ports doesn't matter, but the ack and 317 * sequence numbers do as we do mathematics on them later. 318 */ 319 is->is_dport = tcp->th_dport; 320 is->is_sport = tcp->th_sport; 321 if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { 322 hv += tcp->th_dport; 323 hv += tcp->th_sport; 324 } 325 if (tcp->th_seq != 0) { 326 is->is_send = ntohl(tcp->th_seq) + ip->ip_len - 327 fin->fin_hlen - (tcp->th_off << 2) + 328 ((tcp->th_flags & TH_SYN) ? 1 : 0) + 329 ((tcp->th_flags & TH_FIN) ? 1 : 0); 330 is->is_maxsend = is->is_send + 1; 331 } 332 is->is_dend = 0; 333 is->is_maxswin = ntohs(tcp->th_win); 334 if (is->is_maxswin == 0) 335 is->is_maxswin = 1; 336 /* 337 * If we're creating state for a starting connection, start the 338 * timer on it as we'll never see an error if it fails to 339 * connect. 340 */ 341 MUTEX_ENTER(&ipf_rw); 342 ips_stats.iss_tcp++; 343 fr_tcp_age(&is->is_age, is->is_state, ip, fin, 344 tcp->th_sport == is->is_sport); 345 MUTEX_EXIT(&ipf_rw); 346 break; 347 } 348 case IPPROTO_UDP : 349 { 350 register tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; 351 352 is->is_dport = tcp->th_dport; 353 is->is_sport = tcp->th_sport; 354 if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { 355 hv += tcp->th_dport; 356 hv += tcp->th_sport; 357 } 358 ATOMIC_INC(ips_stats.iss_udp); 359 is->is_age = fr_udptimeout; 360 break; 361 } 362 default : 363 return NULL; 364 } 365 366 KMALLOC(is, ipstate_t *); 367 if (is == NULL) { 368 ATOMIC_INC(ips_stats.iss_nomem); 369 return NULL; 370 } 371 bcopy((char *)&ips, (char *)is, sizeof(*is)); 372 hv %= fr_statesize; 373 RW_UPGRADE(&ipf_mutex); 374 is->is_rule = fin->fin_fr; 375 if (is->is_rule != NULL) { 376 is->is_rule->fr_ref++; 377 pass = is->is_rule->fr_flags; 378 } else 379 pass = fr_flags; 380 MUTEX_DOWNGRADE(&ipf_mutex); 381 WRITE_ENTER(&ipf_state); 382 383 is->is_rout = pass & FR_OUTQUE ? 1 : 0; 384 is->is_pass = pass; 385 is->is_pkts = 1; 386 is->is_bytes = ip->ip_len; 387 /* 388 * We want to check everything that is a property of this packet, 389 * but we don't (automatically) care about it's fragment status as 390 * this may change. 391 */ 392 is->is_opt = fin->fin_fi.fi_optmsk; 393 is->is_optmsk = 0xffffffff; 394 is->is_sec = fin->fin_fi.fi_secmsk; 395 is->is_secmsk = 0xffff; 396 is->is_auth = fin->fin_fi.fi_auth; 397 is->is_authmsk = 0xffff; 398 is->is_flags = fin->fin_fi.fi_fl & FI_CMP; 399 is->is_flags |= FI_CMP << 4; 400 is->is_flags |= flags & (FI_W_DPORT|FI_W_SPORT); 401 /* 402 * add into table. 403 */ 404 is->is_next = ips_table[hv]; 405 ips_table[hv] = is; 406 if (is->is_next == NULL) 407 ips_stats.iss_inuse++; 408 if (fin->fin_out) { 409 is->is_ifpin = NULL; 410 is->is_ifpout = fin->fin_ifp; 411 } else { 412 is->is_ifpin = fin->fin_ifp; 413 is->is_ifpout = NULL; 414 } 415 if (pass & FR_LOGFIRST) 416 is->is_pass &= ~(FR_LOGFIRST|FR_LOG); 417 ATOMIC_INC(ips_num); 418#ifdef IPFILTER_LOG 419 ipstate_log(is, ISL_NEW); 420#endif 421 RWLOCK_EXIT(&ipf_state); 422 fin->fin_rev = (is->is_dst.s_addr != ip->ip_dst.s_addr); 423 if (fin->fin_fi.fi_fl & FI_FRAG) 424 ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); 425 return is; 426} 427 428 429 430/* 431 * check to see if a packet with TCP headers fits within the TCP window. 432 * change timeout depending on whether new packet is a SYN-ACK returning for a 433 * SYN or a RST or FIN which indicate time to close up shop. 434 */ 435int fr_tcpstate(is, fin, ip, tcp) 436register ipstate_t *is; 437fr_info_t *fin; 438ip_t *ip; 439tcphdr_t *tcp; 440{ 441 register tcp_seq seq, ack, end; 442 register int ackskew; 443 tcpdata_t *fdata, *tdata; 444 u_short win, maxwin; 445 int ret = 0; 446 int source; 447 448 /* 449 * Find difference between last checked packet and this packet. 450 */ 451 source = (ip->ip_src.s_addr == is->is_src.s_addr); 452 fdata = &is->is_tcp.ts_data[!source]; 453 tdata = &is->is_tcp.ts_data[source]; 454 seq = ntohl(tcp->th_seq); 455 ack = ntohl(tcp->th_ack); 456 win = ntohs(tcp->th_win); 457 end = seq + ip->ip_len - fin->fin_hlen - (tcp->th_off << 2) + 458 ((tcp->th_flags & TH_SYN) ? 1 : 0) + 459 ((tcp->th_flags & TH_FIN) ? 1 : 0); 460 461 if (fdata->td_end == 0) { 462 /* 463 * Must be a (outgoing) SYN-ACK in reply to a SYN. 464 */ 465 fdata->td_end = end; 466 fdata->td_maxwin = 1; 467 fdata->td_maxend = end + 1; 468 } 469 470 if (!(tcp->th_flags & TH_ACK)) { /* Pretend an ack was sent */ 471 ack = tdata->td_end; 472 win = 1; 473 if ((tcp->th_flags == TH_SYN) && (tdata->td_maxwin == 0)) 474 tdata->td_maxwin = 1; 475 } else if (((tcp->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) && 476 (ack == 0)) { 477 /* gross hack to get around certain broken tcp stacks */ 478 ack = tdata->td_end; 479 } 480 481 if (seq == end) 482 seq = end = fdata->td_end; 483 484 maxwin = tdata->td_maxwin; 485 ackskew = tdata->td_end - ack; 486 487#define SEQ_GE(a,b) ((int)((a) - (b)) >= 0) 488#define SEQ_GT(a,b) ((int)((a) - (b)) > 0) 489 if ((SEQ_GE(fdata->td_maxend, end)) && 490 (SEQ_GE(seq, fdata->td_end - maxwin)) && 491/* XXX what about big packets */ 492#define MAXACKWINDOW 66000 493 (ackskew >= -MAXACKWINDOW) && 494 (ackskew <= MAXACKWINDOW)) { 495 /* if ackskew < 0 then this should be due to fragented 496 * packets. There is no way to know the length of the 497 * total packet in advance. 498 * We do know the total length from the fragment cache though. 499 * Note however that there might be more sessions with 500 * exactly the same source and destination paramters in the 501 * state cache (and source and destination is the only stuff 502 * that is saved in the fragment cache). Note further that 503 * some TCP connections in the state cache are hashed with 504 * sport and dport as well which makes it not worthwhile to 505 * look for them. 506 * Thus, when ackskew is negative but still seems to belong 507 * to this session, we bump up the destinations end value. 508 */ 509 if (ackskew < 0) 510 tdata->td_end = ack; 511 512 /* update max window seen */ 513 if (fdata->td_maxwin < win) 514 fdata->td_maxwin = win; 515 if (SEQ_GT(end, fdata->td_end)) 516 fdata->td_end = end; 517 if (SEQ_GE(ack + win, tdata->td_maxend)) { 518 tdata->td_maxend = ack + win; 519 if (win == 0) 520 tdata->td_maxend++; 521 } 522 523 ATOMIC_INC(ips_stats.iss_hits); 524 is->is_pkts++; 525 is->is_bytes += ip->ip_len; 526 /* 527 * Nearing end of connection, start timeout. 528 */ 529 MUTEX_ENTER(&ipf_rw); 530 fr_tcp_age(&is->is_age, is->is_state, ip, fin, source); 531 MUTEX_EXIT(&ipf_rw); 532 ret = 1; 533 } 534 return ret; 535} 536 537 538static int fr_matchsrcdst(is, src, dst, fin, tcp) 539ipstate_t *is; 540struct in_addr src, dst; 541fr_info_t *fin; 542tcphdr_t *tcp; 543{ 544 int ret = 0, rev, out, flags; 545 u_short sp, dp; 546 void *ifp; 547 548 rev = fin->fin_rev = (is->is_dst.s_addr != dst.s_addr); 549 ifp = fin->fin_ifp; 550 out = fin->fin_out; 551 552 if (tcp != NULL) { 553 flags = is->is_flags; 554 sp = tcp->th_sport; 555 dp = tcp->th_dport; 556 } else { 557 flags = 0; 558 sp = 0; 559 dp = 0; 560 } 561 562 if (rev == 0) { 563 if (!out) { 564 if (is->is_ifpin == ifp) 565 ret = 1; 566 } else { 567 if (is->is_ifpout == NULL || is->is_ifpout == ifp) 568 ret = 1; 569 } 570 } else { 571 if (out) { 572 if (is->is_ifpin == ifp) 573 ret = 1; 574 } else { 575 if (is->is_ifpout == NULL || is->is_ifpout == ifp) 576 ret = 1; 577 } 578 } 579 if (ret == 0) 580 return 0; 581 ret = 0; 582 583 if (rev == 0) { 584 if ((is->is_dst.s_addr == dst.s_addr) && 585 (is->is_src.s_addr == src.s_addr) && 586 (!tcp || ((sp == is->is_sport || flags & FI_W_SPORT) && 587 (dp == is->is_dport || flags & FI_W_DPORT)))) { 588 ret = 1; 589 } 590 } else { 591 if ((is->is_dst.s_addr == src.s_addr) && 592 (is->is_src.s_addr == dst.s_addr) && 593 (!tcp || ((sp == is->is_dport || flags & FI_W_DPORT) && 594 (dp == is->is_sport || flags & FI_W_SPORT)))) { 595 ret = 1; 596 } 597 } 598 if (ret == 0) 599 return 0; 600 601 /* 602 * Whether or not this should be here, is questionable, but the aim 603 * is to get this out of the main line. 604 */ 605 if (tcp == NULL) 606 flags = is->is_flags & (FI_CMP|(FI_CMP<<4)); 607 608 if (((fin->fin_fi.fi_fl & (flags >> 4)) != (flags & FI_CMP)) || 609 ((fin->fin_fi.fi_optmsk & is->is_optmsk) != is->is_opt) || 610 ((fin->fin_fi.fi_secmsk & is->is_secmsk) != is->is_sec) || 611 ((fin->fin_fi.fi_auth & is->is_authmsk) != is->is_auth)) 612 return 0; 613 614 if ((flags & (FI_W_SPORT|FI_W_DPORT))) { 615 if ((flags & FI_W_SPORT) != 0) { 616 if (rev == 0) { 617 is->is_sport = sp; 618 is->is_send = htonl(tcp->th_seq); 619 } else { 620 is->is_sport = dp; 621 is->is_send = htonl(tcp->th_ack); 622 } 623 is->is_maxsend = is->is_send + 1; 624 } else if ((flags & FI_W_DPORT) != 0) { 625 if (rev == 0) { 626 is->is_dport = dp; 627 is->is_dend = htonl(tcp->th_ack); 628 } else { 629 is->is_dport = sp; 630 is->is_dend = htonl(tcp->th_seq); 631 } 632 is->is_maxdend = is->is_dend + 1; 633 } 634 is->is_flags &= ~(FI_W_SPORT|FI_W_DPORT); 635 } 636 637 if (!rev) { 638 if (out && (out == is->is_rout)) { 639 if (!is->is_ifpout) 640 is->is_ifpout = ifp; 641 } else { 642 if (!is->is_ifpin) 643 is->is_ifpin = ifp; 644 } 645 } else { 646 if (!out && (out != is->is_rout)) { 647 if (!is->is_ifpin) 648 is->is_ifpin = ifp; 649 } else { 650 if (!is->is_ifpout) 651 is->is_ifpout = ifp; 652 } 653 } 654 return 1; 655} 656 657frentry_t *fr_checkicmpmatchingstate(ip, fin) 658ip_t *ip; 659fr_info_t *fin; 660{ 661 register struct in_addr dst, src; 662 register ipstate_t *is, **isp; 663 register u_short sport, dport; 664 register u_char pr; 665 struct icmp *ic; 666 u_short savelen; 667 fr_info_t ofin; 668 tcphdr_t *tcp; 669 icmphdr_t *icmp; 670 frentry_t *fr; 671 ip_t *oip; 672 int type; 673 u_int hv; 674 675 /* 676 * Does it at least have the return (basic) IP header ? 677 * Only a basic IP header (no options) should be with 678 * an ICMP error header. 679 */ 680 if ((ip->ip_hl != 5) || (ip->ip_len < ICMPERR_MINPKTLEN)) 681 return NULL; 682 ic = (struct icmp *)((char *)ip + fin->fin_hlen); 683 type = ic->icmp_type; 684 /* 685 * If it's not an error type, then return 686 */ 687 if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) && 688 (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) && 689 (type != ICMP_PARAMPROB)) 690 return NULL; 691 692 oip = (ip_t *)((char *)fin->fin_dp + ICMPERR_ICMPHLEN); 693 if (ip->ip_len < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2)) 694 return NULL; 695 696 if (oip->ip_p == IPPROTO_ICMP) { 697 698 icmp = (icmphdr_t *)((char *)oip + (oip->ip_hl << 2)); 699 700 /* 701 * a ICMP error can only be generated as a result of an 702 * ICMP query, not as the response on an ICMP error 703 * 704 * XXX theoretically ICMP_ECHOREP and the other reply's are 705 * ICMP query's as well, but adding them here seems strange XXX 706 */ 707 if ((icmp->icmp_type != ICMP_ECHO) && 708 (icmp->icmp_type != ICMP_TSTAMP) && 709 (icmp->icmp_type != ICMP_IREQ) && 710 (icmp->icmp_type != ICMP_MASKREQ)) 711 return NULL; 712 713 /* 714 * perform a lookup of the ICMP packet in the state table 715 */ 716 717 hv = (pr = oip->ip_p); 718 hv += (src.s_addr = oip->ip_src.s_addr); 719 hv += (dst.s_addr = oip->ip_dst.s_addr); 720 if (icmp->icmp_type == ICMP_ECHO) { 721 hv += icmp->icmp_id; 722 hv += icmp->icmp_seq; 723 } 724 hv %= fr_statesize; 725 726 oip->ip_len = ntohs(oip->ip_len); 727 fr_makefrip(oip->ip_hl << 2, oip, &ofin); 728 oip->ip_len = htons(oip->ip_len); 729 ofin.fin_ifp = fin->fin_ifp; 730 ofin.fin_out = !fin->fin_out; 731 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ 732 733 READ_ENTER(&ipf_state); 734 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_next) 735 if ((is->is_p == pr) && 736 fr_matchsrcdst(is, src, dst, &ofin, NULL)) { 737 /* 738 * in the state table ICMP query's are stored 739 * with the type of the corresponding ICMP 740 * response. Correct here 741 */ 742 if (((is->is_type == ICMP_ECHOREPLY) && 743 (icmp->icmp_id == is->is_icmp.ics_id) && 744 (icmp->icmp_seq == is->is_icmp.ics_seq) && 745 (icmp->icmp_type == ICMP_ECHO)) || 746 (is->is_type - 1 == ic->icmp_type)) { 747 ips_stats.iss_hits++; 748 is->is_pkts++; 749 is->is_bytes += ip->ip_len; 750 fr = is->is_rule; 751 RWLOCK_EXIT(&ipf_state); 752 return fr; 753 } 754 } 755 RWLOCK_EXIT(&ipf_state); 756 return NULL; 757 }; 758 759 if ((oip->ip_p != IPPROTO_TCP) && (oip->ip_p != IPPROTO_UDP)) 760 return NULL; 761 762 tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2)); 763 dport = tcp->th_dport; 764 sport = tcp->th_sport; 765 766 hv = (pr = oip->ip_p); 767 hv += (src.s_addr = oip->ip_src.s_addr); 768 hv += (dst.s_addr = oip->ip_dst.s_addr); 769 hv += dport; 770 hv += sport; 771 hv %= fr_statesize; 772 /* 773 * we make an fin entry to be able to feed it to 774 * matchsrcdst note that not all fields are encessary 775 * but this is the cleanest way. Note further we fill 776 * in fin_mp such that if someone uses it we'll get 777 * a kernel panic. fr_matchsrcdst does not use this. 778 * 779 * watch out here, as ip is in host order and oip in network 780 * order. Any change we make must be undone afterwards. 781 */ 782 savelen = oip->ip_len; 783 oip->ip_len = ip->ip_len - (ip->ip_hl << 2) - ICMPERR_ICMPHLEN; 784 fr_makefrip(oip->ip_hl << 2, oip, &ofin); 785 oip->ip_len = savelen; 786 ofin.fin_ifp = fin->fin_ifp; 787 ofin.fin_out = !fin->fin_out; 788 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ 789 READ_ENTER(&ipf_state); 790 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_next) { 791 /* 792 * Only allow this icmp though if the 793 * encapsulated packet was allowed through the 794 * other way around. Note that the minimal amount 795 * of info present does not allow for checking against 796 * tcp internals such as seq and ack numbers. 797 */ 798 if ((is->is_p == pr) && 799 fr_matchsrcdst(is, src, dst, &ofin, tcp)) { 800 fr = is->is_rule; 801 ips_stats.iss_hits++; 802 /* 803 * we must swap src and dst here because the icmp 804 * comes the other way around 805 */ 806 is->is_pkts++; 807 is->is_bytes += ip->ip_len; 808 /* 809 * we deliberately do not touch the timeouts 810 * for the accompanying state table entry. 811 * It remains to be seen if that is correct. XXX 812 */ 813 RWLOCK_EXIT(&ipf_state); 814 return fr; 815 } 816 } 817 RWLOCK_EXIT(&ipf_state); 818 return NULL; 819} 820 821/* 822 * Check if a packet has a registered state. 823 */ 824frentry_t *fr_checkstate(ip, fin) 825ip_t *ip; 826fr_info_t *fin; 827{ 828 register struct in_addr dst, src; 829 register ipstate_t *is, **isp; 830 register u_char pr; 831 u_int hv, hvm, hlen, tryagain, pass; 832 struct icmp *ic; 833 frentry_t *fr; 834 tcphdr_t *tcp; 835 836 if ((ip->ip_off & IP_OFFMASK) || (fin->fin_fi.fi_fl & FI_SHORT)) 837 return NULL; 838 839 is = NULL; 840 hlen = fin->fin_hlen; 841 tcp = (tcphdr_t *)((char *)ip + hlen); 842 ic = (struct icmp *)tcp; 843 hv = (pr = ip->ip_p); 844 hv += (src.s_addr = ip->ip_src.s_addr); 845 hv += (dst.s_addr = ip->ip_dst.s_addr); 846 847 /* 848 * Search the hash table for matching packet header info. 849 */ 850 switch (ip->ip_p) 851 { 852 case IPPROTO_ICMP : 853 if ((ic->icmp_type == ICMP_ECHO) || 854 (ic->icmp_type == ICMP_ECHOREPLY)) { 855 hv += ic->icmp_id; 856 hv += ic->icmp_seq; 857 } 858 hv %= fr_statesize; 859 READ_ENTER(&ipf_state); 860 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_next) 861 if ((is->is_p == pr) && 862 fr_matchsrcdst(is, src, dst, fin, NULL)) { 863 if ((is->is_type == ICMP_ECHOREPLY) && 864 (ic->icmp_type == ICMP_ECHO) && 865 (ic->icmp_id == is->is_icmp.ics_id) && 866 (ic->icmp_seq == is->is_icmp.ics_seq)) 867 ; 868 else if (is->is_type != ic->icmp_type) 869 continue; 870 is->is_age = fr_icmptimeout; 871 break; 872 } 873 if (is != NULL) 874 break; 875 RWLOCK_EXIT(&ipf_state); 876 /* 877 * No matching icmp state entry. Perhaps this is a 878 * response to another state entry. 879 */ 880 fr = fr_checkicmpmatchingstate(ip, fin); 881 if (fr) 882 return fr; 883 break; 884 case IPPROTO_TCP : 885 { 886 register u_short dport = tcp->th_dport, sport = tcp->th_sport; 887 888 tryagain = 0; 889retry_tcp: 890 hvm = hv % fr_statesize; 891 WRITE_ENTER(&ipf_state); 892 for (isp = &ips_table[hvm]; (is = *isp); 893 isp = &is->is_next) 894 if ((is->is_p == pr) && 895 fr_matchsrcdst(is, src, dst, fin, tcp)) { 896 if (fr_tcpstate(is, fin, ip, tcp)) { 897#ifndef _KERNEL 898 if (tcp->th_flags & TCP_CLOSE) { 899 *isp = is->is_next; 900 isp = &ips_table[hvm]; 901 if (ips_table[hvm] == NULL) 902 ips_stats.iss_inuse--; 903 fr_delstate(is); 904 ips_num--; 905 } 906#endif 907 break; 908 } 909 is = NULL; 910 break; 911 } 912 if (is != NULL) 913 break; 914 RWLOCK_EXIT(&ipf_state); 915 hv += dport; 916 hv += sport; 917 if (tryagain == 0) { 918 tryagain = 1; 919 goto retry_tcp; 920 } 921 break; 922 } 923 case IPPROTO_UDP : 924 { 925 register u_short dport = tcp->th_dport, sport = tcp->th_sport; 926 927 tryagain = 0; 928retry_udp: 929 hvm = hv % fr_statesize; 930 /* 931 * Nothing else to match on but ports. and IP#'s 932 */ 933 READ_ENTER(&ipf_state); 934 for (is = ips_table[hvm]; is; is = is->is_next) 935 if ((is->is_p == pr) && 936 fr_matchsrcdst(is, src, dst, fin, tcp)) { 937 is->is_age = fr_udptimeout; 938 break; 939 } 940 if (is != NULL) 941 break; 942 RWLOCK_EXIT(&ipf_state); 943 hv += dport; 944 hv += sport; 945 if (tryagain == 0) { 946 tryagain = 1; 947 goto retry_udp; 948 } 949 break; 950 } 951 default : 952 break; 953 } 954 if (is == NULL) { 955 ATOMIC_INC(ips_stats.iss_miss); 956 return NULL; 957 } 958 MUTEX_ENTER(&ipf_rw); 959 is->is_bytes += ip->ip_len; 960 ips_stats.iss_hits++; 961 is->is_pkts++; 962 MUTEX_EXIT(&ipf_rw); 963 fr = is->is_rule; 964 fin->fin_fr = fr; 965 pass = is->is_pass; 966 RWLOCK_EXIT(&ipf_state); 967 if (fin->fin_fi.fi_fl & FI_FRAG) 968 ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); 969 return fr; 970} 971 972 973static void fr_delstate(is) 974ipstate_t *is; 975{ 976 frentry_t *fr; 977 978 fr = is->is_rule; 979 if (fr != NULL) { 980 ATOMIC_DEC(fr->fr_ref); 981 if (fr->fr_ref == 0) 982 KFREE(fr); 983 } 984 KFREE(is); 985} 986 987 988/* 989 * Free memory in use by all state info. kept. 990 */ 991void fr_stateunload() 992{ 993 register int i; 994 register ipstate_t *is, **isp; 995 996 WRITE_ENTER(&ipf_state); 997 for (i = fr_statesize - 1; i >= 0; i--) 998 for (isp = &ips_table[i]; (is = *isp); ) { 999 *isp = is->is_next; 1000 fr_delstate(is); 1001 ips_num--; 1002 } 1003 ips_stats.iss_inuse = 0; 1004 ips_num = 0; 1005 RWLOCK_EXIT(&ipf_state); 1006 KFREES(ips_table, fr_statesize * sizeof(ipstate_t *)); 1007 ips_table = NULL; 1008} 1009 1010 1011/* 1012 * Slowly expire held state for thingslike UDP and ICMP. Timeouts are set 1013 * in expectation of this being called twice per second. 1014 */ 1015void fr_timeoutstate() 1016{ 1017 register int i; 1018 register ipstate_t *is, **isp; 1019#if defined(_KERNEL) && !SOLARIS 1020 int s; 1021#endif 1022 1023 SPL_NET(s); 1024 WRITE_ENTER(&ipf_state); 1025 for (i = fr_statesize - 1; i >= 0; i--) 1026 for (isp = &ips_table[i]; (is = *isp); ) 1027 if (is->is_age && !--is->is_age) { 1028 *isp = is->is_next; 1029 if (is->is_p == IPPROTO_TCP) 1030 ips_stats.iss_fin++; 1031 else 1032 ips_stats.iss_expire++; 1033 if (ips_table[i] == NULL) 1034 ips_stats.iss_inuse--; 1035#ifdef IPFILTER_LOG 1036 ipstate_log(is, ISL_EXPIRE); 1037#endif 1038 fr_delstate(is); 1039 ips_num--; 1040 } else 1041 isp = &is->is_next; 1042 RWLOCK_EXIT(&ipf_state); 1043 SPL_X(s); 1044 if (fr_state_doflush) { 1045 (void) fr_state_flush(1); 1046 fr_state_doflush = 0; 1047 } 1048} 1049 1050 1051/* 1052 * Original idea freom Pradeep Krishnan for use primarily with NAT code. 1053 * (pkrishna@netcom.com) 1054 */ 1055void fr_tcp_age(age, state, ip, fin, dir) 1056u_long *age; 1057u_char *state; 1058ip_t *ip; 1059fr_info_t *fin; 1060int dir; 1061{ 1062 tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; 1063 u_char flags = tcp->th_flags; 1064 int dlen, ostate; 1065 1066 ostate = state[1 - dir]; 1067 1068 dlen = ip->ip_len - fin->fin_hlen - (tcp->th_off << 2); 1069 1070 if (flags & TH_RST) { 1071 if (!(tcp->th_flags & TH_PUSH) && !dlen) { 1072 *age = fr_tcpclosed; 1073 state[dir] = TCPS_CLOSED; 1074 } else { 1075 *age = fr_tcpclosewait; 1076 state[dir] = TCPS_CLOSE_WAIT; 1077 } 1078 return; 1079 } 1080 1081 *age = fr_tcptimeout; /* 1 min */ 1082 1083 switch(state[dir]) 1084 { 1085 case TCPS_CLOSED: 1086 if ((flags & (TH_FIN|TH_SYN|TH_RST|TH_ACK)) == TH_ACK) { 1087 state[dir] = TCPS_ESTABLISHED; 1088 *age = fr_tcpidletimeout; 1089 } 1090 case TCPS_FIN_WAIT_2: 1091 if ((flags & TH_OPENING) == TH_OPENING) 1092 state[dir] = TCPS_SYN_RECEIVED; 1093 else if (flags & TH_SYN) 1094 state[dir] = TCPS_SYN_SENT; 1095 break; 1096 case TCPS_SYN_RECEIVED: 1097 case TCPS_SYN_SENT: 1098 if ((flags & (TH_FIN|TH_ACK)) == TH_ACK) { 1099 state[dir] = TCPS_ESTABLISHED; 1100 *age = fr_tcpidletimeout; 1101 } else if ((flags & (TH_FIN|TH_ACK)) == (TH_FIN|TH_ACK)) { 1102 state[dir] = TCPS_CLOSE_WAIT; 1103 if (!(flags & TH_PUSH) && !dlen && 1104 ostate > TCPS_ESTABLISHED) 1105 *age = fr_tcplastack; 1106 else 1107 *age = fr_tcpclosewait; 1108 } 1109 break; 1110 case TCPS_ESTABLISHED: 1111 if (flags & TH_FIN) { 1112 state[dir] = TCPS_CLOSE_WAIT; 1113 if (!(flags & TH_PUSH) && !dlen && 1114 ostate > TCPS_ESTABLISHED) 1115 *age = fr_tcplastack; 1116 else 1117 *age = fr_tcpclosewait; 1118 } else { 1119 if (ostate < TCPS_CLOSE_WAIT) 1120 *age = fr_tcpidletimeout; 1121 } 1122 break; 1123 case TCPS_CLOSE_WAIT: 1124 if ((flags & TH_FIN) && !(flags & TH_PUSH) && !dlen && 1125 ostate > TCPS_ESTABLISHED) { 1126 *age = fr_tcplastack; 1127 state[dir] = TCPS_LAST_ACK; 1128 } else 1129 *age = fr_tcpclosewait; 1130 break; 1131 case TCPS_LAST_ACK: 1132 if (flags & TH_ACK) { 1133 state[dir] = TCPS_FIN_WAIT_2; 1134 if (!(flags & TH_PUSH) && !dlen && 1135 ostate > TCPS_ESTABLISHED) 1136 *age = fr_tcplastack; 1137 else { 1138 *age = fr_tcpclosewait; 1139 state[dir] = TCPS_CLOSE_WAIT; 1140 } 1141 } 1142 break; 1143 } 1144} 1145 1146 1147#ifdef IPFILTER_LOG 1148void ipstate_log(is, type) 1149struct ipstate *is; 1150u_int type; 1151{ 1152 struct ipslog ipsl; 1153 void *items[1]; 1154 size_t sizes[1]; 1155 int types[1]; 1156 1157 ipsl.isl_type = type; 1158 ipsl.isl_pkts = is->is_pkts; 1159 ipsl.isl_bytes = is->is_bytes; 1160 ipsl.isl_src = is->is_src; 1161 ipsl.isl_dst = is->is_dst; 1162 ipsl.isl_p = is->is_p; 1163 ipsl.isl_flags = is->is_flags; 1164 if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) { 1165 ipsl.isl_sport = is->is_sport; 1166 ipsl.isl_dport = is->is_dport; 1167 if (ipsl.isl_p == IPPROTO_TCP) { 1168 ipsl.isl_state[0] = is->is_state[0]; 1169 ipsl.isl_state[1] = is->is_state[1]; 1170 } 1171 } else if (ipsl.isl_p == IPPROTO_ICMP) 1172 ipsl.isl_itype = is->is_icmp.ics_type; 1173 else { 1174 ipsl.isl_ps.isl_filler[0] = 0; 1175 ipsl.isl_ps.isl_filler[1] = 0; 1176 } 1177 items[0] = &ipsl; 1178 sizes[0] = sizeof(ipsl); 1179 types[0] = 0; 1180 1181 (void) ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1); 1182} 1183#endif 1184 1185 1186void ip_statesync(ifp) 1187void *ifp; 1188{ 1189 register ipstate_t *is; 1190 register int i; 1191 1192 WRITE_ENTER(&ipf_state); 1193 for (i = fr_statesize - 1; i >= 0; i--) 1194 for (is = ips_table[i]; is != NULL; is = is->is_next) { 1195 if (is->is_ifpin == ifp) 1196 is->is_ifpin = NULL; 1197 if (is->is_ifpout == ifp) 1198 is->is_ifpout = NULL; 1199 } 1200 RWLOCK_EXIT(&ipf_state); 1201} 1202