ip_frag.c revision 89336
1249259Sdim/* 2249259Sdim * Copyright (C) 1993-2001 by Darren Reed. 3249259Sdim * 4249259Sdim * See the IPFILTER.LICENCE file for details on licencing. 5249259Sdim */ 6249259Sdim#if defined(KERNEL) && !defined(_KERNEL) 7249259Sdim# define _KERNEL 8249259Sdim#endif 9249259Sdim 10249259Sdim#include <sys/errno.h> 11249259Sdim#include <sys/types.h> 12249259Sdim#include <sys/param.h> 13249259Sdim#include <sys/time.h> 14249259Sdim#include <sys/file.h> 15263508Sdim#if !defined(_KERNEL) && !defined(KERNEL) 16263508Sdim# include <stdio.h> 17263508Sdim# include <string.h> 18263508Sdim# include <stdlib.h> 19249259Sdim#endif 20249259Sdim#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) 21# include <sys/filio.h> 22# include <sys/fcntl.h> 23#else 24# include <sys/ioctl.h> 25#endif 26#include <sys/uio.h> 27#ifndef linux 28# include <sys/protosw.h> 29#endif 30#include <sys/socket.h> 31#if defined(_KERNEL) && !defined(linux) 32# include <sys/systm.h> 33#endif 34#if !defined(__SVR4) && !defined(__svr4__) 35# if defined(_KERNEL) && !defined(__sgi) 36# include <sys/kernel.h> 37# endif 38# ifndef linux 39# include <sys/mbuf.h> 40# endif 41#else 42# include <sys/byteorder.h> 43# ifdef _KERNEL 44# include <sys/dditypes.h> 45# endif 46# include <sys/stream.h> 47# include <sys/kmem.h> 48#endif 49#include <net/if.h> 50#ifdef sun 51# include <net/af.h> 52#endif 53#include <net/route.h> 54#include <netinet/in.h> 55#include <netinet/in_systm.h> 56#include <netinet/ip.h> 57#ifndef linux 58# include <netinet/ip_var.h> 59#endif 60#include <netinet/tcp.h> 61#include <netinet/udp.h> 62#include <netinet/ip_icmp.h> 63#include "netinet/ip_compat.h" 64#include <netinet/tcpip.h> 65#include "netinet/ip_fil.h" 66#include "netinet/ip_proxy.h" 67#include "netinet/ip_nat.h" 68#include "netinet/ip_frag.h" 69#include "netinet/ip_state.h" 70#include "netinet/ip_auth.h" 71#if (__FreeBSD_version >= 300000) 72# include <sys/malloc.h> 73# if (defined(KERNEL) || defined(_KERNEL)) 74# ifndef IPFILTER_LKM 75# include <sys/libkern.h> 76# include <sys/systm.h> 77# endif 78extern struct callout_handle ipfr_slowtimer_ch; 79# endif 80#endif 81#if defined(__NetBSD__) && (__NetBSD_Version__ >= 104230000) 82# include <sys/callout.h> 83extern struct callout ipfr_slowtimer_ch; 84#endif 85#if defined(__OpenBSD__) 86# include <sys/timeout.h> 87extern struct timeout ipfr_slowtimer_ch; 88#endif 89 90#if !defined(lint) 91static const char sccsid[] = "@(#)ip_frag.c 1.11 3/24/96 (C) 1993-2000 Darren Reed"; 92static const char rcsid[] = "@(#)$FreeBSD: head/sys/contrib/ipfilter/netinet/ip_frag.c 89336 2002-01-14 09:07:15Z alfred $"; 93#endif 94 95 96static ipfr_t *ipfr_heads[IPFT_SIZE]; 97static ipfr_t *ipfr_nattab[IPFT_SIZE]; 98static ipfrstat_t ipfr_stats; 99static int ipfr_inuse = 0; 100 101int fr_ipfrttl = 120; /* 60 seconds */ 102int fr_frag_lock = 0; 103 104#ifdef _KERNEL 105# if SOLARIS2 >= 7 106extern timeout_id_t ipfr_timer_id; 107# else 108extern int ipfr_timer_id; 109# endif 110#endif 111#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) 112extern KRWLOCK_T ipf_frag, ipf_natfrag, ipf_nat, ipf_mutex; 113# if SOLARIS 114extern KRWLOCK_T ipf_solaris; 115# else 116KRWLOCK_T ipf_solaris; 117# endif 118extern kmutex_t ipf_rw; 119#endif 120 121 122static ipfr_t *ipfr_new __P((ip_t *, fr_info_t *, u_int, ipfr_t **)); 123static ipfr_t *ipfr_lookup __P((ip_t *, fr_info_t *, ipfr_t **)); 124static void ipfr_delete __P((ipfr_t *)); 125 126 127ipfrstat_t *ipfr_fragstats() 128{ 129 ipfr_stats.ifs_table = ipfr_heads; 130 ipfr_stats.ifs_nattab = ipfr_nattab; 131 ipfr_stats.ifs_inuse = ipfr_inuse; 132 return &ipfr_stats; 133} 134 135 136/* 137 * add a new entry to the fragment cache, registering it as having come 138 * through this box, with the result of the filter operation. 139 */ 140static ipfr_t *ipfr_new(ip, fin, pass, table) 141ip_t *ip; 142fr_info_t *fin; 143u_int pass; 144ipfr_t *table[]; 145{ 146 ipfr_t **fp, *fra, frag; 147 u_int idx, off; 148 149 if (ipfr_inuse >= IPFT_SIZE) 150 return NULL; 151 152 if (!(fin->fin_fl & FI_FRAG)) 153 return NULL; 154 155 frag.ipfr_p = ip->ip_p; 156 idx = ip->ip_p; 157 frag.ipfr_id = ip->ip_id; 158 idx += ip->ip_id; 159 frag.ipfr_tos = ip->ip_tos; 160 frag.ipfr_src.s_addr = ip->ip_src.s_addr; 161 idx += ip->ip_src.s_addr; 162 frag.ipfr_dst.s_addr = ip->ip_dst.s_addr; 163 idx += ip->ip_dst.s_addr; 164 frag.ipfr_ifp = fin->fin_ifp; 165 idx *= 127; 166 idx %= IPFT_SIZE; 167 168 frag.ipfr_optmsk = fin->fin_fi.fi_optmsk & IPF_OPTCOPY; 169 frag.ipfr_secmsk = fin->fin_fi.fi_secmsk; 170 frag.ipfr_auth = fin->fin_fi.fi_auth; 171 172 /* 173 * first, make sure it isn't already there... 174 */ 175 for (fp = &table[idx]; (fra = *fp); fp = &fra->ipfr_next) 176 if (!bcmp((char *)&frag.ipfr_src, (char *)&fra->ipfr_src, 177 IPFR_CMPSZ)) { 178 ATOMIC_INCL(ipfr_stats.ifs_exists); 179 return NULL; 180 } 181 182 /* 183 * allocate some memory, if possible, if not, just record that we 184 * failed to do so. 185 */ 186 KMALLOC(fra, ipfr_t *); 187 if (fra == NULL) { 188 ATOMIC_INCL(ipfr_stats.ifs_nomem); 189 return NULL; 190 } 191 192 if ((fra->ipfr_rule = fin->fin_fr) != NULL) { 193 ATOMIC_INC32(fin->fin_fr->fr_ref); 194 } 195 196 197 /* 198 * Instert the fragment into the fragment table, copy the struct used 199 * in the search using bcopy rather than reassign each field. 200 * Set the ttl to the default and mask out logging from "pass" 201 */ 202 if ((fra->ipfr_next = table[idx])) 203 table[idx]->ipfr_prev = fra; 204 fra->ipfr_prev = NULL; 205 fra->ipfr_data = NULL; 206 table[idx] = fra; 207 bcopy((char *)&frag.ipfr_src, (char *)&fra->ipfr_src, IPFR_CMPSZ); 208 fra->ipfr_ttl = fr_ipfrttl; 209 /* 210 * Compute the offset of the expected start of the next packet. 211 */ 212 off = ip->ip_off & IP_OFFMASK; 213 if (!off) 214 fra->ipfr_seen0 = 1; 215 fra->ipfr_off = off + (fin->fin_dlen >> 3); 216 ATOMIC_INCL(ipfr_stats.ifs_new); 217 ATOMIC_INC32(ipfr_inuse); 218 return fra; 219} 220 221 222int ipfr_newfrag(ip, fin, pass) 223ip_t *ip; 224fr_info_t *fin; 225u_int pass; 226{ 227 ipfr_t *ipf; 228 229 if ((ip->ip_v != 4) || (fr_frag_lock)) 230 return -1; 231 WRITE_ENTER(&ipf_frag); 232 ipf = ipfr_new(ip, fin, pass, ipfr_heads); 233 RWLOCK_EXIT(&ipf_frag); 234 if (ipf == NULL) { 235 ATOMIC_INCL(frstats[fin->fin_out].fr_bnfr); 236 return -1; 237 } 238 ATOMIC_INCL(frstats[fin->fin_out].fr_nfr); 239 return 0; 240} 241 242 243int ipfr_nat_newfrag(ip, fin, pass, nat) 244ip_t *ip; 245fr_info_t *fin; 246u_int pass; 247nat_t *nat; 248{ 249 ipfr_t *ipf; 250 int off; 251 252 if ((ip->ip_v != 4) || (fr_frag_lock)) 253 return -1; 254 255 off = fin->fin_off; 256 off <<= 3; 257 if ((off + fin->fin_dlen) > 0xffff || (fin->fin_dlen == 0)) 258 return NULL; 259 260 WRITE_ENTER(&ipf_natfrag); 261 ipf = ipfr_new(ip, fin, pass, ipfr_nattab); 262 if (ipf != NULL) { 263 ipf->ipfr_data = nat; 264 nat->nat_data = ipf; 265 } 266 RWLOCK_EXIT(&ipf_natfrag); 267 return ipf ? 0 : -1; 268} 269 270 271/* 272 * check the fragment cache to see if there is already a record of this packet 273 * with its filter result known. 274 */ 275static ipfr_t *ipfr_lookup(ip, fin, table) 276ip_t *ip; 277fr_info_t *fin; 278ipfr_t *table[]; 279{ 280 ipfr_t *f, frag; 281 u_int idx; 282 283 /* 284 * For fragments, we record protocol, packet id, TOS and both IP#'s 285 * (these should all be the same for all fragments of a packet). 286 * 287 * build up a hash value to index the table with. 288 */ 289 frag.ipfr_p = ip->ip_p; 290 idx = ip->ip_p; 291 frag.ipfr_id = ip->ip_id; 292 idx += ip->ip_id; 293 frag.ipfr_tos = ip->ip_tos; 294 frag.ipfr_src.s_addr = ip->ip_src.s_addr; 295 idx += ip->ip_src.s_addr; 296 frag.ipfr_dst.s_addr = ip->ip_dst.s_addr; 297 idx += ip->ip_dst.s_addr; 298 frag.ipfr_ifp = fin->fin_ifp; 299 idx *= 127; 300 idx %= IPFT_SIZE; 301 302 frag.ipfr_optmsk = fin->fin_fi.fi_optmsk & IPF_OPTCOPY; 303 frag.ipfr_secmsk = fin->fin_fi.fi_secmsk; 304 frag.ipfr_auth = fin->fin_fi.fi_auth; 305 306 /* 307 * check the table, careful to only compare the right amount of data 308 */ 309 for (f = table[idx]; f; f = f->ipfr_next) 310 if (!bcmp((char *)&frag.ipfr_src, (char *)&f->ipfr_src, 311 IPFR_CMPSZ)) { 312 u_short atoff, off; 313 314 off = fin->fin_off; 315 316 /* 317 * XXX - We really need to be guarding against the 318 * retransmission of (src,dst,id,offset-range) here 319 * because a fragmented packet is never resent with 320 * the same IP ID#. 321 */ 322 if (f->ipfr_seen0) { 323 if (!off || (fin->fin_fl & FI_SHORT)) 324 continue; 325 } else if (!off) 326 f->ipfr_seen0 = 1; 327 328 if (f != table[idx]) { 329 /* 330 * move fragment info. to the top of the list 331 * to speed up searches. 332 */ 333 if ((f->ipfr_prev->ipfr_next = f->ipfr_next)) 334 f->ipfr_next->ipfr_prev = f->ipfr_prev; 335 f->ipfr_next = table[idx]; 336 table[idx]->ipfr_prev = f; 337 f->ipfr_prev = NULL; 338 table[idx] = f; 339 } 340 atoff = off + (fin->fin_dlen >> 3); 341 /* 342 * If we've follwed the fragments, and this is the 343 * last (in order), shrink expiration time. 344 */ 345 if (off == f->ipfr_off) { 346 if (!(ip->ip_off & IP_MF)) 347 f->ipfr_ttl = 1; 348 else 349 f->ipfr_off = atoff; 350 } 351 ATOMIC_INCL(ipfr_stats.ifs_hits); 352 return f; 353 } 354 return NULL; 355} 356 357 358/* 359 * functional interface for NAT lookups of the NAT fragment cache 360 */ 361nat_t *ipfr_nat_knownfrag(ip, fin) 362ip_t *ip; 363fr_info_t *fin; 364{ 365 ipfr_t *ipf; 366 nat_t *nat; 367 int off; 368 369 if ((fin->fin_v != 4) || (fr_frag_lock)) 370 return NULL; 371 372 off = fin->fin_off; 373 off <<= 3; 374 if ((off + fin->fin_dlen) > 0xffff || (fin->fin_dlen == 0)) 375 return NULL; 376 377 READ_ENTER(&ipf_natfrag); 378 ipf = ipfr_lookup(ip, fin, ipfr_nattab); 379 if (ipf != NULL) { 380 nat = ipf->ipfr_data; 381 /* 382 * This is the last fragment for this packet. 383 */ 384 if ((ipf->ipfr_ttl == 1) && (nat != NULL)) { 385 nat->nat_data = NULL; 386 ipf->ipfr_data = NULL; 387 } 388 } else 389 nat = NULL; 390 RWLOCK_EXIT(&ipf_natfrag); 391 return nat; 392} 393 394 395/* 396 * functional interface for normal lookups of the fragment cache 397 */ 398frentry_t *ipfr_knownfrag(ip, fin) 399ip_t *ip; 400fr_info_t *fin; 401{ 402 frentry_t *fr; 403 ipfr_t *fra; 404 int off; 405 406 if ((fin->fin_v != 4) || (fr_frag_lock)) 407 return NULL; 408 409 off = fin->fin_off; 410 off <<= 3; 411 if ((off + fin->fin_dlen) > 0xffff || (fin->fin_dlen == 0)) 412 return NULL; 413 414 READ_ENTER(&ipf_frag); 415 fra = ipfr_lookup(ip, fin, ipfr_heads); 416 if (fra != NULL) 417 fr = fra->ipfr_rule; 418 else 419 fr = NULL; 420 RWLOCK_EXIT(&ipf_frag); 421 return fr; 422} 423 424 425/* 426 * forget any references to this external object. 427 */ 428void ipfr_forget(nat) 429void *nat; 430{ 431 ipfr_t *fr; 432 int idx; 433 434 WRITE_ENTER(&ipf_natfrag); 435 for (idx = IPFT_SIZE - 1; idx >= 0; idx--) 436 for (fr = ipfr_heads[idx]; fr; fr = fr->ipfr_next) 437 if (fr->ipfr_data == nat) 438 fr->ipfr_data = NULL; 439 440 RWLOCK_EXIT(&ipf_natfrag); 441} 442 443 444static void ipfr_delete(fra) 445ipfr_t *fra; 446{ 447 frentry_t *fr; 448 449 fr = fra->ipfr_rule; 450 if (fr != NULL) { 451 ATOMIC_DEC32(fr->fr_ref); 452 if (fr->fr_ref == 0) 453 KFREE(fr); 454 } 455 if (fra->ipfr_prev) 456 fra->ipfr_prev->ipfr_next = fra->ipfr_next; 457 if (fra->ipfr_next) 458 fra->ipfr_next->ipfr_prev = fra->ipfr_prev; 459 KFREE(fra); 460} 461 462 463/* 464 * Free memory in use by fragment state info. kept. 465 */ 466void ipfr_unload() 467{ 468 ipfr_t **fp, *fra; 469 nat_t *nat; 470 int idx; 471 472 WRITE_ENTER(&ipf_frag); 473 for (idx = IPFT_SIZE - 1; idx >= 0; idx--) 474 for (fp = &ipfr_heads[idx]; (fra = *fp); ) { 475 *fp = fra->ipfr_next; 476 ipfr_delete(fra); 477 } 478 RWLOCK_EXIT(&ipf_frag); 479 480 WRITE_ENTER(&ipf_nat); 481 WRITE_ENTER(&ipf_natfrag); 482 for (idx = IPFT_SIZE - 1; idx >= 0; idx--) 483 for (fp = &ipfr_nattab[idx]; (fra = *fp); ) { 484 *fp = fra->ipfr_next; 485 nat = fra->ipfr_data; 486 if (nat != NULL) { 487 if (nat->nat_data == fra) 488 nat->nat_data = NULL; 489 } 490 ipfr_delete(fra); 491 } 492 RWLOCK_EXIT(&ipf_natfrag); 493 RWLOCK_EXIT(&ipf_nat); 494} 495 496 497#ifdef _KERNEL 498void ipfr_fragexpire() 499{ 500 ipfr_t **fp, *fra; 501 nat_t *nat; 502 int idx; 503#if defined(_KERNEL) 504# if !SOLARIS 505 int s; 506# endif 507#endif 508 509 if (fr_frag_lock) 510 return; 511 512 SPL_NET(s); 513 WRITE_ENTER(&ipf_frag); 514 515 /* 516 * Go through the entire table, looking for entries to expire, 517 * decreasing the ttl by one for each entry. If it reaches 0, 518 * remove it from the chain and free it. 519 */ 520 for (idx = IPFT_SIZE - 1; idx >= 0; idx--) 521 for (fp = &ipfr_heads[idx]; (fra = *fp); ) { 522 --fra->ipfr_ttl; 523 if (fra->ipfr_ttl == 0) { 524 *fp = fra->ipfr_next; 525 ipfr_delete(fra); 526 ATOMIC_INCL(ipfr_stats.ifs_expire); 527 ATOMIC_DEC32(ipfr_inuse); 528 } else 529 fp = &fra->ipfr_next; 530 } 531 RWLOCK_EXIT(&ipf_frag); 532 533 /* 534 * Same again for the NAT table, except that if the structure also 535 * still points to a NAT structure, and the NAT structure points back 536 * at the one to be free'd, NULL the reference from the NAT struct. 537 * NOTE: We need to grab both mutex's early, and in this order so as 538 * to prevent a deadlock if both try to expire at the same time. 539 */ 540 WRITE_ENTER(&ipf_nat); 541 WRITE_ENTER(&ipf_natfrag); 542 for (idx = IPFT_SIZE - 1; idx >= 0; idx--) 543 for (fp = &ipfr_nattab[idx]; (fra = *fp); ) { 544 --fra->ipfr_ttl; 545 if (fra->ipfr_ttl == 0) { 546 ATOMIC_INCL(ipfr_stats.ifs_expire); 547 ATOMIC_DEC32(ipfr_inuse); 548 nat = fra->ipfr_data; 549 if (nat != NULL) { 550 if (nat->nat_data == fra) 551 nat->nat_data = NULL; 552 } 553 *fp = fra->ipfr_next; 554 ipfr_delete(fra); 555 } else 556 fp = &fra->ipfr_next; 557 } 558 RWLOCK_EXIT(&ipf_natfrag); 559 RWLOCK_EXIT(&ipf_nat); 560 SPL_X(s); 561} 562 563 564/* 565 * Slowly expire held state for fragments. Timeouts are set * in expectation 566 * of this being called twice per second. 567 */ 568# if (BSD >= 199306) || SOLARIS || defined(__sgi) 569# if defined(SOLARIS2) && (SOLARIS2 < 7) 570void ipfr_slowtimer() 571# else 572void ipfr_slowtimer __P((void *ptr)) 573# endif 574# else 575int ipfr_slowtimer() 576# endif 577{ 578#if defined(_KERNEL) && SOLARIS 579 extern int fr_running; 580 581 if (fr_running <= 0) 582 return; 583#endif 584 585 READ_ENTER(&ipf_solaris); 586#ifdef __sgi 587 ipfilter_sgi_intfsync(); 588#endif 589 590 ipfr_fragexpire(); 591 fr_timeoutstate(); 592 ip_natexpire(); 593 fr_authexpire(); 594# if SOLARIS 595 ipfr_timer_id = timeout(ipfr_slowtimer, NULL, drv_usectohz(500000)); 596 RWLOCK_EXIT(&ipf_solaris); 597# else 598# if defined(__NetBSD__) && (__NetBSD_Version__ >= 104240000) 599 callout_reset(&ipfr_slowtimer_ch, hz / 2, ipfr_slowtimer, NULL); 600# else 601# if (__FreeBSD_version >= 300000) 602 ipfr_slowtimer_ch = timeout(ipfr_slowtimer, NULL, hz/2); 603# else 604# if defined(__OpenBSD_) 605 timeout_add(&ipfr_slowtimer_ch, hz/2, ipfr_slowtimer, NULL); 606# else 607 timeout(ipfr_slowtimer, NULL, hz/2); 608# endif 609# endif 610# if (BSD < 199306) && !defined(__sgi) 611 return 0; 612# endif /* FreeBSD */ 613# endif /* NetBSD */ 614# endif /* SOLARIS */ 615} 616#endif /* defined(_KERNEL) */ 617