ip_input.c revision 1.229
1/* $NetBSD: ip_input.c,v 1.229 2006/08/30 18:55:09 christos Exp $ */ 2 3/* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32/*- 33 * Copyright (c) 1998 The NetBSD Foundation, Inc. 34 * All rights reserved. 35 * 36 * This code is derived from software contributed to The NetBSD Foundation 37 * by Public Access Networks Corporation ("Panix"). It was developed under 38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. 
39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. All advertising materials mentioning features or use of this software 49 * must display the following acknowledgement: 50 * This product includes software developed by the NetBSD 51 * Foundation, Inc. and its contributors. 52 * 4. Neither the name of The NetBSD Foundation nor the names of its 53 * contributors may be used to endorse or promote products derived 54 * from this software without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 57 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 58 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 59 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 60 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 61 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 62 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 63 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 64 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 65 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 66 * POSSIBILITY OF SUCH DAMAGE. 67 */ 68 69/* 70 * Copyright (c) 1982, 1986, 1988, 1993 71 * The Regents of the University of California. All rights reserved. 
72 * 73 * Redistribution and use in source and binary forms, with or without 74 * modification, are permitted provided that the following conditions 75 * are met: 76 * 1. Redistributions of source code must retain the above copyright 77 * notice, this list of conditions and the following disclaimer. 78 * 2. Redistributions in binary form must reproduce the above copyright 79 * notice, this list of conditions and the following disclaimer in the 80 * documentation and/or other materials provided with the distribution. 81 * 3. Neither the name of the University nor the names of its contributors 82 * may be used to endorse or promote products derived from this software 83 * without specific prior written permission. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 86 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 * SUCH DAMAGE. 
96 * 97 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 98 */ 99 100#include <sys/cdefs.h> 101__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.229 2006/08/30 18:55:09 christos Exp $"); 102 103#include "opt_inet.h" 104#include "opt_gateway.h" 105#include "opt_pfil_hooks.h" 106#include "opt_ipsec.h" 107#include "opt_mrouting.h" 108#include "opt_mbuftrace.h" 109#include "opt_inet_csum.h" 110 111#include <sys/param.h> 112#include <sys/systm.h> 113#include <sys/malloc.h> 114#include <sys/mbuf.h> 115#include <sys/domain.h> 116#include <sys/protosw.h> 117#include <sys/socket.h> 118#include <sys/socketvar.h> 119#include <sys/errno.h> 120#include <sys/time.h> 121#include <sys/kernel.h> 122#include <sys/pool.h> 123#include <sys/sysctl.h> 124 125#include <net/if.h> 126#include <net/if_dl.h> 127#include <net/route.h> 128#include <net/pfil.h> 129 130#include <netinet/in.h> 131#include <netinet/in_systm.h> 132#include <netinet/ip.h> 133#include <netinet/in_pcb.h> 134#include <netinet/in_proto.h> 135#include <netinet/in_var.h> 136#include <netinet/ip_var.h> 137#include <netinet/ip_icmp.h> 138/* just for gif_ttl */ 139#include <netinet/in_gif.h> 140#include "gif.h" 141#include <net/if_gre.h> 142#include "gre.h" 143 144#ifdef MROUTING 145#include <netinet/ip_mroute.h> 146#endif 147 148#ifdef IPSEC 149#include <netinet6/ipsec.h> 150#include <netkey/key.h> 151#endif 152#ifdef FAST_IPSEC 153#include <netipsec/ipsec.h> 154#include <netipsec/key.h> 155#endif /* FAST_IPSEC*/ 156 157#ifndef IPFORWARDING 158#ifdef GATEWAY 159#define IPFORWARDING 1 /* forward IP packets not for us */ 160#else /* GATEWAY */ 161#define IPFORWARDING 0 /* don't forward IP packets not for us */ 162#endif /* GATEWAY */ 163#endif /* IPFORWARDING */ 164#ifndef IPSENDREDIRECTS 165#define IPSENDREDIRECTS 1 166#endif 167#ifndef IPFORWSRCRT 168#define IPFORWSRCRT 1 /* forward source-routed packets */ 169#endif 170#ifndef IPALLOWSRCRT 171#define IPALLOWSRCRT 1 /* allow source-routed packets */ 172#endif 173#ifndef IPMTUDISC 174#define 
IPMTUDISC 1 175#endif 176#ifndef IPMTUDISCTIMEOUT 177#define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */ 178#endif 179 180/* 181 * Note: DIRECTED_BROADCAST is handled this way so that previous 182 * configuration using this option will Just Work. 183 */ 184#ifndef IPDIRECTEDBCAST 185#ifdef DIRECTED_BROADCAST 186#define IPDIRECTEDBCAST 1 187#else 188#define IPDIRECTEDBCAST 0 189#endif /* DIRECTED_BROADCAST */ 190#endif /* IPDIRECTEDBCAST */ 191int ipforwarding = IPFORWARDING; 192int ipsendredirects = IPSENDREDIRECTS; 193int ip_defttl = IPDEFTTL; 194int ip_forwsrcrt = IPFORWSRCRT; 195int ip_directedbcast = IPDIRECTEDBCAST; 196int ip_allowsrcrt = IPALLOWSRCRT; 197int ip_mtudisc = IPMTUDISC; 198int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; 199#ifdef DIAGNOSTIC 200int ipprintfs = 0; 201#endif 202 203int ip_do_randomid = 0; 204 205/* 206 * XXX - Setting ip_checkinterface mostly implements the receive side of 207 * the Strong ES model described in RFC 1122, but since the routing table 208 * and transmit implementation do not implement the Strong ES model, 209 * setting this to 1 results in an odd hybrid. 210 * 211 * XXX - ip_checkinterface currently must be disabled if you use ipnat 212 * to translate the destination address to another local interface. 213 * 214 * XXX - ip_checkinterface must be disabled if you add IP aliases 215 * to the loopback interface instead of the interface where the 216 * packets for those addresses are received. 
217 */ 218int ip_checkinterface = 0; 219 220 221struct rttimer_queue *ip_mtudisc_timeout_q = NULL; 222 223int ipqmaxlen = IFQ_MAXLEN; 224u_long in_ifaddrhash; /* size of hash table - 1 */ 225int in_ifaddrentries; /* total number of addrs */ 226struct in_ifaddrhead in_ifaddrhead; 227struct in_ifaddrhashhead *in_ifaddrhashtbl; 228u_long in_multihash; /* size of hash table - 1 */ 229int in_multientries; /* total number of addrs */ 230struct in_multihashhead *in_multihashtbl; 231struct ifqueue ipintrq; 232struct ipstat ipstat; 233uint16_t ip_id; 234 235#ifdef PFIL_HOOKS 236struct pfil_head inet_pfil_hook; 237#endif 238 239/* 240 * Cached copy of nmbclusters. If nmbclusters is different, 241 * recalculate IP parameters derived from nmbclusters. 242 */ 243static int ip_nmbclusters; /* copy of nmbclusters */ 244static void ip_nmbclusters_changed(void); /* recalc limits */ 245 246#define CHECK_NMBCLUSTER_PARAMS() \ 247do { \ 248 if (__predict_false(ip_nmbclusters != nmbclusters)) \ 249 ip_nmbclusters_changed(); \ 250} while (/*CONSTCOND*/0) 251 252/* IP datagram reassembly queues (hashed): IPREASS_NHASH buckets; IPREASS_HASH(x,y) combines the low nibbles of the two low-order bytes of x, XORs in y and masks the result to a bucket index. */ 253#define IPREASS_NHASH_LOG2 6 254#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) 255#define IPREASS_HMASK (IPREASS_NHASH - 1) 256#define IPREASS_HASH(x,y) \ 257 (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) 258struct ipqhead ipq[IPREASS_NHASH]; 259int ipq_locked; /* set to 1 by ipq_lock_try(), cleared by ipq_unlock() */ 260static int ip_nfragpackets; /* packets in reass queue */ 261static int ip_nfrags; /* total fragments in reass queues */ 262 263int ip_maxfragpackets = 200; /* limit on packets. XXX sysctl */ 264int ip_maxfrags; /* limit on fragments. XXX sysctl */ 265 266 267/* 268 * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for 269 * IP reassembly queue buffer management. 270 * 271 * We keep a count of total IP fragments (NB: not fragmented packets!) 272 * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments. 
273 * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the 274 * total fragments in reassembly queues.This AIMD policy avoids 275 * repeatedly deleting single packets under heavy fragmentation load 276 * (e.g., from lossy NFS peers). 277 */ 278static u_int ip_reass_ttl_decr(u_int ticks); 279static void ip_reass_drophalf(void); 280 281 282static inline int ipq_lock_try(void); 283static inline void ipq_unlock(void); 284 285static inline int 286ipq_lock_try(void) 287{ 288 int s; 289 290 /* 291 * Use splvm() -- we're blocking things that would cause 292 * mbuf allocation. 293 */ 294 s = splvm(); 295 if (ipq_locked) { 296 splx(s); 297 return (0); 298 } 299 ipq_locked = 1; 300 splx(s); 301 return (1); 302} 303 304static inline void 305ipq_unlock(void) 306{ 307 int s; 308 309 s = splvm(); 310 ipq_locked = 0; 311 splx(s); 312} 313 314#ifdef DIAGNOSTIC 315#define IPQ_LOCK() \ 316do { \ 317 if (ipq_lock_try() == 0) { \ 318 printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \ 319 panic("ipq_lock"); \ 320 } \ 321} while (/*CONSTCOND*/ 0) 322#define IPQ_LOCK_CHECK() \ 323do { \ 324 if (ipq_locked == 0) { \ 325 printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \ 326 panic("ipq lock check"); \ 327 } \ 328} while (/*CONSTCOND*/ 0) 329#else 330#define IPQ_LOCK() (void) ipq_lock_try() 331#define IPQ_LOCK_CHECK() /* nothing */ 332#endif 333 334#define IPQ_UNLOCK() ipq_unlock() 335 336POOL_INIT(inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl", NULL); 337POOL_INIT(ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl", NULL); 338 339#ifdef INET_CSUM_COUNTERS 340#include <sys/device.h> 341 342struct evcnt ip_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 343 NULL, "inet", "hwcsum bad"); 344struct evcnt ip_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 345 NULL, "inet", "hwcsum ok"); 346struct evcnt ip_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 347 NULL, "inet", "swcsum"); 348 349#define INET_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ 350 
351EVCNT_ATTACH_STATIC(ip_hwcsum_bad); 352EVCNT_ATTACH_STATIC(ip_hwcsum_ok); 353EVCNT_ATTACH_STATIC(ip_swcsum); 354 355#else 356 357#define INET_CSUM_COUNTER_INCR(ev) /* nothing */ 358 359#endif /* INET_CSUM_COUNTERS */ 360 361/* 362 * We need to save the IP options in case a protocol wants to respond 363 * to an incoming packet over the same route if the packet got here 364 * using IP source routing. This allows connection establishment and 365 * maintenance when the remote end is on a network that is not known 366 * to us. 367 */ 368int ip_nhops = 0; 369static struct ip_srcrt { 370 struct in_addr dst; /* final destination */ 371 char nop; /* one NOP to align */ 372 char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ 373 struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; 374} ip_srcrt; 375 376static void save_rte(u_char *, struct in_addr); 377 378#ifdef MBUFTRACE 379struct mowner ip_rx_mowner = { "internet", "rx" }; 380struct mowner ip_tx_mowner = { "internet", "tx" }; 381#endif 382 383/* 384 * Compute IP limits derived from the value of nmbclusters. 385 */ 386static void 387ip_nmbclusters_changed(void) 388{ 389 ip_maxfrags = nmbclusters / 4; 390 ip_nmbclusters = nmbclusters; 391} 392 393/* 394 * IP initialization: fill in IP protocol switch table. 395 * All protocols not implemented in kernel go to raw IP protocol handler. 
396 */ 397void 398ip_init(void) 399{ 400 const struct protosw *pr; 401 int i; 402 403 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 404 if (pr == 0) 405 panic("ip_init"); 406 for (i = 0; i < IPPROTO_MAX; i++) 407 ip_protox[i] = pr - inetsw; 408 for (pr = inetdomain.dom_protosw; 409 pr < inetdomain.dom_protoswNPROTOSW; pr++) 410 if (pr->pr_domain->dom_family == PF_INET && 411 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) 412 ip_protox[pr->pr_protocol] = pr - inetsw; 413 414 for (i = 0; i < IPREASS_NHASH; i++) 415 LIST_INIT(&ipq[i]); 416 417 ip_id = time_second & 0xfffff; 418 419 ipintrq.ifq_maxlen = ipqmaxlen; 420 ip_nmbclusters_changed(); 421 422 TAILQ_INIT(&in_ifaddrhead); 423 in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR, 424 M_WAITOK, &in_ifaddrhash); 425 in_multihashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IPMADDR, 426 M_WAITOK, &in_multihash); 427 ip_mtudisc_timeout_q = rt_timer_queue_create(ip_mtudisc_timeout); 428#ifdef GATEWAY 429 ipflow_init(); 430#endif 431 432#ifdef PFIL_HOOKS 433 /* Register our Packet Filter hook. */ 434 inet_pfil_hook.ph_type = PFIL_TYPE_AF; 435 inet_pfil_hook.ph_af = AF_INET; 436 i = pfil_head_register(&inet_pfil_hook); 437 if (i != 0) 438 printf("ip_init: WARNING: unable to register pfil hook, " 439 "error %d\n", i); 440#endif /* PFIL_HOOKS */ 441 442#ifdef MBUFTRACE 443 MOWNER_ATTACH(&ip_tx_mowner); 444 MOWNER_ATTACH(&ip_rx_mowner); 445#endif /* MBUFTRACE */ 446} 447 448struct sockaddr_in ipaddr = { 449 .sin_len = sizeof(ipaddr), 450 .sin_family = AF_INET, 451}; 452struct route ipforward_rt; 453 454/* 455 * IP software interrupt routine 456 */ 457void 458ipintr(void) 459{ 460 int s; 461 struct mbuf *m; 462 463 while (1) { 464 s = splnet(); 465 IF_DEQUEUE(&ipintrq, m); 466 splx(s); 467 if (m == 0) 468 return; 469 MCLAIM(m, &ip_rx_mowner); 470 ip_input(m); 471 } 472} 473 474/* 475 * Ip input routine. Checksum and byte swap header. If fragmented 476 * try to reassemble. Process options. 
Pass to next level. 477 */ 478void 479ip_input(struct mbuf *m) 480{ 481 struct ip *ip = NULL; 482 struct ipq *fp; 483 struct in_ifaddr *ia; 484 struct ifaddr *ifa; 485 struct ipqent *ipqe; 486 int hlen = 0, mff, len; 487 int downmatch; 488 int checkif; 489 int srcrt = 0; 490 u_int hash; 491#ifdef FAST_IPSEC 492 struct m_tag *mtag; 493 struct tdb_ident *tdbi; 494 struct secpolicy *sp; 495 int s, error; 496#endif /* FAST_IPSEC */ 497 498 MCLAIM(m, &ip_rx_mowner); 499#ifdef DIAGNOSTIC 500 if ((m->m_flags & M_PKTHDR) == 0) 501 panic("ipintr no HDR"); 502#endif 503 504 /* 505 * If no IP addresses have been set yet but the interfaces 506 * are receiving, can't do anything with incoming packets yet. 507 */ 508 if (TAILQ_FIRST(&in_ifaddrhead) == 0) 509 goto bad; 510 ipstat.ips_total++; 511 /* 512 * If the IP header is not aligned, slurp it up into a new 513 * mbuf with space for link headers, in the event we forward 514 * it. Otherwise, if it is aligned, make sure the entire 515 * base IP header is in the first mbuf of the chain. 516 */ 517 if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { 518 if ((m = m_copyup(m, sizeof(struct ip), 519 (max_linkhdr + 3) & ~3)) == NULL) { 520 /* XXXJRT new stat, please */ 521 ipstat.ips_toosmall++; 522 return; 523 } 524 } else if (__predict_false(m->m_len < sizeof (struct ip))) { 525 if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { 526 ipstat.ips_toosmall++; 527 return; 528 } 529 } 530 ip = mtod(m, struct ip *); 531 if (ip->ip_v != IPVERSION) { 532 ipstat.ips_badvers++; 533 goto bad; 534 } 535 hlen = ip->ip_hl << 2; 536 if (hlen < sizeof(struct ip)) { /* minimum header length */ 537 ipstat.ips_badhlen++; 538 goto bad; 539 } 540 if (hlen > m->m_len) { 541 if ((m = m_pullup(m, hlen)) == 0) { 542 ipstat.ips_badhlen++; 543 return; 544 } 545 ip = mtod(m, struct ip *); 546 } 547 548 /* 549 * RFC1122: packets with a multicast source address are 550 * not allowed. 
551 */ 552 if (IN_MULTICAST(ip->ip_src.s_addr)) { 553 ipstat.ips_badaddr++; 554 goto bad; 555 } 556 557 /* 127/8 must not appear on wire - RFC1122 */ 558 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 559 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 560 if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { 561 ipstat.ips_badaddr++; 562 goto bad; 563 } 564 } 565 566 switch (m->m_pkthdr.csum_flags & 567 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_IPv4) | 568 M_CSUM_IPv4_BAD)) { 569 case M_CSUM_IPv4|M_CSUM_IPv4_BAD: 570 INET_CSUM_COUNTER_INCR(&ip_hwcsum_bad); 571 goto badcsum; 572 573 case M_CSUM_IPv4: 574 /* Checksum was okay. */ 575 INET_CSUM_COUNTER_INCR(&ip_hwcsum_ok); 576 break; 577 578 default: 579 /* 580 * Must compute it ourselves. Maybe skip checksum on 581 * loopback interfaces. 582 */ 583 if (__predict_true(!(m->m_pkthdr.rcvif->if_flags & 584 IFF_LOOPBACK) || ip_do_loopback_cksum)) { 585 INET_CSUM_COUNTER_INCR(&ip_swcsum); 586 if (in_cksum(m, hlen) != 0) 587 goto badcsum; 588 } 589 break; 590 } 591 592 /* Retrieve the packet length. */ 593 len = ntohs(ip->ip_len); 594 595 /* 596 * Check for additional length bogosity 597 */ 598 if (len < hlen) { 599 ipstat.ips_badlen++; 600 goto bad; 601 } 602 603 /* 604 * Check that the amount of data in the buffers 605 * is as at least much as the IP header would have us expect. 606 * Trim mbufs if longer than we expect. 607 * Drop packet if shorter than we expect. 608 */ 609 if (m->m_pkthdr.len < len) { 610 ipstat.ips_tooshort++; 611 goto bad; 612 } 613 if (m->m_pkthdr.len > len) { 614 if (m->m_len == m->m_pkthdr.len) { 615 m->m_len = len; 616 m->m_pkthdr.len = len; 617 } else 618 m_adj(m, len - m->m_pkthdr.len); 619 } 620 621#if defined(IPSEC) 622 /* ipflow (IP fast forwarding) is not compatible with IPsec. */ 623 m->m_flags &= ~M_CANFASTFWD; 624#else 625 /* 626 * Assume that we can create a fast-forward IP flow entry 627 * based on this packet. 
628 */ 629 m->m_flags |= M_CANFASTFWD; 630#endif 631 632#ifdef PFIL_HOOKS 633 /* 634 * Run through list of hooks for input packets. If there are any 635 * filters which require that additional packets in the flow are 636 * not fast-forwarded, they must clear the M_CANFASTFWD flag. 637 * Note that filters must _never_ set this flag, as another filter 638 * in the list may have previously cleared it. 639 */ 640 /* 641 * let ipfilter look at packet on the wire, 642 * not the decapsulated packet. 643 */ 644#ifdef IPSEC 645 if (!ipsec_getnhist(m)) 646#elif defined(FAST_IPSEC) 647 if (!ipsec_indone(m)) 648#else 649 if (1) 650#endif 651 { 652 struct in_addr odst; 653 654 odst = ip->ip_dst; 655 if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, 656 PFIL_IN) != 0) 657 return; 658 if (m == NULL) 659 return; 660 ip = mtod(m, struct ip *); 661 hlen = ip->ip_hl << 2; 662 /* 663 * XXX The setting of "srcrt" here is to prevent ip_forward() 664 * from generating ICMP redirects for packets that have 665 * been redirected by a hook back out on to the same LAN that 666 * they came from and is not an indication that the packet 667 * is being inffluenced by source routing options. This 668 * allows things like 669 * "rdr tlp0 0/0 port 80 -> 1.1.1.200 3128 tcp" 670 * where tlp0 is both on the 1.1.1.0/24 network and is the 671 * default route for hosts on 1.1.1.0/24. Of course this 672 * also requires a "map tlp0 ..." to complete the story. 673 * One might argue whether or not this kind of network config. 674 * should be supported in this manner... 675 */ 676 srcrt = (odst.s_addr != ip->ip_dst.s_addr); 677 } 678#endif /* PFIL_HOOKS */ 679 680#ifdef ALTQ 681 /* XXX Temporary until ALTQ is changed to use a pfil hook */ 682 if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) { 683 /* packet dropped by traffic conditioner */ 684 return; 685 } 686#endif 687 688 /* 689 * Process options and, if not destined for us, 690 * ship it on. 
ip_dooptions returns 1 when an 691 * error was detected (causing an icmp message 692 * to be sent and the original packet to be freed). 693 */ 694 ip_nhops = 0; /* for source routed packets */ 695 if (hlen > sizeof (struct ip) && ip_dooptions(m)) 696 return; 697 698 /* 699 * Enable a consistency check between the destination address 700 * and the arrival interface for a unicast packet (the RFC 1122 701 * strong ES model) if IP forwarding is disabled and the packet 702 * is not locally generated. 703 * 704 * XXX - Checking also should be disabled if the destination 705 * address is ipnat'ed to a different interface. 706 * 707 * XXX - Checking is incompatible with IP aliases added 708 * to the loopback interface instead of the interface where 709 * the packets are received. 710 * 711 * XXX - We need to add a per ifaddr flag for this so that 712 * we get finer grain control. 713 */ 714 checkif = ip_checkinterface && (ipforwarding == 0) && 715 (m->m_pkthdr.rcvif != NULL) && 716 ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0); 717 718 /* 719 * Check our list of addresses, to see if the packet is for us. 720 * 721 * Traditional 4.4BSD did not consult IFF_UP at all. 722 * The behavior here is to treat addresses on !IFF_UP interface 723 * as not mine. 
724 */ 725 downmatch = 0; 726 LIST_FOREACH(ia, &IN_IFADDR_HASH(ip->ip_dst.s_addr), ia_hash) { 727 if (in_hosteq(ia->ia_addr.sin_addr, ip->ip_dst)) { 728 if (checkif && ia->ia_ifp != m->m_pkthdr.rcvif) 729 continue; 730 if ((ia->ia_ifp->if_flags & IFF_UP) != 0) 731 break; 732 else 733 downmatch++; 734 } 735 } 736 if (ia != NULL) 737 goto ours; 738 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { 739 IFADDR_FOREACH(ifa, m->m_pkthdr.rcvif) { 740 if (ifa->ifa_addr->sa_family != AF_INET) 741 continue; 742 ia = ifatoia(ifa); 743 if (in_hosteq(ip->ip_dst, ia->ia_broadaddr.sin_addr) || 744 in_hosteq(ip->ip_dst, ia->ia_netbroadcast) || 745 /* 746 * Look for all-0's host part (old broadcast addr), 747 * either for subnet or net. 748 */ 749 ip->ip_dst.s_addr == ia->ia_subnet || 750 ip->ip_dst.s_addr == ia->ia_net) 751 goto ours; 752 /* 753 * An interface with IP address zero accepts 754 * all packets that arrive on that interface. 755 */ 756 if (in_nullhost(ia->ia_addr.sin_addr)) 757 goto ours; 758 } 759 } 760 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 761 struct in_multi *inm; 762#ifdef MROUTING 763 extern struct socket *ip_mrouter; 764 765 if (ip_mrouter) { 766 /* 767 * If we are acting as a multicast router, all 768 * incoming multicast packets are passed to the 769 * kernel-level multicast forwarding function. 770 * The packet is returned (relatively) intact; if 771 * ip_mforward() returns a non-zero value, the packet 772 * must be discarded, else it may be accepted below. 773 * 774 * (The IP ident field is put in the same byte order 775 * as expected when ip_mforward() is called from 776 * ip_output().) 777 */ 778 if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) { 779 ipstat.ips_cantforward++; 780 m_freem(m); 781 return; 782 } 783 784 /* 785 * The process-level routing demon needs to receive 786 * all multicast IGMP packets, whether or not this 787 * host belongs to their destination groups. 
788 */ 789 if (ip->ip_p == IPPROTO_IGMP) 790 goto ours; 791 ipstat.ips_forward++; 792 } 793#endif 794 /* 795 * See if we belong to the destination multicast group on the 796 * arrival interface. 797 */ 798 IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); 799 if (inm == NULL) { 800 ipstat.ips_cantforward++; 801 m_freem(m); 802 return; 803 } 804 goto ours; 805 } 806 if (ip->ip_dst.s_addr == INADDR_BROADCAST || 807 in_nullhost(ip->ip_dst)) 808 goto ours; 809 810 /* 811 * Not for us; forward if possible and desirable. 812 */ 813 if (ipforwarding == 0) { 814 ipstat.ips_cantforward++; 815 m_freem(m); 816 } else { 817 /* 818 * If ip_dst matched any of my address on !IFF_UP interface, 819 * and there's no IFF_UP interface that matches ip_dst, 820 * send icmp unreach. Forwarding it will result in in-kernel 821 * forwarding loop till TTL goes to 0. 822 */ 823 if (downmatch) { 824 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); 825 ipstat.ips_cantforward++; 826 return; 827 } 828#ifdef IPSEC 829 if (ipsec4_in_reject(m, NULL)) { 830 ipsecstat.in_polvio++; 831 goto bad; 832 } 833#endif 834#ifdef FAST_IPSEC 835 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 836 s = splsoftnet(); 837 if (mtag != NULL) { 838 tdbi = (struct tdb_ident *)(mtag + 1); 839 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); 840 } else { 841 sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, 842 IP_FORWARDING, &error); 843 } 844 if (sp == NULL) { /* NB: can happen if error */ 845 splx(s); 846 /*XXX error stat???*/ 847 DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/ 848 goto bad; 849 } 850 851 /* 852 * Check security policy against packet attributes. 853 */ 854 error = ipsec_in_reject(sp, m); 855 KEY_FREESP(&sp); 856 splx(s); 857 if (error) { 858 ipstat.ips_cantforward++; 859 goto bad; 860 } 861 862 /* 863 * Peek at the outbound SP for this packet to determine if 864 * it's a Fast Forward candidate. 
865 */ 866 mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); 867 if (mtag != NULL) 868 m->m_flags &= ~M_CANFASTFWD; 869 else { 870 s = splsoftnet(); 871 sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, 872 (IP_FORWARDING | 873 (ip_directedbcast ? IP_ALLOWBROADCAST : 0)), 874 &error, NULL); 875 if (sp != NULL) { 876 m->m_flags &= ~M_CANFASTFWD; 877 KEY_FREESP(&sp); 878 } 879 splx(s); 880 } 881#endif /* FAST_IPSEC */ 882 883 ip_forward(m, srcrt); 884 } 885 return; 886 887ours: 888 /* 889 * If offset or IP_MF are set, must reassemble. 890 * Otherwise, nothing need be done. 891 * (We could look in the reassembly queue to see 892 * if the packet was previously fragmented, 893 * but it's not worth the time; just let them time out.) 894 */ 895 if (ip->ip_off & ~htons(IP_DF|IP_RF)) { 896 897 /* 898 * Look for queue of fragments 899 * of this datagram. 900 */ 901 IPQ_LOCK(); 902 hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); 903 /* XXX LIST_FOREACH(fp, &ipq[hash], ipq_q) */ 904 for (fp = LIST_FIRST(&ipq[hash]); fp != NULL; 905 fp = LIST_NEXT(fp, ipq_q)) { 906 if (ip->ip_id == fp->ipq_id && 907 in_hosteq(ip->ip_src, fp->ipq_src) && 908 in_hosteq(ip->ip_dst, fp->ipq_dst) && 909 ip->ip_p == fp->ipq_p) 910 goto found; 911 912 } 913 fp = 0; 914found: 915 916 /* 917 * Adjust ip_len to not reflect header, 918 * set ipqe_mff if more fragments are expected, 919 * convert offset of this to bytes. 920 */ 921 ip->ip_len = htons(ntohs(ip->ip_len) - hlen); 922 mff = (ip->ip_off & htons(IP_MF)) != 0; 923 if (mff) { 924 /* 925 * Make sure that fragments have a data length 926 * that's a non-zero multiple of 8 bytes. 927 */ 928 if (ntohs(ip->ip_len) == 0 || 929 (ntohs(ip->ip_len) & 0x7) != 0) { 930 ipstat.ips_badfrags++; 931 IPQ_UNLOCK(); 932 goto bad; 933 } 934 } 935 ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3); 936 937 /* 938 * If datagram marked as having more fragments 939 * or if this is not the first fragment, 940 * attempt reassembly; if it succeeds, proceed. 
941 */ 942 if (mff || ip->ip_off != htons(0)) { 943 ipstat.ips_fragments++; 944 ipqe = pool_get(&ipqent_pool, PR_NOWAIT); 945 if (ipqe == NULL) { 946 ipstat.ips_rcvmemdrop++; 947 IPQ_UNLOCK(); 948 goto bad; 949 } 950 ipqe->ipqe_mff = mff; 951 ipqe->ipqe_m = m; 952 ipqe->ipqe_ip = ip; 953 m = ip_reass(ipqe, fp, &ipq[hash]); 954 if (m == 0) { 955 IPQ_UNLOCK(); 956 return; 957 } 958 ipstat.ips_reassembled++; 959 ip = mtod(m, struct ip *); 960 hlen = ip->ip_hl << 2; 961 ip->ip_len = htons(ntohs(ip->ip_len) + hlen); 962 } else 963 if (fp) 964 ip_freef(fp); 965 IPQ_UNLOCK(); 966 } 967 968#if defined(IPSEC) 969 /* 970 * enforce IPsec policy checking if we are seeing last header. 971 * note that we do not visit this with protocols with pcb layer 972 * code - like udp/tcp/raw ip. 973 */ 974 if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 && 975 ipsec4_in_reject(m, NULL)) { 976 ipsecstat.in_polvio++; 977 goto bad; 978 } 979#endif 980#ifdef FAST_IPSEC 981 /* 982 * enforce IPsec policy checking if we are seeing last header. 983 * note that we do not visit this with protocols with pcb layer 984 * code - like udp/tcp/raw ip. 985 */ 986 if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) { 987 /* 988 * Check if the packet has already had IPsec processing 989 * done. If so, then just pass it along. This tag gets 990 * set during AH, ESP, etc. input handling, before the 991 * packet is returned to the ip input queue for delivery. 992 */ 993 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 994 s = splsoftnet(); 995 if (mtag != NULL) { 996 tdbi = (struct tdb_ident *)(mtag + 1); 997 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); 998 } else { 999 sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, 1000 IP_FORWARDING, &error); 1001 } 1002 if (sp != NULL) { 1003 /* 1004 * Check security policy against packet attributes. 1005 */ 1006 error = ipsec_in_reject(sp, m); 1007 KEY_FREESP(&sp); 1008 } else { 1009 /* XXX error stat??? 
*/ 1010 error = EINVAL; 1011DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/ 1012 goto bad; 1013 } 1014 splx(s); 1015 if (error) 1016 goto bad; 1017 } 1018#endif /* FAST_IPSEC */ 1019 1020 /* 1021 * Switch out to protocol's input routine. 1022 */ 1023#if IFA_STATS 1024 if (ia && ip) 1025 ia->ia_ifa.ifa_data.ifad_inbytes += ntohs(ip->ip_len); 1026#endif 1027 ipstat.ips_delivered++; 1028 { 1029 int off = hlen, nh = ip->ip_p; 1030 1031 (*inetsw[ip_protox[nh]].pr_input)(m, off, nh); 1032 return; 1033 } 1034bad: 1035 m_freem(m); 1036 return; 1037 1038badcsum: 1039 ipstat.ips_badsum++; 1040 m_freem(m); 1041} 1042 1043/* 1044 * Take incoming datagram fragment and try to 1045 * reassemble it into whole datagram. If a chain for 1046 * reassembly of this datagram already exists, then it 1047 * is given as fp; otherwise have to make a chain. 1048 */ 1049struct mbuf * 1050ip_reass(struct ipqent *ipqe, struct ipq *fp, struct ipqhead *ipqhead) 1051{ 1052 struct mbuf *m = ipqe->ipqe_m; 1053 struct ipqent *nq, *p, *q; 1054 struct ip *ip; 1055 struct mbuf *t; 1056 int hlen = ipqe->ipqe_ip->ip_hl << 2; 1057 int i, next; 1058 1059 IPQ_LOCK_CHECK(); 1060 1061 /* 1062 * Presence of header sizes in mbufs 1063 * would confuse code below. 1064 */ 1065 m->m_data += hlen; 1066 m->m_len -= hlen; 1067 1068#ifdef notyet 1069 /* make sure fragment limit is up-to-date */ 1070 CHECK_NMBCLUSTER_PARAMS(); 1071 1072 /* If we have too many fragments, drop the older half. */ 1073 if (ip_nfrags >= ip_maxfrags) 1074 ip_reass_drophalf(void); 1075#endif 1076 1077 /* 1078 * We are about to add a fragment; increment frag count. 1079 */ 1080 ip_nfrags++; 1081 1082 /* 1083 * If first fragment to arrive, create a reassembly queue. 1084 */ 1085 if (fp == 0) { 1086 /* 1087 * Enforce upper bound on number of fragmented packets 1088 * for which we attempt reassembly; 1089 * If maxfrag is 0, never accept fragments. 1090 * If maxfrag is -1, accept all fragments without limitation. 
1091 */ 1092 if (ip_maxfragpackets < 0) 1093 ; 1094 else if (ip_nfragpackets >= ip_maxfragpackets) 1095 goto dropfrag; 1096 ip_nfragpackets++; 1097 MALLOC(fp, struct ipq *, sizeof (struct ipq), 1098 M_FTABLE, M_NOWAIT); 1099 if (fp == NULL) 1100 goto dropfrag; 1101 LIST_INSERT_HEAD(ipqhead, fp, ipq_q); 1102 fp->ipq_nfrags = 1; 1103 fp->ipq_ttl = IPFRAGTTL; 1104 fp->ipq_p = ipqe->ipqe_ip->ip_p; 1105 fp->ipq_id = ipqe->ipqe_ip->ip_id; 1106 TAILQ_INIT(&fp->ipq_fragq); 1107 fp->ipq_src = ipqe->ipqe_ip->ip_src; 1108 fp->ipq_dst = ipqe->ipqe_ip->ip_dst; 1109 p = NULL; 1110 goto insert; 1111 } else { 1112 fp->ipq_nfrags++; 1113 } 1114 1115 /* 1116 * Find a segment which begins after this one does. 1117 */ 1118 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; 1119 p = q, q = TAILQ_NEXT(q, ipqe_q)) 1120 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) 1121 break; 1122 1123 /* 1124 * If there is a preceding segment, it may provide some of 1125 * our data already. If so, drop the data from the incoming 1126 * segment. If it provides all of our data, drop us. 1127 */ 1128 if (p != NULL) { 1129 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - 1130 ntohs(ipqe->ipqe_ip->ip_off); 1131 if (i > 0) { 1132 if (i >= ntohs(ipqe->ipqe_ip->ip_len)) 1133 goto dropfrag; 1134 m_adj(ipqe->ipqe_m, i); 1135 ipqe->ipqe_ip->ip_off = 1136 htons(ntohs(ipqe->ipqe_ip->ip_off) + i); 1137 ipqe->ipqe_ip->ip_len = 1138 htons(ntohs(ipqe->ipqe_ip->ip_len) - i); 1139 } 1140 } 1141 1142 /* 1143 * While we overlap succeeding segments trim them or, 1144 * if they are completely covered, dequeue them. 
1145 */ 1146 for (; q != NULL && 1147 ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > 1148 ntohs(q->ipqe_ip->ip_off); q = nq) { 1149 i = (ntohs(ipqe->ipqe_ip->ip_off) + 1150 ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); 1151 if (i < ntohs(q->ipqe_ip->ip_len)) { 1152 q->ipqe_ip->ip_len = 1153 htons(ntohs(q->ipqe_ip->ip_len) - i); 1154 q->ipqe_ip->ip_off = 1155 htons(ntohs(q->ipqe_ip->ip_off) + i); 1156 m_adj(q->ipqe_m, i); 1157 break; 1158 } 1159 nq = TAILQ_NEXT(q, ipqe_q); 1160 m_freem(q->ipqe_m); 1161 TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); 1162 pool_put(&ipqent_pool, q); 1163 fp->ipq_nfrags--; 1164 ip_nfrags--; 1165 } 1166 1167insert: 1168 /* 1169 * Stick new segment in its place; 1170 * check for complete reassembly. 1171 */ 1172 if (p == NULL) { 1173 TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); 1174 } else { 1175 TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q); 1176 } 1177 next = 0; 1178 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; 1179 p = q, q = TAILQ_NEXT(q, ipqe_q)) { 1180 if (ntohs(q->ipqe_ip->ip_off) != next) 1181 return (0); 1182 next += ntohs(q->ipqe_ip->ip_len); 1183 } 1184 if (p->ipqe_mff) 1185 return (0); 1186 1187 /* 1188 * Reassembly is complete. Check for a bogus message size and 1189 * concatenate fragments. 1190 */ 1191 q = TAILQ_FIRST(&fp->ipq_fragq); 1192 ip = q->ipqe_ip; 1193 if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { 1194 ipstat.ips_toolong++; 1195 ip_freef(fp); 1196 return (0); 1197 } 1198 m = q->ipqe_m; 1199 t = m->m_next; 1200 m->m_next = 0; 1201 m_cat(m, t); 1202 nq = TAILQ_NEXT(q, ipqe_q); 1203 pool_put(&ipqent_pool, q); 1204 for (q = nq; q != NULL; q = nq) { 1205 t = q->ipqe_m; 1206 nq = TAILQ_NEXT(q, ipqe_q); 1207 pool_put(&ipqent_pool, q); 1208 m_cat(m, t); 1209 } 1210 ip_nfrags -= fp->ipq_nfrags; 1211 1212 /* 1213 * Create header for new ip packet by 1214 * modifying header of first packet; 1215 * dequeue and discard fragment reassembly header. 1216 * Make header visible. 
1217 */ 1218 ip->ip_len = htons(next); 1219 ip->ip_src = fp->ipq_src; 1220 ip->ip_dst = fp->ipq_dst; 1221 LIST_REMOVE(fp, ipq_q); 1222 FREE(fp, M_FTABLE); 1223 ip_nfragpackets--; 1224 m->m_len += (ip->ip_hl << 2); 1225 m->m_data -= (ip->ip_hl << 2); 1226 /* some debugging cruft by sklower, below, will go away soon */ 1227 if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ 1228 int plen = 0; 1229 for (t = m; t; t = t->m_next) 1230 plen += t->m_len; 1231 m->m_pkthdr.len = plen; 1232 m->m_pkthdr.csum_flags = 0; 1233 } 1234 return (m); 1235 1236dropfrag: 1237 if (fp != 0) 1238 fp->ipq_nfrags--; 1239 ip_nfrags--; 1240 ipstat.ips_fragdropped++; 1241 m_freem(m); 1242 pool_put(&ipqent_pool, ipqe); 1243 return (0); 1244} 1245 1246/* 1247 * Free a fragment reassembly header and all 1248 * associated datagrams. 1249 */ 1250void 1251ip_freef(struct ipq *fp) 1252{ 1253 struct ipqent *q, *p; 1254 u_int nfrags = 0; 1255 1256 IPQ_LOCK_CHECK(); 1257 1258 for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) { 1259 p = TAILQ_NEXT(q, ipqe_q); 1260 m_freem(q->ipqe_m); 1261 nfrags++; 1262 TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); 1263 pool_put(&ipqent_pool, q); 1264 } 1265 1266 if (nfrags != fp->ipq_nfrags) 1267 printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags); 1268 ip_nfrags -= nfrags; 1269 LIST_REMOVE(fp, ipq_q); 1270 FREE(fp, M_FTABLE); 1271 ip_nfragpackets--; 1272} 1273 1274/* 1275 * IP reassembly TTL machinery for multiplicative drop. 1276 */ 1277static u_int fragttl_histo[(IPFRAGTTL+1)]; 1278 1279 1280/* 1281 * Decrement TTL of all reasembly queue entries by `ticks'. 1282 * Count number of distinct fragments (as opposed to partial, fragmented 1283 * datagrams) in the reassembly queue. While we traverse the entire 1284 * reassembly queue, compute and return the median TTL over all fragments. 
1285 */ 1286static u_int 1287ip_reass_ttl_decr(u_int ticks) 1288{ 1289 u_int nfrags, median, dropfraction, keepfraction; 1290 struct ipq *fp, *nfp; 1291 int i; 1292 1293 nfrags = 0; 1294 memset(fragttl_histo, 0, sizeof fragttl_histo); 1295 1296 for (i = 0; i < IPREASS_NHASH; i++) { 1297 for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) { 1298 fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ? 1299 0 : fp->ipq_ttl - ticks); 1300 nfp = LIST_NEXT(fp, ipq_q); 1301 if (fp->ipq_ttl == 0) { 1302 ipstat.ips_fragtimeout++; 1303 ip_freef(fp); 1304 } else { 1305 nfrags += fp->ipq_nfrags; 1306 fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags; 1307 } 1308 } 1309 } 1310 1311 KASSERT(ip_nfrags == nfrags); 1312 1313 /* Find median (or other drop fraction) in histogram. */ 1314 dropfraction = (ip_nfrags / 2); 1315 keepfraction = ip_nfrags - dropfraction; 1316 for (i = IPFRAGTTL, median = 0; i >= 0; i--) { 1317 median += fragttl_histo[i]; 1318 if (median >= keepfraction) 1319 break; 1320 } 1321 1322 /* Return TTL of median (or other fraction). */ 1323 return (u_int)i; 1324} 1325 1326void 1327ip_reass_drophalf(void) 1328{ 1329 1330 u_int median_ticks; 1331 /* 1332 * Compute median TTL of all fragments, and count frags 1333 * with that TTL or lower (roughly half of all fragments). 1334 */ 1335 median_ticks = ip_reass_ttl_decr(0); 1336 1337 /* Drop half. */ 1338 median_ticks = ip_reass_ttl_decr(median_ticks); 1339 1340} 1341 1342/* 1343 * IP timer processing; 1344 * if a timer expires on a reassembly 1345 * queue, discard it. 1346 */ 1347void 1348ip_slowtimo(void) 1349{ 1350 static u_int dropscanidx = 0; 1351 u_int i; 1352 u_int median_ttl; 1353 int s = splsoftnet(); 1354 1355 IPQ_LOCK(); 1356 1357 /* Age TTL of all fragments by 1 tick .*/ 1358 median_ttl = ip_reass_ttl_decr(1); 1359 1360 /* make sure fragment limit is up-to-date */ 1361 CHECK_NMBCLUSTER_PARAMS(); 1362 1363 /* If we have too many fragments, drop the older half. 
*/ 1364 if (ip_nfrags > ip_maxfrags) 1365 ip_reass_ttl_decr(median_ttl); 1366 1367 /* 1368 * If we are over the maximum number of fragmented packets 1369 * (due to the limit being lowered), drain off 1370 * enough to get down to the new limit. Start draining 1371 * from the reassembly hashqueue most recently drained. 1372 */ 1373 if (ip_maxfragpackets < 0) 1374 ; 1375 else { 1376 int wrapped = 0; 1377 1378 i = dropscanidx; 1379 while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) { 1380 while (LIST_FIRST(&ipq[i]) != NULL) 1381 ip_freef(LIST_FIRST(&ipq[i])); 1382 if (++i >= IPREASS_NHASH) { 1383 i = 0; 1384 } 1385 /* 1386 * Dont scan forever even if fragment counters are 1387 * wrong: stop after scanning entire reassembly queue. 1388 */ 1389 if (i == dropscanidx) 1390 wrapped = 1; 1391 } 1392 dropscanidx = i; 1393 } 1394 IPQ_UNLOCK(); 1395#ifdef GATEWAY 1396 ipflow_slowtimo(); 1397#endif 1398 splx(s); 1399} 1400 1401/* 1402 * Drain off all datagram fragments. 1403 */ 1404void 1405ip_drain(void) 1406{ 1407 1408 /* 1409 * We may be called from a device's interrupt context. If 1410 * the ipq is already busy, just bail out now. 1411 */ 1412 if (ipq_lock_try() == 0) 1413 return; 1414 1415 /* 1416 * Drop half the total fragments now. If more mbufs are needed, 1417 * we will be called again soon. 1418 */ 1419 ip_reass_drophalf(); 1420 1421 IPQ_UNLOCK(); 1422} 1423 1424/* 1425 * Do option processing on a datagram, 1426 * possibly discarding it if bad options are encountered, 1427 * or forwarding it if source-routed. 1428 * Returns 1 if packet has been forwarded/freed, 1429 * 0 if the packet should be processed further. 
1430 */ 1431int 1432ip_dooptions(struct mbuf *m) 1433{ 1434 struct ip *ip = mtod(m, struct ip *); 1435 u_char *cp, *cp0; 1436 struct ip_timestamp *ipt; 1437 struct in_ifaddr *ia; 1438 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; 1439 struct in_addr dst; 1440 n_time ntime; 1441 1442 dst = ip->ip_dst; 1443 cp = (u_char *)(ip + 1); 1444 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 1445 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1446 opt = cp[IPOPT_OPTVAL]; 1447 if (opt == IPOPT_EOL) 1448 break; 1449 if (opt == IPOPT_NOP) 1450 optlen = 1; 1451 else { 1452 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 1453 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1454 goto bad; 1455 } 1456 optlen = cp[IPOPT_OLEN]; 1457 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { 1458 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1459 goto bad; 1460 } 1461 } 1462 switch (opt) { 1463 1464 default: 1465 break; 1466 1467 /* 1468 * Source routing with record. 1469 * Find interface with current destination address. 1470 * If none on this machine then drop if strictly routed, 1471 * or do nothing if loosely routed. 1472 * Record interface address and bring up next address 1473 * component. If strictly routed make sure next 1474 * address is on directly accessible net. 1475 */ 1476 case IPOPT_LSRR: 1477 case IPOPT_SSRR: 1478 if (ip_allowsrcrt == 0) { 1479 type = ICMP_UNREACH; 1480 code = ICMP_UNREACH_NET_PROHIB; 1481 goto bad; 1482 } 1483 if (optlen < IPOPT_OFFSET + sizeof(*cp)) { 1484 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1485 goto bad; 1486 } 1487 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1488 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1489 goto bad; 1490 } 1491 ipaddr.sin_addr = ip->ip_dst; 1492 ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr))); 1493 if (ia == 0) { 1494 if (opt == IPOPT_SSRR) { 1495 type = ICMP_UNREACH; 1496 code = ICMP_UNREACH_SRCFAIL; 1497 goto bad; 1498 } 1499 /* 1500 * Loose routing, and not at next destination 1501 * yet; nothing to do except forward. 
1502 */ 1503 break; 1504 } 1505 off--; /* 0 origin */ 1506 if ((off + sizeof(struct in_addr)) > optlen) { 1507 /* 1508 * End of source route. Should be for us. 1509 */ 1510 save_rte(cp, ip->ip_src); 1511 break; 1512 } 1513 /* 1514 * locate outgoing interface 1515 */ 1516 bcopy((caddr_t)(cp + off), (caddr_t)&ipaddr.sin_addr, 1517 sizeof(ipaddr.sin_addr)); 1518 if (opt == IPOPT_SSRR) 1519 ia = ifatoia(ifa_ifwithladdr(sintosa(&ipaddr))); 1520 else 1521 ia = ip_rtaddr(ipaddr.sin_addr); 1522 if (ia == 0) { 1523 type = ICMP_UNREACH; 1524 code = ICMP_UNREACH_SRCFAIL; 1525 goto bad; 1526 } 1527 ip->ip_dst = ipaddr.sin_addr; 1528 bcopy((caddr_t)&ia->ia_addr.sin_addr, 1529 (caddr_t)(cp + off), sizeof(struct in_addr)); 1530 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1531 /* 1532 * Let ip_intr's mcast routing check handle mcast pkts 1533 */ 1534 forward = !IN_MULTICAST(ip->ip_dst.s_addr); 1535 break; 1536 1537 case IPOPT_RR: 1538 if (optlen < IPOPT_OFFSET + sizeof(*cp)) { 1539 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1540 goto bad; 1541 } 1542 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1543 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1544 goto bad; 1545 } 1546 /* 1547 * If no space remains, ignore. 1548 */ 1549 off--; /* 0 origin */ 1550 if ((off + sizeof(struct in_addr)) > optlen) 1551 break; 1552 bcopy((caddr_t)(&ip->ip_dst), (caddr_t)&ipaddr.sin_addr, 1553 sizeof(ipaddr.sin_addr)); 1554 /* 1555 * locate outgoing interface; if we're the destination, 1556 * use the incoming interface (should be same). 
1557 */ 1558 if ((ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)))) 1559 == NULL && 1560 (ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) { 1561 type = ICMP_UNREACH; 1562 code = ICMP_UNREACH_HOST; 1563 goto bad; 1564 } 1565 bcopy((caddr_t)&ia->ia_addr.sin_addr, 1566 (caddr_t)(cp + off), sizeof(struct in_addr)); 1567 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1568 break; 1569 1570 case IPOPT_TS: 1571 code = cp - (u_char *)ip; 1572 ipt = (struct ip_timestamp *)cp; 1573 if (ipt->ipt_len < 4 || ipt->ipt_len > 40) { 1574 code = (u_char *)&ipt->ipt_len - (u_char *)ip; 1575 goto bad; 1576 } 1577 if (ipt->ipt_ptr < 5) { 1578 code = (u_char *)&ipt->ipt_ptr - (u_char *)ip; 1579 goto bad; 1580 } 1581 if (ipt->ipt_ptr > ipt->ipt_len - sizeof (int32_t)) { 1582 if (++ipt->ipt_oflw == 0) { 1583 code = (u_char *)&ipt->ipt_ptr - 1584 (u_char *)ip; 1585 goto bad; 1586 } 1587 break; 1588 } 1589 cp0 = (cp + ipt->ipt_ptr - 1); 1590 switch (ipt->ipt_flg) { 1591 1592 case IPOPT_TS_TSONLY: 1593 break; 1594 1595 case IPOPT_TS_TSANDADDR: 1596 if (ipt->ipt_ptr - 1 + sizeof(n_time) + 1597 sizeof(struct in_addr) > ipt->ipt_len) { 1598 code = (u_char *)&ipt->ipt_ptr - 1599 (u_char *)ip; 1600 goto bad; 1601 } 1602 ipaddr.sin_addr = dst; 1603 ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr), 1604 m->m_pkthdr.rcvif)); 1605 if (ia == 0) 1606 continue; 1607 bcopy(&ia->ia_addr.sin_addr, 1608 cp0, sizeof(struct in_addr)); 1609 ipt->ipt_ptr += sizeof(struct in_addr); 1610 break; 1611 1612 case IPOPT_TS_PRESPEC: 1613 if (ipt->ipt_ptr - 1 + sizeof(n_time) + 1614 sizeof(struct in_addr) > ipt->ipt_len) { 1615 code = (u_char *)&ipt->ipt_ptr - 1616 (u_char *)ip; 1617 goto bad; 1618 } 1619 bcopy(cp0, &ipaddr.sin_addr, 1620 sizeof(struct in_addr)); 1621 if (ifatoia(ifa_ifwithaddr(sintosa(&ipaddr))) 1622 == NULL) 1623 continue; 1624 ipt->ipt_ptr += sizeof(struct in_addr); 1625 break; 1626 1627 default: 1628 /* XXX can't take &ipt->ipt_flg */ 1629 code = (u_char *)&ipt->ipt_ptr - 1630 (u_char *)ip + 1; 1631 goto bad; 
1632 } 1633 ntime = iptime(); 1634 cp0 = (u_char *) &ntime; /* XXX grumble, GCC... */ 1635 bcopy(cp0, (caddr_t)cp + ipt->ipt_ptr - 1, 1636 sizeof(n_time)); 1637 ipt->ipt_ptr += sizeof(n_time); 1638 } 1639 } 1640 if (forward) { 1641 if (ip_forwsrcrt == 0) { 1642 type = ICMP_UNREACH; 1643 code = ICMP_UNREACH_SRCFAIL; 1644 goto bad; 1645 } 1646 ip_forward(m, 1); 1647 return (1); 1648 } 1649 return (0); 1650bad: 1651 icmp_error(m, type, code, 0, 0); 1652 ipstat.ips_badoptions++; 1653 return (1); 1654} 1655 1656/* 1657 * Given address of next destination (final or next hop), 1658 * return internet address info of interface to be used to get there. 1659 */ 1660struct in_ifaddr * 1661ip_rtaddr(struct in_addr dst) 1662{ 1663 struct sockaddr_in *sin; 1664 1665 sin = satosin(&ipforward_rt.ro_dst); 1666 1667 if (ipforward_rt.ro_rt == 0 || !in_hosteq(dst, sin->sin_addr)) { 1668 if (ipforward_rt.ro_rt) { 1669 RTFREE(ipforward_rt.ro_rt); 1670 ipforward_rt.ro_rt = 0; 1671 } 1672 sin->sin_family = AF_INET; 1673 sin->sin_len = sizeof(*sin); 1674 sin->sin_addr = dst; 1675 1676 rtalloc(&ipforward_rt); 1677 } 1678 if (ipforward_rt.ro_rt == 0) 1679 return ((struct in_ifaddr *)0); 1680 return (ifatoia(ipforward_rt.ro_rt->rt_ifa)); 1681} 1682 1683/* 1684 * Save incoming source route for use in replies, 1685 * to be picked up later by ip_srcroute if the receiver is interested. 1686 */ 1687void 1688save_rte(u_char *option, struct in_addr dst) 1689{ 1690 unsigned olen; 1691 1692 olen = option[IPOPT_OLEN]; 1693#ifdef DIAGNOSTIC 1694 if (ipprintfs) 1695 printf("save_rte: olen %d\n", olen); 1696#endif /* 0 */ 1697 if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) 1698 return; 1699 bcopy((caddr_t)option, (caddr_t)ip_srcrt.srcopt, olen); 1700 ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); 1701 ip_srcrt.dst = dst; 1702} 1703 1704/* 1705 * Retrieve incoming source route for use in replies, 1706 * in the same form used by setsockopt. 
1707 * The first hop is placed before the options, will be removed later. 1708 */ 1709struct mbuf * 1710ip_srcroute(void) 1711{ 1712 struct in_addr *p, *q; 1713 struct mbuf *m; 1714 1715 if (ip_nhops == 0) 1716 return ((struct mbuf *)0); 1717 m = m_get(M_DONTWAIT, MT_SOOPTS); 1718 if (m == 0) 1719 return ((struct mbuf *)0); 1720 1721 MCLAIM(m, &inetdomain.dom_mowner); 1722#define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt)) 1723 1724 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ 1725 m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + 1726 OPTSIZ; 1727#ifdef DIAGNOSTIC 1728 if (ipprintfs) 1729 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); 1730#endif 1731 1732 /* 1733 * First save first hop for return route 1734 */ 1735 p = &ip_srcrt.route[ip_nhops - 1]; 1736 *(mtod(m, struct in_addr *)) = *p--; 1737#ifdef DIAGNOSTIC 1738 if (ipprintfs) 1739 printf(" hops %x", ntohl(mtod(m, struct in_addr *)->s_addr)); 1740#endif 1741 1742 /* 1743 * Copy option fields and padding (nop) to mbuf. 1744 */ 1745 ip_srcrt.nop = IPOPT_NOP; 1746 ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; 1747 bcopy((caddr_t)&ip_srcrt.nop, 1748 mtod(m, caddr_t) + sizeof(struct in_addr), OPTSIZ); 1749 q = (struct in_addr *)(mtod(m, caddr_t) + 1750 sizeof(struct in_addr) + OPTSIZ); 1751#undef OPTSIZ 1752 /* 1753 * Record return path as an IP source route, 1754 * reversing the path (pointers are now aligned). 1755 */ 1756 while (p >= ip_srcrt.route) { 1757#ifdef DIAGNOSTIC 1758 if (ipprintfs) 1759 printf(" %x", ntohl(q->s_addr)); 1760#endif 1761 *q++ = *p--; 1762 } 1763 /* 1764 * Last hop goes to final destination. 1765 */ 1766 *q = ip_srcrt.dst; 1767#ifdef DIAGNOSTIC 1768 if (ipprintfs) 1769 printf(" %x\n", ntohl(q->s_addr)); 1770#endif 1771 return (m); 1772} 1773 1774/* 1775 * Strip out IP options, at higher 1776 * level protocol in the kernel. 
* The options are removed from the header in place; nothing is
 * returned and the second argument (mopt) is not used.
 * XXX should be deleted; last arg currently ignored.
 */
void
ip_stripoptions(struct mbuf *m, struct mbuf *mopt)
{
	int i;
	struct ip *ip = mtod(m, struct ip *);
	caddr_t opts;
	int olen;

	olen = (ip->ip_hl << 2) - sizeof (struct ip);
	opts = (caddr_t)(ip + 1);
	/* i = bytes in this mbuf that follow the options. */
	i = m->m_len - (sizeof (struct ip) + olen);
	bcopy(opts  + olen, opts, (unsigned)i);
	m->m_len -= olen;
	if (m->m_flags & M_PKTHDR)
		m->m_pkthdr.len -= olen;
	ip->ip_len = htons(ntohs(ip->ip_len) - olen);
	ip->ip_hl = sizeof (struct ip) >> 2;
}

/* errno value for each PRC_* command code, indexed by that code. */
const int inetctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		0,		0,
	ENOPROTOOPT
};

/*
 * Forward a packet.  If some error occurs return the sender
 * an icmp packet.  Note we can't always generate a meaningful
 * icmp message because icmp doesn't have a large enough repertoire
 * of codes and types.
 *
 * If not forwarding, just drop the packet.  This could be confusing
 * if ipforwarding was zero but some routing protocol was advancing
 * us as a gateway to somewhere.  However, we must let the routing
 * protocol deal with that.
 *
 * The srcrt parameter indicates whether the packet is being forwarded
 * via a source route.
 */
void
ip_forward(struct mbuf *m, int srcrt)
{
	struct ip *ip = mtod(m, struct ip *);
	struct sockaddr_in *sin;
	struct rtentry *rt;
	int error, type = 0, code = 0, destmtu = 0;
	struct mbuf *mcopy;
	n_long dest;

	/*
	 * We are now in the output path.
	 */
	MCLAIM(m, &ip_tx_mowner);

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	dest = 0;
#ifdef DIAGNOSTIC
	if (ipprintfs) {
		printf("forward: src %s ", inet_ntoa(ip->ip_src));
		printf("dst %s ttl %x\n", inet_ntoa(ip->ip_dst), ip->ip_ttl);
	}
#endif
	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
		ipstat.ips_cantforward++;
		m_freem(m);
		return;
	}
	if (ip->ip_ttl <= IPTTLDEC) {
		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
		return;
	}

	/* Reuse the cached forwarding route unless the destination changed. */
	sin = satosin(&ipforward_rt.ro_dst);
	if ((rt = ipforward_rt.ro_rt) == 0 ||
	    !in_hosteq(ip->ip_dst, sin->sin_addr)) {
		if (ipforward_rt.ro_rt) {
			RTFREE(ipforward_rt.ro_rt);
			ipforward_rt.ro_rt = 0;
		}
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(struct sockaddr_in);
		sin->sin_addr = ip->ip_dst;

		rtalloc(&ipforward_rt);
		if (ipforward_rt.ro_rt == 0) {
			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NET, dest, 0);
			return;
		}
		rt = ipforward_rt.ro_rt;
	}

	/*
	 * Save at most 68 bytes of the packet in case
	 * we need to generate an ICMP message to the src.
	 * Pullup to avoid sharing mbuf cluster between m and mcopy.
	 */
	mcopy = m_copym(m, 0, imin(ntohs(ip->ip_len), 68), M_DONTWAIT);
	if (mcopy)
		mcopy = m_pullup(mcopy, ip->ip_hl << 2);

	ip->ip_ttl -= IPTTLDEC;

	/*
	 * If forwarding packet using same interface that it came in on,
	 * perhaps should send a redirect to sender to shortcut a hop.
	 * Only send redirect if source is sending directly to us,
	 * and if packet was not source routed (or has any options).
	 * Also, don't send redirect if forwarding using a default route
	 * or a route modified by a redirect.
	 */
	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
	    !in_nullhost(satosin(rt_key(rt))->sin_addr) &&
	    ipsendredirects && !srcrt) {
		if (rt->rt_ifa &&
		    (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_subnetmask) ==
		    ifatoia(rt->rt_ifa)->ia_subnet) {
			if (rt->rt_flags & RTF_GATEWAY)
				dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
			else
				dest = ip->ip_dst.s_addr;
			/*
			 * Router requirements says to only send host
			 * redirects.
			 */
			type = ICMP_REDIRECT;
			code = ICMP_REDIRECT_HOST;
#ifdef DIAGNOSTIC
			if (ipprintfs)
				printf("redirect (%d) to %x\n", code,
				    (u_int32_t)dest);
#endif
		}
	}

	error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
	    (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
	    (struct ip_moptions *)NULL, (struct socket *)NULL);

	if (error)
		ipstat.ips_cantforward++;
	else {
		ipstat.ips_forward++;
		if (type)
			ipstat.ips_redirectsent++;
		else {
			/* Forwarded cleanly and no redirect is due: done. */
			if (mcopy) {
#ifdef GATEWAY
				if (mcopy->m_flags & M_CANFASTFWD)
					ipflow_create(&ipforward_rt, mcopy);
#endif
				m_freem(mcopy);
			}
			return;
		}
	}
	/* Without the saved copy no ICMP message can be built. */
	if (mcopy == NULL)
		return;

	switch (error) {

	case 0:				/* forwarded, but need redirect */
		/* type, code set above */
		break;

	case ENETUNREACH:		/* shouldn't happen, checked above */
	case EHOSTUNREACH:
	case ENETDOWN:
	case EHOSTDOWN:
	default:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_HOST;
		break;

	case EMSGSIZE:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_NEEDFRAG;
#if !defined(IPSEC) && !defined(FAST_IPSEC)
		if (ipforward_rt.ro_rt)
			destmtu = ipforward_rt.ro_rt->rt_ifp->if_mtu;
#else
		/*
		 * If the packet is routed over IPsec tunnel, tell the
		 * originator the tunnel MTU.
		 *	tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
		 * XXX quickhack!!!
		 */
		if (ipforward_rt.ro_rt) {
			struct secpolicy *sp;
			int ipsecerror;
			size_t ipsechdr;
			struct route *ro;

			sp = ipsec4_getpolicybyaddr(mcopy,
			    IPSEC_DIR_OUTBOUND, IP_FORWARDING,
			    &ipsecerror);

			if (sp == NULL)
				destmtu = ipforward_rt.ro_rt->rt_ifp->if_mtu;
			else {
				/* count IPsec header size */
				ipsechdr = ipsec4_hdrsiz(mcopy,
				    IPSEC_DIR_OUTBOUND, NULL);

				/*
				 * find the correct route for outer IPv4
				 * header, compute tunnel MTU.
				 */

				if (sp->req != NULL
				 && sp->req->sav != NULL
				 && sp->req->sav->sah != NULL) {
					ro = &sp->req->sav->sah->sa_route;
					if (ro->ro_rt && ro->ro_rt->rt_ifp) {
						destmtu =
						    ro->ro_rt->rt_rmx.rmx_mtu ?
						    ro->ro_rt->rt_rmx.rmx_mtu :
						    ro->ro_rt->rt_ifp->if_mtu;
						destmtu -= ipsechdr;
					}
				}

#ifdef IPSEC
				key_freesp(sp);
#else
				KEY_FREESP(&sp);
#endif
			}
		}
#endif /*IPSEC*/
		ipstat.ips_cantfrag++;
		break;

	case ENOBUFS:
#if 1
		/*
		 * a router should not generate ICMP_SOURCEQUENCH as
		 * required in RFC1812 Requirements for IP Version 4 Routers.
		 * source quench could be a big problem under DoS attacks,
		 * or if the underlying interface is rate-limited.
		 */
		if (mcopy)
			m_freem(mcopy);
		return;
#else
		type = ICMP_SOURCEQUENCH;
		code = 0;
		break;
#endif
	}
	icmp_error(mcopy, type, code, dest, destmtu);
}

/*
 * Build the control messages requested on the socket behind inp and
 * chain them through *mp: a receive timestamp (SO_TIMESTAMP), the
 * datagram's destination address (INP_RECVDSTADDR) and the index of the
 * receiving interface (INP_RECVIF).  ip is the packet's IP header and m
 * the packet itself.  A control message that cannot be allocated is
 * silently skipped.
 */
void
ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
    struct mbuf *m)
{

	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
		struct timeval tv;

		microtime(&tv);
		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
		    SCM_TIMESTAMP, SOL_SOCKET);
		if (*mp)
			mp = &(*mp)->m_next;
	}
	if (inp->inp_flags & INP_RECVDSTADDR) {
		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
#ifdef notyet
	/*
	 * XXX
	 * Moving these out of udp_input() made them even more broken
	 * than they already were.
	 *	- fenner@parc.xerox.com
	 */
	/* options were tossed already */
	if (inp->inp_flags & INP_RECVOPTS) {
		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
	/* ip_srcroute doesn't do what we want here, need to fix */
	if (inp->inp_flags & INP_RECVRETOPTS) {
		*mp = sbcreatecontrol((caddr_t) ip_srcroute(),
		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
#endif
	if (inp->inp_flags & INP_RECVIF) {
		struct sockaddr_dl sdl;

		/*
		 * Zero the whole structure first: the copy below covers
		 * every byte up to sdl_data, including fields that are
		 * not assigned individually here, and those must not
		 * carry stale stack contents into the control message.
		 */
		memset(&sdl, 0, sizeof(sdl));
		sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]);
		sdl.sdl_family = AF_LINK;
		sdl.sdl_index = m->m_pkthdr.rcvif ?
		    m->m_pkthdr.rcvif->if_index : 0;
		sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0;
		*mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len,
		    IP_RECVIF, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
}

/*
 * sysctl helper routine for net.inet.ip.forwsrcrt.
2100 */ 2101static int 2102sysctl_net_inet_ip_forwsrcrt(SYSCTLFN_ARGS) 2103{ 2104 int error, tmp; 2105 struct sysctlnode node; 2106 2107 node = *rnode; 2108 tmp = ip_forwsrcrt; 2109 node.sysctl_data = &tmp; 2110 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 2111 if (error || newp == NULL) 2112 return (error); 2113 2114 if (securelevel > 0) 2115 return (EPERM); 2116 2117 ip_forwsrcrt = tmp; 2118 2119 return (0); 2120} 2121 2122/* 2123 * sysctl helper routine for net.inet.ip.mtudisctimeout. checks the 2124 * range of the new value and tweaks timers if it changes. 2125 */ 2126static int 2127sysctl_net_inet_ip_pmtudto(SYSCTLFN_ARGS) 2128{ 2129 int error, tmp; 2130 struct sysctlnode node; 2131 2132 node = *rnode; 2133 tmp = ip_mtudisc_timeout; 2134 node.sysctl_data = &tmp; 2135 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 2136 if (error || newp == NULL) 2137 return (error); 2138 if (tmp < 0) 2139 return (EINVAL); 2140 2141 ip_mtudisc_timeout = tmp; 2142 rt_timer_queue_change(ip_mtudisc_timeout_q, ip_mtudisc_timeout); 2143 2144 return (0); 2145} 2146 2147#ifdef GATEWAY 2148/* 2149 * sysctl helper routine for net.inet.ip.maxflows. apparently if 2150 * maxflows is even looked up, we "reap flows". 
2151 */ 2152static int 2153sysctl_net_inet_ip_maxflows(SYSCTLFN_ARGS) 2154{ 2155 int s; 2156 2157 s = sysctl_lookup(SYSCTLFN_CALL(rnode)); 2158 if (s) 2159 return (s); 2160 2161 s = splsoftnet(); 2162 ipflow_reap(0); 2163 splx(s); 2164 2165 return (0); 2166} 2167#endif /* GATEWAY */ 2168 2169 2170SYSCTL_SETUP(sysctl_net_inet_ip_setup, "sysctl net.inet.ip subtree setup") 2171{ 2172 extern int subnetsarelocal, hostzeroisbroadcast; 2173 2174 sysctl_createv(clog, 0, NULL, NULL, 2175 CTLFLAG_PERMANENT, 2176 CTLTYPE_NODE, "net", NULL, 2177 NULL, 0, NULL, 0, 2178 CTL_NET, CTL_EOL); 2179 sysctl_createv(clog, 0, NULL, NULL, 2180 CTLFLAG_PERMANENT, 2181 CTLTYPE_NODE, "inet", 2182 SYSCTL_DESCR("PF_INET related settings"), 2183 NULL, 0, NULL, 0, 2184 CTL_NET, PF_INET, CTL_EOL); 2185 sysctl_createv(clog, 0, NULL, NULL, 2186 CTLFLAG_PERMANENT, 2187 CTLTYPE_NODE, "ip", 2188 SYSCTL_DESCR("IPv4 related settings"), 2189 NULL, 0, NULL, 0, 2190 CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL); 2191 2192 sysctl_createv(clog, 0, NULL, NULL, 2193 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2194 CTLTYPE_INT, "forwarding", 2195 SYSCTL_DESCR("Enable forwarding of INET datagrams"), 2196 NULL, 0, &ipforwarding, 0, 2197 CTL_NET, PF_INET, IPPROTO_IP, 2198 IPCTL_FORWARDING, CTL_EOL); 2199 sysctl_createv(clog, 0, NULL, NULL, 2200 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2201 CTLTYPE_INT, "redirect", 2202 SYSCTL_DESCR("Enable sending of ICMP redirect messages"), 2203 NULL, 0, &ipsendredirects, 0, 2204 CTL_NET, PF_INET, IPPROTO_IP, 2205 IPCTL_SENDREDIRECTS, CTL_EOL); 2206 sysctl_createv(clog, 0, NULL, NULL, 2207 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2208 CTLTYPE_INT, "ttl", 2209 SYSCTL_DESCR("Default TTL for an INET datagram"), 2210 NULL, 0, &ip_defttl, 0, 2211 CTL_NET, PF_INET, IPPROTO_IP, 2212 IPCTL_DEFTTL, CTL_EOL); 2213#ifdef IPCTL_DEFMTU 2214 sysctl_createv(clog, 0, NULL, NULL, 2215 CTLFLAG_PERMANENT /* |CTLFLAG_READWRITE? 
*/, 2216 CTLTYPE_INT, "mtu", 2217 SYSCTL_DESCR("Default MTA for an INET route"), 2218 NULL, 0, &ip_mtu, 0, 2219 CTL_NET, PF_INET, IPPROTO_IP, 2220 IPCTL_DEFMTU, CTL_EOL); 2221#endif /* IPCTL_DEFMTU */ 2222 sysctl_createv(clog, 0, NULL, NULL, 2223 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2224 CTLTYPE_INT, "forwsrcrt", 2225 SYSCTL_DESCR("Enable forwarding of source-routed " 2226 "datagrams"), 2227 sysctl_net_inet_ip_forwsrcrt, 0, &ip_forwsrcrt, 0, 2228 CTL_NET, PF_INET, IPPROTO_IP, 2229 IPCTL_FORWSRCRT, CTL_EOL); 2230 sysctl_createv(clog, 0, NULL, NULL, 2231 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2232 CTLTYPE_INT, "directed-broadcast", 2233 SYSCTL_DESCR("Enable forwarding of broadcast datagrams"), 2234 NULL, 0, &ip_directedbcast, 0, 2235 CTL_NET, PF_INET, IPPROTO_IP, 2236 IPCTL_DIRECTEDBCAST, CTL_EOL); 2237 sysctl_createv(clog, 0, NULL, NULL, 2238 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2239 CTLTYPE_INT, "allowsrcrt", 2240 SYSCTL_DESCR("Accept source-routed datagrams"), 2241 NULL, 0, &ip_allowsrcrt, 0, 2242 CTL_NET, PF_INET, IPPROTO_IP, 2243 IPCTL_ALLOWSRCRT, CTL_EOL); 2244 sysctl_createv(clog, 0, NULL, NULL, 2245 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2246 CTLTYPE_INT, "subnetsarelocal", 2247 SYSCTL_DESCR("Whether logical subnets are considered " 2248 "local"), 2249 NULL, 0, &subnetsarelocal, 0, 2250 CTL_NET, PF_INET, IPPROTO_IP, 2251 IPCTL_SUBNETSARELOCAL, CTL_EOL); 2252 sysctl_createv(clog, 0, NULL, NULL, 2253 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2254 CTLTYPE_INT, "mtudisc", 2255 SYSCTL_DESCR("Use RFC1191 Path MTU Discovery"), 2256 NULL, 0, &ip_mtudisc, 0, 2257 CTL_NET, PF_INET, IPPROTO_IP, 2258 IPCTL_MTUDISC, CTL_EOL); 2259 sysctl_createv(clog, 0, NULL, NULL, 2260 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2261 CTLTYPE_INT, "anonportmin", 2262 SYSCTL_DESCR("Lowest ephemeral port number to assign"), 2263 sysctl_net_inet_ip_ports, 0, &anonportmin, 0, 2264 CTL_NET, PF_INET, IPPROTO_IP, 2265 IPCTL_ANONPORTMIN, CTL_EOL); 2266 sysctl_createv(clog, 0, NULL, NULL, 2267 
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2268 CTLTYPE_INT, "anonportmax", 2269 SYSCTL_DESCR("Highest ephemeral port number to assign"), 2270 sysctl_net_inet_ip_ports, 0, &anonportmax, 0, 2271 CTL_NET, PF_INET, IPPROTO_IP, 2272 IPCTL_ANONPORTMAX, CTL_EOL); 2273 sysctl_createv(clog, 0, NULL, NULL, 2274 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2275 CTLTYPE_INT, "mtudisctimeout", 2276 SYSCTL_DESCR("Lifetime of a Path MTU Discovered route"), 2277 sysctl_net_inet_ip_pmtudto, 0, &ip_mtudisc_timeout, 0, 2278 CTL_NET, PF_INET, IPPROTO_IP, 2279 IPCTL_MTUDISCTIMEOUT, CTL_EOL); 2280#ifdef GATEWAY 2281 sysctl_createv(clog, 0, NULL, NULL, 2282 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2283 CTLTYPE_INT, "maxflows", 2284 SYSCTL_DESCR("Number of flows for fast forwarding"), 2285 sysctl_net_inet_ip_maxflows, 0, &ip_maxflows, 0, 2286 CTL_NET, PF_INET, IPPROTO_IP, 2287 IPCTL_MAXFLOWS, CTL_EOL); 2288#endif /* GATEWAY */ 2289 sysctl_createv(clog, 0, NULL, NULL, 2290 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2291 CTLTYPE_INT, "hostzerobroadcast", 2292 SYSCTL_DESCR("All zeroes address is broadcast address"), 2293 NULL, 0, &hostzeroisbroadcast, 0, 2294 CTL_NET, PF_INET, IPPROTO_IP, 2295 IPCTL_HOSTZEROBROADCAST, CTL_EOL); 2296#if NGIF > 0 2297 sysctl_createv(clog, 0, NULL, NULL, 2298 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2299 CTLTYPE_INT, "gifttl", 2300 SYSCTL_DESCR("Default TTL for a gif tunnel datagram"), 2301 NULL, 0, &ip_gif_ttl, 0, 2302 CTL_NET, PF_INET, IPPROTO_IP, 2303 IPCTL_GIF_TTL, CTL_EOL); 2304#endif /* NGIF */ 2305#ifndef IPNOPRIVPORTS 2306 sysctl_createv(clog, 0, NULL, NULL, 2307 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2308 CTLTYPE_INT, "lowportmin", 2309 SYSCTL_DESCR("Lowest privileged ephemeral port number " 2310 "to assign"), 2311 sysctl_net_inet_ip_ports, 0, &lowportmin, 0, 2312 CTL_NET, PF_INET, IPPROTO_IP, 2313 IPCTL_LOWPORTMIN, CTL_EOL); 2314 sysctl_createv(clog, 0, NULL, NULL, 2315 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2316 CTLTYPE_INT, "lowportmax", 2317 SYSCTL_DESCR("Highest privileged 
ephemeral port number " 2318 "to assign"), 2319 sysctl_net_inet_ip_ports, 0, &lowportmax, 0, 2320 CTL_NET, PF_INET, IPPROTO_IP, 2321 IPCTL_LOWPORTMAX, CTL_EOL); 2322#endif /* IPNOPRIVPORTS */ 2323 sysctl_createv(clog, 0, NULL, NULL, 2324 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2325 CTLTYPE_INT, "maxfragpackets", 2326 SYSCTL_DESCR("Maximum number of fragments to retain for " 2327 "possible reassembly"), 2328 NULL, 0, &ip_maxfragpackets, 0, 2329 CTL_NET, PF_INET, IPPROTO_IP, 2330 IPCTL_MAXFRAGPACKETS, CTL_EOL); 2331#if NGRE > 0 2332 sysctl_createv(clog, 0, NULL, NULL, 2333 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2334 CTLTYPE_INT, "grettl", 2335 SYSCTL_DESCR("Default TTL for a gre tunnel datagram"), 2336 NULL, 0, &ip_gre_ttl, 0, 2337 CTL_NET, PF_INET, IPPROTO_IP, 2338 IPCTL_GRE_TTL, CTL_EOL); 2339#endif /* NGRE */ 2340 sysctl_createv(clog, 0, NULL, NULL, 2341 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2342 CTLTYPE_INT, "checkinterface", 2343 SYSCTL_DESCR("Enable receive side of Strong ES model " 2344 "from RFC1122"), 2345 NULL, 0, &ip_checkinterface, 0, 2346 CTL_NET, PF_INET, IPPROTO_IP, 2347 IPCTL_CHECKINTERFACE, CTL_EOL); 2348 sysctl_createv(clog, 0, NULL, NULL, 2349 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2350 CTLTYPE_INT, "random_id", 2351 SYSCTL_DESCR("Assign random ip_id values"), 2352 NULL, 0, &ip_do_randomid, 0, 2353 CTL_NET, PF_INET, IPPROTO_IP, 2354 IPCTL_RANDOMID, CTL_EOL); 2355 sysctl_createv(clog, 0, NULL, NULL, 2356 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2357 CTLTYPE_INT, "do_loopback_cksum", 2358 SYSCTL_DESCR("Perform IP checksum on loopback"), 2359 NULL, 0, &ip_do_loopback_cksum, 0, 2360 CTL_NET, PF_INET, IPPROTO_IP, 2361 IPCTL_LOOPBACKCKSUM, CTL_EOL); 2362 sysctl_createv(clog, 0, NULL, NULL, 2363 CTLFLAG_PERMANENT, 2364 CTLTYPE_STRUCT, "stats", 2365 SYSCTL_DESCR("IP statistics"), 2366 NULL, 0, &ipstat, sizeof(ipstat), 2367 CTL_NET, PF_INET, IPPROTO_IP, IPCTL_STATS, 2368 CTL_EOL); 2369} 2370