ip_reass.c revision 193502
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 193502 2009-06-05 13:44:30Z luigi $"); 34 35#include "opt_bootp.h" 36#include "opt_ipfw.h" 37#include "opt_ipstealth.h" 38#include "opt_ipsec.h" 39#include "opt_route.h" 40#include "opt_mac.h" 41#include "opt_carp.h" 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/callout.h> 46#include <sys/mbuf.h> 47#include <sys/malloc.h> 48#include <sys/domain.h> 49#include <sys/protosw.h> 50#include <sys/socket.h> 51#include <sys/time.h> 52#include <sys/kernel.h> 53#include <sys/lock.h> 54#include <sys/rwlock.h> 55#include <sys/syslog.h> 56#include <sys/sysctl.h> 57#include <sys/vimage.h> 58 59#include <net/pfil.h> 60#include <net/if.h> 61#include <net/if_types.h> 62#include <net/if_var.h> 63#include <net/if_dl.h> 64#include <net/route.h> 65#include <net/netisr.h> 66#include <net/vnet.h> 67#include <net/flowtable.h> 68 69#include <netinet/in.h> 70#include <netinet/in_systm.h> 71#include <netinet/in_var.h> 72#include <netinet/ip.h> 73#include <netinet/in_pcb.h> 74#include <netinet/ip_var.h> 75#include <netinet/ip_icmp.h> 76#include <netinet/ip_options.h> 77#include <machine/in_cksum.h> 78#include <netinet/vinet.h> 79#ifdef DEV_CARP 80#include <netinet/ip_carp.h> 81#endif 82#ifdef IPSEC 83#include <netinet/ip_ipsec.h> 84#endif /* IPSEC */ 85 86#include <sys/socketvar.h> 87 88#include <security/mac/mac_framework.h> 89 90#ifdef CTASSERT 91CTASSERT(sizeof(struct ip) == 20); 92#endif 93 94#ifndef VIMAGE 95#ifndef VIMAGE_GLOBALS 96struct vnet_inet vnet_inet_0; 97#endif 98#endif 99 100#ifdef VIMAGE_GLOBALS 101static int ipsendredirects; 102static int ip_checkinterface; 103static int ip_keepfaith; 104static int ip_sendsourcequench; 105int ip_defttl; 106int ip_do_randomid; 107int ipforwarding; 108struct in_ifaddrhead in_ifaddrhead; /* first inet address */ 109struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ 110u_long in_ifaddrhmask; /* mask for hash table */ 111struct ipstat ipstat; 112static int ip_rsvp_on; 113struct socket *ip_rsvpd; 114int rsvp_on; 115static struct ipqhead ipq[IPREASS_NHASH]; 116static int maxnipq; /* Administrative limit on # reass queues. */ 117static int maxfragsperpacket; 118int ipstealth; 119static int nipq; /* Total # of reass queues */ 120#endif 121 122SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING, 123 forwarding, CTLFLAG_RW, ipforwarding, 0, 124 "Enable IP forwarding between interfaces"); 125 126SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS, 127 redirect, CTLFLAG_RW, ipsendredirects, 0, 128 "Enable sending IP redirects"); 129 130SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL, 131 ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets"); 132 133SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH, 134 keepfaith, CTLFLAG_RW, ip_keepfaith, 0, 135 "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); 136 137SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, 138 sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0, 139 "Enable the transmission of source quench packets"); 140 141SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id, 142 CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values"); 143 144/* 145 * XXX - Setting ip_checkinterface mostly implements the receive side of 146 * the Strong ES model described in RFC 1122, but since the routing table 147 * and transmit implementation do not implement the Strong ES model, 148 * setting this to 1 results in an odd hybrid. 149 * 150 * XXX - ip_checkinterface currently must be disabled if you use ipnat 151 * to translate the destination address to another local interface. 152 * 153 * XXX - ip_checkinterface must be disabled if you add IP aliases 154 * to the loopback interface instead of the interface where the 155 * packets for those addresses are received. 156 */ 157SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, 158 check_interface, CTLFLAG_RW, ip_checkinterface, 0, 159 "Verify packet arrives on correct interface"); 160 161struct pfil_head inet_pfil_hook; /* Packet filter hooks */ 162 163static struct netisr_handler ip_nh = { 164 .nh_name = "ip", 165 .nh_handler = ip_input, 166 .nh_proto = NETISR_IP, 167 .nh_policy = NETISR_POLICY_FLOW, 168}; 169 170extern struct domain inetdomain; 171extern struct protosw inetsw[]; 172u_char ip_protox[IPPROTO_MAX]; 173 174 175SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, 176 ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); 177 178#ifdef VIMAGE_GLOBALS 179static uma_zone_t ipq_zone; 180#endif 181static struct mtx ipqlock; 182 183#define IPQ_LOCK() mtx_lock(&ipqlock) 184#define IPQ_UNLOCK() mtx_unlock(&ipqlock) 185#define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) 186#define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) 187 188static void maxnipq_update(void); 189static void ipq_zone_change(void *); 190 191SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets, 192 CTLFLAG_RD, nipq, 0, 193 "Current number of IPv4 fragment reassembly queue entries"); 194 195SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket, 196 CTLFLAG_RW, maxfragsperpacket, 0, 197 "Maximum number of IPv4 fragments allowed per packet"); 198 199struct callout ipport_tick_callout; 200 201#ifdef IPCTL_DEFMTU 202SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, 203 &ip_mtu, 0, "Default MTU"); 204#endif 205 206#ifdef IPSTEALTH 207SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, 208 ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); 209#endif 210static int ip_output_flowtable_size = 2048; 211TUNABLE_INT("net.inet.ip.output_flowtable_size", &ip_output_flowtable_size); 212SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, output_flowtable_size, 213 CTLFLAG_RDTUN, ip_output_flowtable_size, 2048, 214 "number of entries in the per-cpu output flow caches"); 215 216#ifdef VIMAGE_GLOBALS 217int fw_one_pass; 218#endif 219struct flowtable *ip_ft; 220 221static void ip_freef(struct ipqhead *, struct ipq *); 222 223#ifndef VIMAGE_GLOBALS 224static void vnet_inet_register(void); 225 226static const vnet_modinfo_t vnet_inet_modinfo = { 227 .vmi_id = VNET_MOD_INET, 228 .vmi_name = "inet", 229 .vmi_size = sizeof(struct vnet_inet) 230}; 231 232static void vnet_inet_register() 233{ 234 235 vnet_mod_register(&vnet_inet_modinfo); 236} 237 238SYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0); 239#endif 240 241static int 242sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) 243{ 244 int error, qlimit; 245 246 netisr_getqlimit(&ip_nh, &qlimit); 247 error = sysctl_handle_int(oidp, &qlimit, 0, req); 248 if (error || !req->newptr) 249 return (error); 250 if (qlimit < 1) 251 return (EINVAL); 252 return (netisr_setqlimit(&ip_nh, qlimit)); 253} 254SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, 255 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", 256 "Maximum size of the IP input queue"); 257 258static int 259sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) 260{ 261 u_int64_t qdrops_long; 262 int error, qdrops; 263 264 netisr_getqdrops(&ip_nh, &qdrops_long); 265 qdrops = qdrops_long; 266 error = sysctl_handle_int(oidp, &qdrops, 0, req); 267 if (error || !req->newptr) 268 return (error); 269 if (qdrops != 0) 270 return (EINVAL); 271 netisr_clearqdrops(&ip_nh); 272 return (0); 273} 274 275SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, 276 CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", 277 "Number of packets dropped from the IP input queue"); 278 279/* 280 * IP initialization: fill in IP protocol switch table. 281 * All protocols not implemented in kernel go to raw IP protocol handler. 282 */ 283void 284ip_init(void) 285{ 286 INIT_VNET_INET(curvnet); 287 struct protosw *pr; 288 int i; 289 290 V_ipsendredirects = 1; /* XXX */ 291 V_ip_checkinterface = 0; 292 V_ip_keepfaith = 0; 293 V_ip_sendsourcequench = 0; 294 V_rsvp_on = 0; 295 V_ip_defttl = IPDEFTTL; 296 V_ip_do_randomid = 0; 297 V_ip_id = time_second & 0xffff; 298 V_ipforwarding = 0; 299 V_ipstealth = 0; 300 V_nipq = 0; /* Total # of reass queues */ 301 302 V_ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ 303 V_ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ 304 V_ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */ 305 V_ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */ 306 V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ 307 V_ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ 308 V_ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ 309 V_ipport_reservedlow = 0; 310 V_ipport_randomized = 1; /* user controlled via sysctl */ 311 V_ipport_randomcps = 10; /* user controlled via sysctl */ 312 V_ipport_randomtime = 45; /* user controlled via sysctl */ 313 V_ipport_stoprandom = 0; /* toggled by ipport_tick */ 314 315 V_fw_one_pass = 1; 316 317#ifdef NOTYET 318 /* XXX global static but not instantiated in this file */ 319 V_ipfastforward_active = 0; 320 V_subnetsarelocal = 0; 321 V_sameprefixcarponly = 0; 322#endif 323 324 TAILQ_INIT(&V_in_ifaddrhead); 325 V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); 326 327 /* Initialize IP reassembly queue. */ 328 for (i = 0; i < IPREASS_NHASH; i++) 329 TAILQ_INIT(&V_ipq[i]); 330 V_maxnipq = nmbclusters / 32; 331 V_maxfragsperpacket = 16; 332 V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, 333 NULL, UMA_ALIGN_PTR, 0); 334 maxnipq_update(); 335 336 /* Skip initialization of globals for non-default instances. */ 337 if (!IS_DEFAULT_VNET(curvnet)) 338 return; 339 340 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 341 if (pr == NULL) 342 panic("ip_init: PF_INET not found"); 343 344 /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ 345 for (i = 0; i < IPPROTO_MAX; i++) 346 ip_protox[i] = pr - inetsw; 347 /* 348 * Cycle through IP protocols and put them into the appropriate place 349 * in ip_protox[]. 350 */ 351 for (pr = inetdomain.dom_protosw; 352 pr < inetdomain.dom_protoswNPROTOSW; pr++) 353 if (pr->pr_domain->dom_family == PF_INET && 354 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { 355 /* Be careful to only index valid IP protocols. */ 356 if (pr->pr_protocol < IPPROTO_MAX) 357 ip_protox[pr->pr_protocol] = pr - inetsw; 358 } 359 360 /* Initialize packet filter hooks. */ 361 inet_pfil_hook.ph_type = PFIL_TYPE_AF; 362 inet_pfil_hook.ph_af = AF_INET; 363 if ((i = pfil_head_register(&inet_pfil_hook)) != 0) 364 printf("%s: WARNING: unable to register pfil hook, " 365 "error %d\n", __func__, i); 366 367 /* Start ipport_tick. */ 368 callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); 369 callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); 370 EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, 371 SHUTDOWN_PRI_DEFAULT); 372 EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, 373 NULL, EVENTHANDLER_PRI_ANY); 374 375 /* Initialize various other remaining things. */ 376 IPQ_LOCK_INIT(); 377 netisr_register(&ip_nh); 378 ip_ft = flowtable_alloc(ip_output_flowtable_size, FL_PCPU); 379} 380 381void 382ip_fini(void *xtp) 383{ 384 385 callout_stop(&ipport_tick_callout); 386} 387 388/* 389 * Ip input routine. Checksum and byte swap header. If fragmented 390 * try to reassemble. Process options. Pass to next level. 391 */ 392void 393ip_input(struct mbuf *m) 394{ 395 INIT_VNET_INET(curvnet); 396 struct ip *ip = NULL; 397 struct in_ifaddr *ia = NULL; 398 struct ifaddr *ifa; 399 struct ifnet *ifp; 400 int checkif, hlen = 0; 401 u_short sum; 402 int dchg = 0; /* dest changed after fw */ 403 struct in_addr odst; /* original dst address */ 404 405 M_ASSERTPKTHDR(m); 406 407 if (m->m_flags & M_FASTFWD_OURS) { 408 /* 409 * Firewall or NAT changed destination to local. 410 * We expect ip_len and ip_off to be in host byte order. 411 */ 412 m->m_flags &= ~M_FASTFWD_OURS; 413 /* Set up some basics that will be used later. */ 414 ip = mtod(m, struct ip *); 415 hlen = ip->ip_hl << 2; 416 goto ours; 417 } 418 419 IPSTAT_INC(ips_total); 420 421 if (m->m_pkthdr.len < sizeof(struct ip)) 422 goto tooshort; 423 424 if (m->m_len < sizeof (struct ip) && 425 (m = m_pullup(m, sizeof (struct ip))) == NULL) { 426 IPSTAT_INC(ips_toosmall); 427 return; 428 } 429 ip = mtod(m, struct ip *); 430 431 if (ip->ip_v != IPVERSION) { 432 IPSTAT_INC(ips_badvers); 433 goto bad; 434 } 435 436 hlen = ip->ip_hl << 2; 437 if (hlen < sizeof(struct ip)) { /* minimum header length */ 438 IPSTAT_INC(ips_badhlen); 439 goto bad; 440 } 441 if (hlen > m->m_len) { 442 if ((m = m_pullup(m, hlen)) == NULL) { 443 IPSTAT_INC(ips_badhlen); 444 return; 445 } 446 ip = mtod(m, struct ip *); 447 } 448 449 /* 127/8 must not appear on wire - RFC1122 */ 450 ifp = m->m_pkthdr.rcvif; 451 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 452 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 453 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 454 IPSTAT_INC(ips_badaddr); 455 goto bad; 456 } 457 } 458 459 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 460 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); 461 } else { 462 if (hlen == sizeof(struct ip)) { 463 sum = in_cksum_hdr(ip); 464 } else { 465 sum = in_cksum(m, hlen); 466 } 467 } 468 if (sum) { 469 IPSTAT_INC(ips_badsum); 470 goto bad; 471 } 472 473#ifdef ALTQ 474 if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) 475 /* packet is dropped by traffic conditioner */ 476 return; 477#endif 478 479 /* 480 * Convert fields to host representation. 481 */ 482 ip->ip_len = ntohs(ip->ip_len); 483 if (ip->ip_len < hlen) { 484 IPSTAT_INC(ips_badlen); 485 goto bad; 486 } 487 ip->ip_off = ntohs(ip->ip_off); 488 489 /* 490 * Check that the amount of data in the buffers 491 * is as at least much as the IP header would have us expect. 492 * Trim mbufs if longer than we expect. 493 * Drop packet if shorter than we expect. 494 */ 495 if (m->m_pkthdr.len < ip->ip_len) { 496tooshort: 497 IPSTAT_INC(ips_tooshort); 498 goto bad; 499 } 500 if (m->m_pkthdr.len > ip->ip_len) { 501 if (m->m_len == m->m_pkthdr.len) { 502 m->m_len = ip->ip_len; 503 m->m_pkthdr.len = ip->ip_len; 504 } else 505 m_adj(m, ip->ip_len - m->m_pkthdr.len); 506 } 507#ifdef IPSEC 508 /* 509 * Bypass packet filtering for packets from a tunnel (gif). 510 */ 511 if (ip_ipsec_filtertunnel(m)) 512 goto passin; 513#endif /* IPSEC */ 514 515 /* 516 * Run through list of hooks for input packets. 517 * 518 * NB: Beware of the destination address changing (e.g. 519 * by NAT rewriting). When this happens, tell 520 * ip_forward to do the right thing. 521 */ 522 523 /* Jump over all PFIL processing if hooks are not active. */ 524 if (!PFIL_HOOKED(&inet_pfil_hook)) 525 goto passin; 526 527 odst = ip->ip_dst; 528 if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) 529 return; 530 if (m == NULL) /* consumed by filter */ 531 return; 532 533 ip = mtod(m, struct ip *); 534 dchg = (odst.s_addr != ip->ip_dst.s_addr); 535 ifp = m->m_pkthdr.rcvif; 536 537#ifdef IPFIREWALL_FORWARD 538 if (m->m_flags & M_FASTFWD_OURS) { 539 m->m_flags &= ~M_FASTFWD_OURS; 540 goto ours; 541 } 542 if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) { 543 /* 544 * Directly ship on the packet. This allows to forward packets 545 * that were destined for us to some other directly connected 546 * host. 547 */ 548 ip_forward(m, dchg); 549 return; 550 } 551#endif /* IPFIREWALL_FORWARD */ 552 553passin: 554 /* 555 * Process options and, if not destined for us, 556 * ship it on. ip_dooptions returns 1 when an 557 * error was detected (causing an icmp message 558 * to be sent and the original packet to be freed). 559 */ 560 if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) 561 return; 562 563 /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no 564 * matter if it is destined to another node, or whether it is 565 * a multicast one, RSVP wants it! and prevents it from being forwarded 566 * anywhere else. Also checks if the rsvp daemon is running before 567 * grabbing the packet. 568 */ 569 if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 570 goto ours; 571 572 /* 573 * Check our list of addresses, to see if the packet is for us. 574 * If we don't have any addresses, assume any unicast packet 575 * we receive might be for us (and let the upper layers deal 576 * with it). 577 */ 578 if (TAILQ_EMPTY(&V_in_ifaddrhead) && 579 (m->m_flags & (M_MCAST|M_BCAST)) == 0) 580 goto ours; 581 582 /* 583 * Enable a consistency check between the destination address 584 * and the arrival interface for a unicast packet (the RFC 1122 585 * strong ES model) if IP forwarding is disabled and the packet 586 * is not locally generated and the packet is not subject to 587 * 'ipfw fwd'. 588 * 589 * XXX - Checking also should be disabled if the destination 590 * address is ipnat'ed to a different interface. 591 * 592 * XXX - Checking is incompatible with IP aliases added 593 * to the loopback interface instead of the interface where 594 * the packets are received. 595 * 596 * XXX - This is the case for carp vhost IPs as well so we 597 * insert a workaround. If the packet got here, we already 598 * checked with carp_iamatch() and carp_forus(). 599 */ 600 checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 601 ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && 602#ifdef DEV_CARP 603 !ifp->if_carp && 604#endif 605 (dchg == 0); 606 607 /* 608 * Check for exact addresses in the hash bucket. 609 */ 610 LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { 611 /* 612 * If the address matches, verify that the packet 613 * arrived via the correct interface if checking is 614 * enabled. 615 */ 616 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 617 (!checkif || ia->ia_ifp == ifp)) 618 goto ours; 619 } 620 /* 621 * Check for broadcast addresses. 622 * 623 * Only accept broadcast packets that arrive via the matching 624 * interface. Reception of forwarded directed broadcasts would 625 * be handled via ip_forward() and ether_output() with the loopback 626 * into the stack for SIMPLEX interfaces handled by ether_output(). 627 */ 628 if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { 629 IF_ADDR_LOCK(ifp); 630 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 631 if (ifa->ifa_addr->sa_family != AF_INET) 632 continue; 633 ia = ifatoia(ifa); 634 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == 635 ip->ip_dst.s_addr) { 636 IF_ADDR_UNLOCK(ifp); 637 goto ours; 638 } 639 if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) { 640 IF_ADDR_UNLOCK(ifp); 641 goto ours; 642 } 643#ifdef BOOTP_COMPAT 644 if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { 645 IF_ADDR_UNLOCK(ifp); 646 goto ours; 647 } 648#endif 649 } 650 IF_ADDR_UNLOCK(ifp); 651 } 652 /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ 653 if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { 654 IPSTAT_INC(ips_cantforward); 655 m_freem(m); 656 return; 657 } 658 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 659 if (V_ip_mrouter) { 660 /* 661 * If we are acting as a multicast router, all 662 * incoming multicast packets are passed to the 663 * kernel-level multicast forwarding function. 664 * The packet is returned (relatively) intact; if 665 * ip_mforward() returns a non-zero value, the packet 666 * must be discarded, else it may be accepted below. 667 */ 668 if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { 669 IPSTAT_INC(ips_cantforward); 670 m_freem(m); 671 return; 672 } 673 674 /* 675 * The process-level routing daemon needs to receive 676 * all multicast IGMP packets, whether or not this 677 * host belongs to their destination groups. 678 */ 679 if (ip->ip_p == IPPROTO_IGMP) 680 goto ours; 681 IPSTAT_INC(ips_forward); 682 } 683 /* 684 * Assume the packet is for us, to avoid prematurely taking 685 * a lock on the in_multi hash. Protocols must perform 686 * their own filtering and update statistics accordingly. 687 */ 688 goto ours; 689 } 690 if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) 691 goto ours; 692 if (ip->ip_dst.s_addr == INADDR_ANY) 693 goto ours; 694 695 /* 696 * FAITH(Firewall Aided Internet Translator) 697 */ 698 if (ifp && ifp->if_type == IFT_FAITH) { 699 if (V_ip_keepfaith) { 700 if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) 701 goto ours; 702 } 703 m_freem(m); 704 return; 705 } 706 707 /* 708 * Not for us; forward if possible and desirable. 709 */ 710 if (V_ipforwarding == 0) { 711 IPSTAT_INC(ips_cantforward); 712 m_freem(m); 713 } else { 714#ifdef IPSEC 715 if (ip_ipsec_fwd(m)) 716 goto bad; 717#endif /* IPSEC */ 718 ip_forward(m, dchg); 719 } 720 return; 721 722ours: 723#ifdef IPSTEALTH 724 /* 725 * IPSTEALTH: Process non-routing options only 726 * if the packet is destined for us. 727 */ 728 if (V_ipstealth && hlen > sizeof (struct ip) && 729 ip_dooptions(m, 1)) 730 return; 731#endif /* IPSTEALTH */ 732 733 /* Count the packet in the ip address stats */ 734 if (ia != NULL) { 735 ia->ia_ifa.if_ipackets++; 736 ia->ia_ifa.if_ibytes += m->m_pkthdr.len; 737 } 738 739 /* 740 * Attempt reassembly; if it succeeds, proceed. 741 * ip_reass() will return a different mbuf. 742 */ 743 if (ip->ip_off & (IP_MF | IP_OFFMASK)) { 744 m = ip_reass(m); 745 if (m == NULL) 746 return; 747 ip = mtod(m, struct ip *); 748 /* Get the header length of the reassembled packet */ 749 hlen = ip->ip_hl << 2; 750 } 751 752 /* 753 * Further protocols expect the packet length to be w/o the 754 * IP header. 755 */ 756 ip->ip_len -= hlen; 757 758#ifdef IPSEC 759 /* 760 * enforce IPsec policy checking if we are seeing last header. 761 * note that we do not visit this with protocols with pcb layer 762 * code - like udp/tcp/raw ip. 763 */ 764 if (ip_ipsec_input(m)) 765 goto bad; 766#endif /* IPSEC */ 767 768 /* 769 * Switch out to protocol's input routine. 770 */ 771 IPSTAT_INC(ips_delivered); 772 773 (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); 774 return; 775bad: 776 m_freem(m); 777} 778 779/* 780 * After maxnipq has been updated, propagate the change to UMA. The UMA zone 781 * max has slightly different semantics than the sysctl, for historical 782 * reasons. 783 */ 784static void 785maxnipq_update(void) 786{ 787 INIT_VNET_INET(curvnet); 788 789 /* 790 * -1 for unlimited allocation. 791 */ 792 if (V_maxnipq < 0) 793 uma_zone_set_max(V_ipq_zone, 0); 794 /* 795 * Positive number for specific bound. 796 */ 797 if (V_maxnipq > 0) 798 uma_zone_set_max(V_ipq_zone, V_maxnipq); 799 /* 800 * Zero specifies no further fragment queue allocation -- set the 801 * bound very low, but rely on implementation elsewhere to actually 802 * prevent allocation and reclaim current queues. 803 */ 804 if (V_maxnipq == 0) 805 uma_zone_set_max(V_ipq_zone, 1); 806} 807 808static void 809ipq_zone_change(void *tag) 810{ 811 INIT_VNET_INET(curvnet); 812 813 if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { 814 V_maxnipq = nmbclusters / 32; 815 maxnipq_update(); 816 } 817} 818 819static int 820sysctl_maxnipq(SYSCTL_HANDLER_ARGS) 821{ 822 INIT_VNET_INET(curvnet); 823 int error, i; 824 825 i = V_maxnipq; 826 error = sysctl_handle_int(oidp, &i, 0, req); 827 if (error || !req->newptr) 828 return (error); 829 830 /* 831 * XXXRW: Might be a good idea to sanity check the argument and place 832 * an extreme upper bound. 833 */ 834 if (i < -1) 835 return (EINVAL); 836 V_maxnipq = i; 837 maxnipq_update(); 838 return (0); 839} 840 841SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, 842 NULL, 0, sysctl_maxnipq, "I", 843 "Maximum number of IPv4 fragment reassembly queue entries"); 844 845/* 846 * Take incoming datagram fragment and try to reassemble it into 847 * whole datagram. If the argument is the first fragment or one 848 * in between the function will return NULL and store the mbuf 849 * in the fragment chain. If the argument is the last fragment 850 * the packet will be reassembled and the pointer to the new 851 * mbuf returned for further processing. Only m_tags attached 852 * to the first packet/fragment are preserved. 853 * The IP header is *NOT* adjusted out of iplen. 854 */ 855struct mbuf * 856ip_reass(struct mbuf *m) 857{ 858 INIT_VNET_INET(curvnet); 859 struct ip *ip; 860 struct mbuf *p, *q, *nq, *t; 861 struct ipq *fp = NULL; 862 struct ipqhead *head; 863 int i, hlen, next; 864 u_int8_t ecn, ecn0; 865 u_short hash; 866 867 /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ 868 if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { 869 IPSTAT_INC(ips_fragments); 870 IPSTAT_INC(ips_fragdropped); 871 m_freem(m); 872 return (NULL); 873 } 874 875 ip = mtod(m, struct ip *); 876 hlen = ip->ip_hl << 2; 877 878 hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); 879 head = &V_ipq[hash]; 880 IPQ_LOCK(); 881 882 /* 883 * Look for queue of fragments 884 * of this datagram. 885 */ 886 TAILQ_FOREACH(fp, head, ipq_list) 887 if (ip->ip_id == fp->ipq_id && 888 ip->ip_src.s_addr == fp->ipq_src.s_addr && 889 ip->ip_dst.s_addr == fp->ipq_dst.s_addr && 890#ifdef MAC 891 mac_ipq_match(m, fp) && 892#endif 893 ip->ip_p == fp->ipq_p) 894 goto found; 895 896 fp = NULL; 897 898 /* 899 * Attempt to trim the number of allocated fragment queues if it 900 * exceeds the administrative limit. 901 */ 902 if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { 903 /* 904 * drop something from the tail of the current queue 905 * before proceeding further 906 */ 907 struct ipq *q = TAILQ_LAST(head, ipqhead); 908 if (q == NULL) { /* gak */ 909 for (i = 0; i < IPREASS_NHASH; i++) { 910 struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); 911 if (r) { 912 IPSTAT_ADD(ips_fragtimeout, 913 r->ipq_nfrags); 914 ip_freef(&V_ipq[i], r); 915 break; 916 } 917 } 918 } else { 919 IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags); 920 ip_freef(head, q); 921 } 922 } 923 924found: 925 /* 926 * Adjust ip_len to not reflect header, 927 * convert offset of this to bytes. 928 */ 929 ip->ip_len -= hlen; 930 if (ip->ip_off & IP_MF) { 931 /* 932 * Make sure that fragments have a data length 933 * that's a non-zero multiple of 8 bytes. 934 */ 935 if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { 936 IPSTAT_INC(ips_toosmall); /* XXX */ 937 goto dropfrag; 938 } 939 m->m_flags |= M_FRAG; 940 } else 941 m->m_flags &= ~M_FRAG; 942 ip->ip_off <<= 3; 943 944 945 /* 946 * Attempt reassembly; if it succeeds, proceed. 947 * ip_reass() will return a different mbuf. 948 */ 949 IPSTAT_INC(ips_fragments); 950 m->m_pkthdr.header = ip; 951 952 /* Previous ip_reass() started here. */ 953 /* 954 * Presence of header sizes in mbufs 955 * would confuse code below. 956 */ 957 m->m_data += hlen; 958 m->m_len -= hlen; 959 960 /* 961 * If first fragment to arrive, create a reassembly queue. 962 */ 963 if (fp == NULL) { 964 fp = uma_zalloc(V_ipq_zone, M_NOWAIT); 965 if (fp == NULL) 966 goto dropfrag; 967#ifdef MAC 968 if (mac_ipq_init(fp, M_NOWAIT) != 0) { 969 uma_zfree(V_ipq_zone, fp); 970 fp = NULL; 971 goto dropfrag; 972 } 973 mac_ipq_create(m, fp); 974#endif 975 TAILQ_INSERT_HEAD(head, fp, ipq_list); 976 V_nipq++; 977 fp->ipq_nfrags = 1; 978 fp->ipq_ttl = IPFRAGTTL; 979 fp->ipq_p = ip->ip_p; 980 fp->ipq_id = ip->ip_id; 981 fp->ipq_src = ip->ip_src; 982 fp->ipq_dst = ip->ip_dst; 983 fp->ipq_frags = m; 984 m->m_nextpkt = NULL; 985 goto done; 986 } else { 987 fp->ipq_nfrags++; 988#ifdef MAC 989 mac_ipq_update(m, fp); 990#endif 991 } 992 993#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) 994 995 /* 996 * Handle ECN by comparing this segment with the first one; 997 * if CE is set, do not lose CE. 998 * drop if CE and not-ECT are mixed for the same packet. 999 */ 1000 ecn = ip->ip_tos & IPTOS_ECN_MASK; 1001 ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; 1002 if (ecn == IPTOS_ECN_CE) { 1003 if (ecn0 == IPTOS_ECN_NOTECT) 1004 goto dropfrag; 1005 if (ecn0 != IPTOS_ECN_CE) 1006 GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; 1007 } 1008 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) 1009 goto dropfrag; 1010 1011 /* 1012 * Find a segment which begins after this one does. 1013 */ 1014 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) 1015 if (GETIP(q)->ip_off > ip->ip_off) 1016 break; 1017 1018 /* 1019 * If there is a preceding segment, it may provide some of 1020 * our data already. If so, drop the data from the incoming 1021 * segment. If it provides all of our data, drop us, otherwise 1022 * stick new segment in the proper place. 1023 * 1024 * If some of the data is dropped from the the preceding 1025 * segment, then it's checksum is invalidated. 1026 */ 1027 if (p) { 1028 i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; 1029 if (i > 0) { 1030 if (i >= ip->ip_len) 1031 goto dropfrag; 1032 m_adj(m, i); 1033 m->m_pkthdr.csum_flags = 0; 1034 ip->ip_off += i; 1035 ip->ip_len -= i; 1036 } 1037 m->m_nextpkt = p->m_nextpkt; 1038 p->m_nextpkt = m; 1039 } else { 1040 m->m_nextpkt = fp->ipq_frags; 1041 fp->ipq_frags = m; 1042 } 1043 1044 /* 1045 * While we overlap succeeding segments trim them or, 1046 * if they are completely covered, dequeue them. 1047 */ 1048 for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; 1049 q = nq) { 1050 i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; 1051 if (i < GETIP(q)->ip_len) { 1052 GETIP(q)->ip_len -= i; 1053 GETIP(q)->ip_off += i; 1054 m_adj(q, i); 1055 q->m_pkthdr.csum_flags = 0; 1056 break; 1057 } 1058 nq = q->m_nextpkt; 1059 m->m_nextpkt = nq; 1060 IPSTAT_INC(ips_fragdropped); 1061 fp->ipq_nfrags--; 1062 m_freem(q); 1063 } 1064 1065 /* 1066 * Check for complete reassembly and perform frag per packet 1067 * limiting. 1068 * 1069 * Frag limiting is performed here so that the nth frag has 1070 * a chance to complete the packet before we drop the packet. 1071 * As a result, n+1 frags are actually allowed per packet, but 1072 * only n will ever be stored. (n = maxfragsperpacket.) 1073 * 1074 */ 1075 next = 0; 1076 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { 1077 if (GETIP(q)->ip_off != next) { 1078 if (fp->ipq_nfrags > V_maxfragsperpacket) { 1079 IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 1080 ip_freef(head, fp); 1081 } 1082 goto done; 1083 } 1084 next += GETIP(q)->ip_len; 1085 } 1086 /* Make sure the last packet didn't have the IP_MF flag */ 1087 if (p->m_flags & M_FRAG) { 1088 if (fp->ipq_nfrags > V_maxfragsperpacket) { 1089 IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 1090 ip_freef(head, fp); 1091 } 1092 goto done; 1093 } 1094 1095 /* 1096 * Reassembly is complete. Make sure the packet is a sane size. 1097 */ 1098 q = fp->ipq_frags; 1099 ip = GETIP(q); 1100 if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { 1101 IPSTAT_INC(ips_toolong); 1102 IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 1103 ip_freef(head, fp); 1104 goto done; 1105 } 1106 1107 /* 1108 * Concatenate fragments. 1109 */ 1110 m = q; 1111 t = m->m_next; 1112 m->m_next = NULL; 1113 m_cat(m, t); 1114 nq = q->m_nextpkt; 1115 q->m_nextpkt = NULL; 1116 for (q = nq; q != NULL; q = nq) { 1117 nq = q->m_nextpkt; 1118 q->m_nextpkt = NULL; 1119 m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; 1120 m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; 1121 m_cat(m, q); 1122 } 1123 /* 1124 * In order to do checksumming faster we do 'end-around carry' here 1125 * (and not in for{} loop), though it implies we are not going to 1126 * reassemble more than 64k fragments. 1127 */ 1128 m->m_pkthdr.csum_data = 1129 (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); 1130#ifdef MAC 1131 mac_ipq_reassemble(fp, m); 1132 mac_ipq_destroy(fp); 1133#endif 1134 1135 /* 1136 * Create header for new ip packet by modifying header of first 1137 * packet; dequeue and discard fragment reassembly header. 1138 * Make header visible. 1139 */ 1140 ip->ip_len = (ip->ip_hl << 2) + next; 1141 ip->ip_src = fp->ipq_src; 1142 ip->ip_dst = fp->ipq_dst; 1143 TAILQ_REMOVE(head, fp, ipq_list); 1144 V_nipq--; 1145 uma_zfree(V_ipq_zone, fp); 1146 m->m_len += (ip->ip_hl << 2); 1147 m->m_data -= (ip->ip_hl << 2); 1148 /* some debugging cruft by sklower, below, will go away soon */ 1149 if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ 1150 m_fixhdr(m); 1151 IPSTAT_INC(ips_reassembled); 1152 IPQ_UNLOCK(); 1153 return (m); 1154 1155dropfrag: 1156 IPSTAT_INC(ips_fragdropped); 1157 if (fp != NULL) 1158 fp->ipq_nfrags--; 1159 m_freem(m); 1160done: 1161 IPQ_UNLOCK(); 1162 return (NULL); 1163 1164#undef GETIP 1165} 1166 1167/* 1168 * Free a fragment reassembly header and all 1169 * associated datagrams. 1170 */ 1171static void 1172ip_freef(struct ipqhead *fhp, struct ipq *fp) 1173{ 1174 INIT_VNET_INET(curvnet); 1175 struct mbuf *q; 1176 1177 IPQ_LOCK_ASSERT(); 1178 1179 while (fp->ipq_frags) { 1180 q = fp->ipq_frags; 1181 fp->ipq_frags = q->m_nextpkt; 1182 m_freem(q); 1183 } 1184 TAILQ_REMOVE(fhp, fp, ipq_list); 1185 uma_zfree(V_ipq_zone, fp); 1186 V_nipq--; 1187} 1188 1189/* 1190 * IP timer processing; 1191 * if a timer expires on a reassembly 1192 * queue, discard it. 1193 */ 1194void 1195ip_slowtimo(void) 1196{ 1197 VNET_ITERATOR_DECL(vnet_iter); 1198 struct ipq *fp; 1199 int i; 1200 1201 IPQ_LOCK(); 1202 VNET_LIST_RLOCK(); 1203 VNET_FOREACH(vnet_iter) { 1204 CURVNET_SET(vnet_iter); 1205 INIT_VNET_INET(vnet_iter); 1206 for (i = 0; i < IPREASS_NHASH; i++) { 1207 for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { 1208 struct ipq *fpp; 1209 1210 fpp = fp; 1211 fp = TAILQ_NEXT(fp, ipq_list); 1212 if(--fpp->ipq_ttl == 0) { 1213 IPSTAT_ADD(ips_fragtimeout, 1214 fpp->ipq_nfrags); 1215 ip_freef(&V_ipq[i], fpp); 1216 } 1217 } 1218 } 1219 /* 1220 * If we are over the maximum number of fragments 1221 * (due to the limit being lowered), drain off 1222 * enough to get down to the new limit. 1223 */ 1224 if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { 1225 for (i = 0; i < IPREASS_NHASH; i++) { 1226 while (V_nipq > V_maxnipq && 1227 !TAILQ_EMPTY(&V_ipq[i])) { 1228 IPSTAT_ADD(ips_fragdropped, 1229 TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); 1230 ip_freef(&V_ipq[i], 1231 TAILQ_FIRST(&V_ipq[i])); 1232 } 1233 } 1234 } 1235 CURVNET_RESTORE(); 1236 } 1237 VNET_LIST_RUNLOCK(); 1238 IPQ_UNLOCK(); 1239} 1240 1241/* 1242 * Drain off all datagram fragments. 1243 */ 1244void 1245ip_drain(void) 1246{ 1247 VNET_ITERATOR_DECL(vnet_iter); 1248 int i; 1249 1250 IPQ_LOCK(); 1251 VNET_LIST_RLOCK(); 1252 VNET_FOREACH(vnet_iter) { 1253 CURVNET_SET(vnet_iter); 1254 INIT_VNET_INET(vnet_iter); 1255 for (i = 0; i < IPREASS_NHASH; i++) { 1256 while(!TAILQ_EMPTY(&V_ipq[i])) { 1257 IPSTAT_ADD(ips_fragdropped, 1258 TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); 1259 ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); 1260 } 1261 } 1262 CURVNET_RESTORE(); 1263 } 1264 VNET_LIST_RUNLOCK(); 1265 IPQ_UNLOCK(); 1266 in_rtqdrain(); 1267} 1268 1269/* 1270 * The protocol to be inserted into ip_protox[] must be already registered 1271 * in inetsw[], either statically or through pf_proto_register(). 1272 */ 1273int 1274ipproto_register(u_char ipproto) 1275{ 1276 struct protosw *pr; 1277 1278 /* Sanity checks. */ 1279 if (ipproto == 0) 1280 return (EPROTONOSUPPORT); 1281 1282 /* 1283 * The protocol slot must not be occupied by another protocol 1284 * already. An index pointing to IPPROTO_RAW is unused. 1285 */ 1286 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 1287 if (pr == NULL) 1288 return (EPFNOSUPPORT); 1289 if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ 1290 return (EEXIST); 1291 1292 /* Find the protocol position in inetsw[] and set the index. */ 1293 for (pr = inetdomain.dom_protosw; 1294 pr < inetdomain.dom_protoswNPROTOSW; pr++) { 1295 if (pr->pr_domain->dom_family == PF_INET && 1296 pr->pr_protocol && pr->pr_protocol == ipproto) { 1297 /* Be careful to only index valid IP protocols. */ 1298 if (pr->pr_protocol < IPPROTO_MAX) { 1299 ip_protox[pr->pr_protocol] = pr - inetsw; 1300 return (0); 1301 } else 1302 return (EINVAL); 1303 } 1304 } 1305 return (EPROTONOSUPPORT); 1306} 1307 1308int 1309ipproto_unregister(u_char ipproto) 1310{ 1311 struct protosw *pr; 1312 1313 /* Sanity checks. */ 1314 if (ipproto == 0) 1315 return (EPROTONOSUPPORT); 1316 1317 /* Check if the protocol was indeed registered. */ 1318 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 1319 if (pr == NULL) 1320 return (EPFNOSUPPORT); 1321 if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ 1322 return (ENOENT); 1323 1324 /* Reset the protocol slot to IPPROTO_RAW. */ 1325 ip_protox[ipproto] = pr - inetsw; 1326 return (0); 1327} 1328 1329/* 1330 * Given address of next destination (final or next hop), 1331 * return internet address info of interface to be used to get there. 1332 */ 1333struct in_ifaddr * 1334ip_rtaddr(struct in_addr dst, u_int fibnum) 1335{ 1336 struct route sro; 1337 struct sockaddr_in *sin; 1338 struct in_ifaddr *ifa; 1339 1340 bzero(&sro, sizeof(sro)); 1341 sin = (struct sockaddr_in *)&sro.ro_dst; 1342 sin->sin_family = AF_INET; 1343 sin->sin_len = sizeof(*sin); 1344 sin->sin_addr = dst; 1345 in_rtalloc_ign(&sro, 0, fibnum); 1346 1347 if (sro.ro_rt == NULL) 1348 return (NULL); 1349 1350 ifa = ifatoia(sro.ro_rt->rt_ifa); 1351 RTFREE(sro.ro_rt); 1352 return (ifa); 1353} 1354 1355u_char inetctlerrmap[PRC_NCMDS] = { 1356 0, 0, 0, 0, 1357 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 1358 EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 1359 EMSGSIZE, EHOSTUNREACH, 0, 0, 1360 0, 0, EHOSTUNREACH, 0, 1361 ENOPROTOOPT, ECONNREFUSED 1362}; 1363 1364/* 1365 * Forward a packet. If some error occurs return the sender 1366 * an icmp packet. Note we can't always generate a meaningful 1367 * icmp message because icmp doesn't have a large enough repertoire 1368 * of codes and types. 1369 * 1370 * If not forwarding, just drop the packet. This could be confusing 1371 * if ipforwarding was zero but some routing protocol was advancing 1372 * us as a gateway to somewhere. However, we must let the routing 1373 * protocol deal with that. 1374 * 1375 * The srcrt parameter indicates whether the packet is being forwarded 1376 * via a source route. 1377 */ 1378void 1379ip_forward(struct mbuf *m, int srcrt) 1380{ 1381 INIT_VNET_INET(curvnet); 1382 struct ip *ip = mtod(m, struct ip *); 1383 struct in_ifaddr *ia; 1384 struct mbuf *mcopy; 1385 struct in_addr dest; 1386 struct route ro; 1387 int error, type = 0, code = 0, mtu = 0; 1388 1389 if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { 1390 IPSTAT_INC(ips_cantforward); 1391 m_freem(m); 1392 return; 1393 } 1394#ifdef IPSTEALTH 1395 if (!V_ipstealth) { 1396#endif 1397 if (ip->ip_ttl <= IPTTLDEC) { 1398 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 1399 0, 0); 1400 return; 1401 } 1402#ifdef IPSTEALTH 1403 } 1404#endif 1405 1406 ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); 1407#ifndef IPSEC 1408 /* 1409 * 'ia' may be NULL if there is no route for this destination. 1410 * In case of IPsec, Don't discard it just yet, but pass it to 1411 * ip_output in case of outgoing IPsec policy. 1412 */ 1413 if (!srcrt && ia == NULL) { 1414 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); 1415 return; 1416 } 1417#endif 1418 1419 /* 1420 * Save the IP header and at most 8 bytes of the payload, 1421 * in case we need to generate an ICMP message to the src. 1422 * 1423 * XXX this can be optimized a lot by saving the data in a local 1424 * buffer on the stack (72 bytes at most), and only allocating the 1425 * mbuf if really necessary. The vast majority of the packets 1426 * are forwarded without having to send an ICMP back (either 1427 * because unnecessary, or because rate limited), so we are 1428 * really we are wasting a lot of work here. 1429 * 1430 * We don't use m_copy() because it might return a reference 1431 * to a shared cluster. Both this function and ip_output() 1432 * assume exclusive access to the IP header in `m', so any 1433 * data in a cluster may change before we reach icmp_error(). 1434 */ 1435 MGETHDR(mcopy, M_DONTWAIT, m->m_type); 1436 if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) { 1437 /* 1438 * It's probably ok if the pkthdr dup fails (because 1439 * the deep copy of the tag chain failed), but for now 1440 * be conservative and just discard the copy since 1441 * code below may some day want the tags. 1442 */ 1443 m_free(mcopy); 1444 mcopy = NULL; 1445 } 1446 if (mcopy != NULL) { 1447 mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy)); 1448 mcopy->m_pkthdr.len = mcopy->m_len; 1449 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); 1450 } 1451 1452#ifdef IPSTEALTH 1453 if (!V_ipstealth) { 1454#endif 1455 ip->ip_ttl -= IPTTLDEC; 1456#ifdef IPSTEALTH 1457 } 1458#endif 1459 1460 /* 1461 * If forwarding packet using same interface that it came in on, 1462 * perhaps should send a redirect to sender to shortcut a hop. 1463 * Only send redirect if source is sending directly to us, 1464 * and if packet was not source routed (or has any options). 1465 * Also, don't send redirect if forwarding using a default route 1466 * or a route modified by a redirect. 1467 */ 1468 dest.s_addr = 0; 1469 if (!srcrt && V_ipsendredirects && 1470 ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { 1471 struct sockaddr_in *sin; 1472 struct rtentry *rt; 1473 1474 bzero(&ro, sizeof(ro)); 1475 sin = (struct sockaddr_in *)&ro.ro_dst; 1476 sin->sin_family = AF_INET; 1477 sin->sin_len = sizeof(*sin); 1478 sin->sin_addr = ip->ip_dst; 1479 in_rtalloc_ign(&ro, 0, M_GETFIB(m)); 1480 1481 rt = ro.ro_rt; 1482 1483 if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && 1484 satosin(rt_key(rt))->sin_addr.s_addr != 0) { 1485#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) 1486 u_long src = ntohl(ip->ip_src.s_addr); 1487 1488 if (RTA(rt) && 1489 (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { 1490 if (rt->rt_flags & RTF_GATEWAY) 1491 dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; 1492 else 1493 dest.s_addr = ip->ip_dst.s_addr; 1494 /* Router requirements says to only send host redirects */ 1495 type = ICMP_REDIRECT; 1496 code = ICMP_REDIRECT_HOST; 1497 } 1498 } 1499 if (rt) 1500 RTFREE(rt); 1501 } 1502 1503 /* 1504 * Try to cache the route MTU from ip_output so we can consider it for 1505 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191. 1506 */ 1507 bzero(&ro, sizeof(ro)); 1508 1509 error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); 1510 1511 if (error == EMSGSIZE && ro.ro_rt) 1512 mtu = ro.ro_rt->rt_rmx.rmx_mtu; 1513 if (ro.ro_rt) 1514 RTFREE(ro.ro_rt); 1515 1516 if (error) 1517 IPSTAT_INC(ips_cantforward); 1518 else { 1519 IPSTAT_INC(ips_forward); 1520 if (type) 1521 IPSTAT_INC(ips_redirectsent); 1522 else { 1523 if (mcopy) 1524 m_freem(mcopy); 1525 return; 1526 } 1527 } 1528 if (mcopy == NULL) 1529 return; 1530 1531 switch (error) { 1532 1533 case 0: /* forwarded, but need redirect */ 1534 /* type, code set above */ 1535 break; 1536 1537 case ENETUNREACH: 1538 case EHOSTUNREACH: 1539 case ENETDOWN: 1540 case EHOSTDOWN: 1541 default: 1542 type = ICMP_UNREACH; 1543 code = ICMP_UNREACH_HOST; 1544 break; 1545 1546 case EMSGSIZE: 1547 type = ICMP_UNREACH; 1548 code = ICMP_UNREACH_NEEDFRAG; 1549 1550#ifdef IPSEC 1551 /* 1552 * If IPsec is configured for this path, 1553 * override any possibly mtu value set by ip_output. 1554 */ 1555 mtu = ip_ipsec_mtu(m, mtu); 1556#endif /* IPSEC */ 1557 /* 1558 * If the MTU was set before make sure we are below the 1559 * interface MTU. 1560 * If the MTU wasn't set before use the interface mtu or 1561 * fall back to the next smaller mtu step compared to the 1562 * current packet size. 1563 */ 1564 if (mtu != 0) { 1565 if (ia != NULL) 1566 mtu = min(mtu, ia->ia_ifp->if_mtu); 1567 } else { 1568 if (ia != NULL) 1569 mtu = ia->ia_ifp->if_mtu; 1570 else 1571 mtu = ip_next_mtu(ip->ip_len, 0); 1572 } 1573 IPSTAT_INC(ips_cantfrag); 1574 break; 1575 1576 case ENOBUFS: 1577 /* 1578 * A router should not generate ICMP_SOURCEQUENCH as 1579 * required in RFC1812 Requirements for IP Version 4 Routers. 1580 * Source quench could be a big problem under DoS attacks, 1581 * or if the underlying interface is rate-limited. 1582 * Those who need source quench packets may re-enable them 1583 * via the net.inet.ip.sendsourcequench sysctl. 1584 */ 1585 if (V_ip_sendsourcequench == 0) { 1586 m_freem(mcopy); 1587 return; 1588 } else { 1589 type = ICMP_SOURCEQUENCH; 1590 code = 0; 1591 } 1592 break; 1593 1594 case EACCES: /* ipfw denied packet */ 1595 m_freem(mcopy); 1596 return; 1597 } 1598 icmp_error(mcopy, type, code, dest.s_addr, mtu); 1599} 1600 1601void 1602ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 1603 struct mbuf *m) 1604{ 1605 INIT_VNET_NET(inp->inp_vnet); 1606 1607 if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { 1608 struct bintime bt; 1609 1610 bintime(&bt); 1611 if (inp->inp_socket->so_options & SO_BINTIME) { 1612 *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt), 1613 SCM_BINTIME, SOL_SOCKET); 1614 if (*mp) 1615 mp = &(*mp)->m_next; 1616 } 1617 if (inp->inp_socket->so_options & SO_TIMESTAMP) { 1618 struct timeval tv; 1619 1620 bintime2timeval(&bt, &tv); 1621 *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), 1622 SCM_TIMESTAMP, SOL_SOCKET); 1623 if (*mp) 1624 mp = &(*mp)->m_next; 1625 } 1626 } 1627 if (inp->inp_flags & INP_RECVDSTADDR) { 1628 *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, 1629 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); 1630 if (*mp) 1631 mp = &(*mp)->m_next; 1632 } 1633 if (inp->inp_flags & INP_RECVTTL) { 1634 *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, 1635 sizeof(u_char), IP_RECVTTL, IPPROTO_IP); 1636 if (*mp) 1637 mp = &(*mp)->m_next; 1638 } 1639#ifdef notyet 1640 /* XXX 1641 * Moving these out of udp_input() made them even more broken 1642 * than they already were. 1643 */ 1644 /* options were tossed already */ 1645 if (inp->inp_flags & INP_RECVOPTS) { 1646 *mp = sbcreatecontrol((caddr_t) opts_deleted_above, 1647 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); 1648 if (*mp) 1649 mp = &(*mp)->m_next; 1650 } 1651 /* ip_srcroute doesn't do what we want here, need to fix */ 1652 if (inp->inp_flags & INP_RECVRETOPTS) { 1653 *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), 1654 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); 1655 if (*mp) 1656 mp = &(*mp)->m_next; 1657 } 1658#endif 1659 if (inp->inp_flags & INP_RECVIF) { 1660 struct ifnet *ifp; 1661 struct sdlbuf { 1662 struct sockaddr_dl sdl; 1663 u_char pad[32]; 1664 } sdlbuf; 1665 struct sockaddr_dl *sdp; 1666 struct sockaddr_dl *sdl2 = &sdlbuf.sdl; 1667 1668 if (((ifp = m->m_pkthdr.rcvif)) 1669 && ( ifp->if_index && (ifp->if_index <= V_if_index))) { 1670 sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; 1671 /* 1672 * Change our mind and don't try copy. 1673 */ 1674 if ((sdp->sdl_family != AF_LINK) 1675 || (sdp->sdl_len > sizeof(sdlbuf))) { 1676 goto makedummy; 1677 } 1678 bcopy(sdp, sdl2, sdp->sdl_len); 1679 } else { 1680makedummy: 1681 sdl2->sdl_len 1682 = offsetof(struct sockaddr_dl, sdl_data[0]); 1683 sdl2->sdl_family = AF_LINK; 1684 sdl2->sdl_index = 0; 1685 sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; 1686 } 1687 *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, 1688 IP_RECVIF, IPPROTO_IP); 1689 if (*mp) 1690 mp = &(*mp)->m_next; 1691 } 1692} 1693 1694/* 1695 * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the 1696 * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on 1697 * locking. This code remains in ip_input.c as ip_mroute.c is optionally 1698 * compiled. 1699 */ 1700int 1701ip_rsvp_init(struct socket *so) 1702{ 1703 INIT_VNET_INET(so->so_vnet); 1704 1705 if (so->so_type != SOCK_RAW || 1706 so->so_proto->pr_protocol != IPPROTO_RSVP) 1707 return EOPNOTSUPP; 1708 1709 if (V_ip_rsvpd != NULL) 1710 return EADDRINUSE; 1711 1712 V_ip_rsvpd = so; 1713 /* 1714 * This may seem silly, but we need to be sure we don't over-increment 1715 * the RSVP counter, in case something slips up. 1716 */ 1717 if (!V_ip_rsvp_on) { 1718 V_ip_rsvp_on = 1; 1719 V_rsvp_on++; 1720 } 1721 1722 return 0; 1723} 1724 1725int 1726ip_rsvp_done(void) 1727{ 1728 INIT_VNET_INET(curvnet); 1729 1730 V_ip_rsvpd = NULL; 1731 /* 1732 * This may seem silly, but we need to be sure we don't over-decrement 1733 * the RSVP counter, in case something slips up. 1734 */ 1735 if (V_ip_rsvp_on) { 1736 V_ip_rsvp_on = 0; 1737 V_rsvp_on--; 1738 } 1739 return 0; 1740} 1741 1742void 1743rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ 1744{ 1745 INIT_VNET_INET(curvnet); 1746 1747 if (rsvp_input_p) { /* call the real one if loaded */ 1748 rsvp_input_p(m, off); 1749 return; 1750 } 1751 1752 /* Can still get packets with rsvp_on = 0 if there is a local member 1753 * of the group to which the RSVP packet is addressed. But in this 1754 * case we want to throw the packet away. 1755 */ 1756 1757 if (!V_rsvp_on) { 1758 m_freem(m); 1759 return; 1760 } 1761 1762 if (V_ip_rsvpd != NULL) { 1763 rip_input(m, off); 1764 return; 1765 } 1766 /* Drop the packet */ 1767 m_freem(m); 1768} 1769