1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1993 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. 
Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 61 */ 62/* 63 * NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce 64 * support for mandatory and extensible security protections. This notice 65 * is included in support of clause 2.2 (b) of the Apple Public License, 66 * Version 2.0. 
67 */ 68 69#define _IP_VHL 70 71#include <sys/param.h> 72#include <sys/systm.h> 73#include <sys/mbuf.h> 74#include <sys/malloc.h> 75#include <sys/domain.h> 76#include <sys/protosw.h> 77#include <sys/socket.h> 78#include <sys/time.h> 79#include <sys/kernel.h> 80#include <sys/syslog.h> 81#include <sys/sysctl.h> 82#include <sys/mcache.h> 83#include <sys/socketvar.h> 84#include <sys/kdebug.h> 85#include <mach/mach_time.h> 86#include <mach/sdt.h> 87 88#include <machine/endian.h> 89#include <dev/random/randomdev.h> 90 91#include <kern/queue.h> 92#include <kern/locks.h> 93#include <libkern/OSAtomic.h> 94 95#include <pexpert/pexpert.h> 96 97#include <net/if.h> 98#include <net/if_var.h> 99#include <net/if_dl.h> 100#include <net/route.h> 101#include <net/kpi_protocol.h> 102#include <net/ntstat.h> 103#include <net/dlil.h> 104#include <net/classq/classq.h> 105#if PF 106#include <net/pfvar.h> 107#endif /* PF */ 108 109#include <netinet/in.h> 110#include <netinet/in_systm.h> 111#include <netinet/in_var.h> 112#include <netinet/in_arp.h> 113#include <netinet/ip.h> 114#include <netinet/in_pcb.h> 115#include <netinet/ip_var.h> 116#include <netinet/ip_icmp.h> 117#include <netinet/ip_fw.h> 118#include <netinet/ip_divert.h> 119#include <netinet/kpi_ipfilter_var.h> 120#include <netinet/udp.h> 121#include <netinet/udp_var.h> 122#include <netinet/bootp.h> 123#include <netinet/lro_ext.h> 124 125#if DUMMYNET 126#include <netinet/ip_dummynet.h> 127#endif /* DUMMYNET */ 128 129#if CONFIG_MACF_NET 130#include <security/mac_framework.h> 131#endif /* CONFIG_MACF_NET */ 132 133#if IPSEC 134#include <netinet6/ipsec.h> 135#include <netkey/key.h> 136#endif /* IPSEC */ 137 138#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 0) 139#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 2) 140#define DBG_FNC_IP_INPUT NETDBG_CODE(DBG_NETIP, (2 << 8)) 141 142#if IPSEC 143extern int ipsec_bypass; 144extern lck_mtx_t *sadb_mutex; 145 146lck_grp_t *sadb_stat_mutex_grp; 147lck_grp_attr_t *sadb_stat_mutex_grp_attr; 
148lck_attr_t *sadb_stat_mutex_attr; 149decl_lck_mtx_data(, sadb_stat_mutex_data); 150lck_mtx_t *sadb_stat_mutex = &sadb_stat_mutex_data; 151#endif /* IPSEC */ 152 153#if MROUTING 154int rsvp_on = 0; 155static int ip_rsvp_on; 156struct socket *ip_rsvpd; 157#endif /* MROUTING */ 158 159MBUFQ_HEAD(fq_head); 160 161static int frag_timeout_run; /* frag timer is scheduled to run */ 162static void frag_timeout(void *); 163static void frag_sched_timeout(void); 164 165static struct ipq *ipq_alloc(int); 166static void ipq_free(struct ipq *); 167static void ipq_updateparams(void); 168 169decl_lck_mtx_data(static, ipqlock); 170static lck_attr_t *ipqlock_attr; 171static lck_grp_t *ipqlock_grp; 172static lck_grp_attr_t *ipqlock_grp_attr; 173 174/* Packet reassembly stuff */ 175#define IPREASS_NHASH_LOG2 6 176#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) 177#define IPREASS_HMASK (IPREASS_NHASH - 1) 178#define IPREASS_HASH(x, y) \ 179 (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) 180 181/* IP fragment reassembly queues (protected by ipqlock) */ 182static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; /* ip reassembly queues */ 183static int maxnipq; /* max packets in reass queues */ 184static u_int32_t maxfragsperpacket; /* max frags/packet in reass queues */ 185static u_int32_t nipq; /* # of packets in reass queues */ 186static u_int32_t ipq_limit; /* ipq allocation limit */ 187static u_int32_t ipq_count; /* current # of allocated ipq's */ 188 189static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS; 190static int sysctl_maxnipq SYSCTL_HANDLER_ARGS; 191static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS; 192 193int ipforwarding = 0; 194SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding, 195 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0, 196 sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces"); 197 198static int ipsendredirects = 1; /* XXX */ 199SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, 200 CTLFLAG_RW | 
CTLFLAG_LOCKED, &ipsendredirects, 0, 201 "Enable sending IP redirects"); 202 203int ip_defttl = IPDEFTTL; 204SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED, 205 &ip_defttl, 0, "Maximum TTL on IP packets"); 206 207static int ip_dosourceroute = 0; 208SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, 209 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0, 210 "Enable forwarding source routed IP packets"); 211 212static int ip_acceptsourceroute = 0; 213SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, 214 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0, 215 "Enable accepting source routed IP packets"); 216 217static int ip_sendsourcequench = 0; 218SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, 219 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_sendsourcequench, 0, 220 "Enable the transmission of source quench packets"); 221 222SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, 223 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, sysctl_maxnipq, 224 "I", "Maximum number of IPv4 fragment reassembly queue entries"); 225 226SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD | CTLFLAG_LOCKED, 227 &nipq, 0, "Current number of IPv4 fragment reassembly queue entries"); 228 229SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket, 230 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0, 231 sysctl_maxfragsperpacket, "I", 232 "Maximum number of IPv4 fragments allowed per packet"); 233 234int ip_doscopedroute = 1; 235SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, 236 &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); 237 238static uint32_t ip_adj_clear_hwcksum = 0; 239SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum, 240 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0, 241 "Invalidate hwcksum info when adjusting length"); 242 243/* 244 * XXX - Setting ip_checkinterface mostly implements the receive side of 245 * the Strong ES model described in RFC 1122, but 
since the routing table 246 * and transmit implementation do not implement the Strong ES model, 247 * setting this to 1 results in an odd hybrid. 248 * 249 * XXX - ip_checkinterface currently must be disabled if you use ipnat 250 * to translate the destination address to another local interface. 251 * 252 * XXX - ip_checkinterface must be disabled if you add IP aliases 253 * to the loopback interface instead of the interface where the 254 * packets for those addresses are received. 255 */ 256static int ip_checkinterface = 0; 257SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED, 258 &ip_checkinterface, 0, "Verify packet arrives on correct interface"); 259 260#if DIAGNOSTIC 261static int ipprintfs = 0; 262#endif 263 264struct protosw *ip_protox[IPPROTO_MAX]; 265 266static lck_grp_attr_t *in_ifaddr_rwlock_grp_attr; 267static lck_grp_t *in_ifaddr_rwlock_grp; 268static lck_attr_t *in_ifaddr_rwlock_attr; 269decl_lck_rw_data(, in_ifaddr_rwlock_data); 270lck_rw_t *in_ifaddr_rwlock = &in_ifaddr_rwlock_data; 271 272/* Protected by in_ifaddr_rwlock */ 273struct in_ifaddrhead in_ifaddrhead; /* first inet address */ 274struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ 275 276#define INADDR_NHASH 61 277static u_int32_t inaddr_nhash; /* hash table size */ 278static u_int32_t inaddr_hashp; /* next largest prime */ 279 280static int ip_getstat SYSCTL_HANDLER_ARGS; 281struct ipstat ipstat; 282SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, 283 0, 0, ip_getstat, "S,ipstat", 284 "IP statistics (struct ipstat, netinet/ip_var.h)"); 285 286#if IPCTL_DEFMTU 287SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED, 288 &ip_mtu, 0, "Default MTU"); 289#endif /* IPCTL_DEFMTU */ 290 291#if IPSTEALTH 292static int ipstealth = 0; 293SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED, 294 &ipstealth, 0, ""); 295#endif /* IPSTEALTH */ 296 297/* Firewall hooks */ 298#if IPFIREWALL 
299ip_fw_chk_t *ip_fw_chk_ptr; 300int fw_enable = 1; 301int fw_bypass = 1; 302int fw_one_pass = 0; 303#endif /* IPFIREWALL */ 304 305#if DUMMYNET 306ip_dn_io_t *ip_dn_io_ptr; 307#endif /* DUMMYNET */ 308 309SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, 310 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local"); 311 312struct ip_linklocal_stat ip_linklocal_stat; 313SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, 314 CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat, 315 "Number of link local packets with TTL less than 255"); 316 317SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, 318 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input"); 319 320int ip_linklocal_in_allowbadttl = 1; 321SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, 322 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0, 323 "Allow incoming link local packets with TTL less than 255"); 324 325 326/* 327 * We need to save the IP options in case a protocol wants to respond 328 * to an incoming packet over the same route if the packet got here 329 * using IP source routing. This allows connection establishment and 330 * maintenance when the remote end is on a network that is not known 331 * to us. 
332 */ 333static int ip_nhops = 0; 334static struct ip_srcrt { 335 struct in_addr dst; /* final destination */ 336 char nop; /* one NOP to align */ 337 char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ 338 struct in_addr route[MAX_IPOPTLEN / sizeof (struct in_addr)]; 339} ip_srcrt; 340 341static void in_ifaddrhashtbl_init(void); 342static void save_rte(u_char *, struct in_addr); 343static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *); 344static void ip_forward(struct mbuf *, int, struct sockaddr_in *); 345static void frag_freef(struct ipqhead *, struct ipq *); 346#if IPDIVERT 347#ifdef IPDIVERT_44 348static struct mbuf *ip_reass(struct mbuf *, u_int32_t *, u_int16_t *); 349#else /* !IPDIVERT_44 */ 350static struct mbuf *ip_reass(struct mbuf *, u_int16_t *, u_int16_t *); 351#endif /* !IPDIVERT_44 */ 352#else /* !IPDIVERT */ 353static struct mbuf *ip_reass(struct mbuf *); 354#endif /* !IPDIVERT */ 355static void ip_fwd_route_copyout(struct ifnet *, struct route *); 356static void ip_fwd_route_copyin(struct ifnet *, struct route *); 357static inline u_short ip_cksum(struct mbuf *, int); 358 359int ip_use_randomid = 1; 360SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED, 361 &ip_use_randomid, 0, "Randomize IP packets IDs"); 362 363/* 364 * On platforms which require strict alignment (currently for anything but 365 * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not, 366 * copy the contents of the mbuf chain into a new chain, and free the original 367 * one. Create some head room in the first mbuf of the new chain, in case 368 * it's needed later on. 
369 */ 370#if defined(__i386__) || defined(__x86_64__) 371#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0) 372#else /* !__i386__ && !__x86_64__ */ 373#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { \ 374 if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) { \ 375 struct mbuf *_n; \ 376 struct ifnet *__ifp = (_ifp); \ 377 atomic_add_64(&(__ifp)->if_alignerrs, 1); \ 378 if (((_m)->m_flags & M_PKTHDR) && \ 379 (_m)->m_pkthdr.pkt_hdr != NULL) \ 380 (_m)->m_pkthdr.pkt_hdr = NULL; \ 381 _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT); \ 382 if (_n == NULL) { \ 383 atomic_add_32(&ipstat.ips_toosmall, 1); \ 384 m_freem(_m); \ 385 (_m) = NULL; \ 386 _action; \ 387 } else { \ 388 VERIFY(_n != (_m)); \ 389 (_m) = _n; \ 390 } \ 391 } \ 392} while (0) 393#endif /* !__i386__ && !__x86_64__ */ 394 395/* 396 * GRE input handler function, settable via ip_gre_register_input() for PPTP. 397 */ 398static gre_input_func_t gre_input_func; 399 400/* 401 * IP initialization: fill in IP protocol switch table. 402 * All protocols not implemented in kernel go to raw IP protocol handler. 403 */ 404void 405ip_init(struct protosw *pp, struct domain *dp) 406{ 407 static int ip_initialized = 0; 408 struct protosw *pr; 409 struct timeval tv; 410 int i; 411 412 domain_proto_mtx_lock_assert_held(); 413 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED); 414 415 /* ipq_alloc() uses mbufs for IP fragment queue structures */ 416 _CASSERT(sizeof (struct ipq) <= _MLEN); 417 418 /* 419 * Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is 420 * interchangeable with in_aliasreq; they must have the same size. 
421 */ 422 _CASSERT(sizeof (struct ifaliasreq) == sizeof (struct in_aliasreq)); 423 424 if (ip_initialized) 425 return; 426 ip_initialized = 1; 427 428 PE_parse_boot_argn("net.inet.ip.scopedroute", 429 &ip_doscopedroute, sizeof (ip_doscopedroute)); 430 431 in_ifaddr_init(); 432 433 in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); 434 in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock", 435 in_ifaddr_rwlock_grp_attr); 436 in_ifaddr_rwlock_attr = lck_attr_alloc_init(); 437 lck_rw_init(in_ifaddr_rwlock, in_ifaddr_rwlock_grp, 438 in_ifaddr_rwlock_attr); 439 440 TAILQ_INIT(&in_ifaddrhead); 441 in_ifaddrhashtbl_init(); 442 443 ip_moptions_init(); 444 445 pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW); 446 if (pr == NULL) { 447 panic("%s: Unable to find [PF_INET,IPPROTO_RAW,SOCK_RAW]\n", 448 __func__); 449 /* NOTREACHED */ 450 } 451 452 /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ 453 for (i = 0; i < IPPROTO_MAX; i++) 454 ip_protox[i] = pr; 455 /* 456 * Cycle through IP protocols and put them into the appropriate place 457 * in ip_protox[], skipping protocols IPPROTO_{IP,RAW}. 458 */ 459 VERIFY(dp == inetdomain && dp->dom_family == PF_INET); 460 TAILQ_FOREACH(pr, &dp->dom_protosw, pr_entry) { 461 VERIFY(pr->pr_domain == dp); 462 if (pr->pr_protocol != 0 && pr->pr_protocol != IPPROTO_RAW) { 463 /* Be careful to only index valid IP protocols. */ 464 if (pr->pr_protocol < IPPROTO_MAX) 465 ip_protox[pr->pr_protocol] = pr; 466 } 467 } 468 469 /* IP fragment reassembly queue lock */ 470 ipqlock_grp_attr = lck_grp_attr_alloc_init(); 471 ipqlock_grp = lck_grp_alloc_init("ipqlock", ipqlock_grp_attr); 472 ipqlock_attr = lck_attr_alloc_init(); 473 lck_mtx_init(&ipqlock, ipqlock_grp, ipqlock_attr); 474 475 lck_mtx_lock(&ipqlock); 476 /* Initialize IP reassembly queue. 
*/ 477 for (i = 0; i < IPREASS_NHASH; i++) 478 TAILQ_INIT(&ipq[i]); 479 480 maxnipq = nmbclusters / 32; 481 maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */ 482 ipq_updateparams(); 483 lck_mtx_unlock(&ipqlock); 484 485 getmicrotime(&tv); 486 ip_id = RandomULong() ^ tv.tv_usec; 487 ip_initid(); 488 489 ipf_init(); 490 491#if IPSEC 492 sadb_stat_mutex_grp_attr = lck_grp_attr_alloc_init(); 493 sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat", 494 sadb_stat_mutex_grp_attr); 495 sadb_stat_mutex_attr = lck_attr_alloc_init(); 496 lck_mtx_init(sadb_stat_mutex, sadb_stat_mutex_grp, 497 sadb_stat_mutex_attr); 498 499#endif 500 arp_init(); 501} 502 503/* 504 * Initialize IPv4 source address hash table. 505 */ 506static void 507in_ifaddrhashtbl_init(void) 508{ 509 int i, k, p; 510 511 if (in_ifaddrhashtbl != NULL) 512 return; 513 514 PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash, 515 sizeof (inaddr_nhash)); 516 if (inaddr_nhash == 0) 517 inaddr_nhash = INADDR_NHASH; 518 519 MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *, 520 inaddr_nhash * sizeof (*in_ifaddrhashtbl), 521 M_IFADDR, M_WAITOK | M_ZERO); 522 if (in_ifaddrhashtbl == NULL) 523 panic("in_ifaddrhashtbl_init allocation failed"); 524 525 /* 526 * Generate the next largest prime greater than inaddr_nhash. 527 */ 528 k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2; 529 for (;;) { 530 p = 1; 531 for (i = 3; i * i <= k; i += 2) { 532 if (k % i == 0) 533 p = 0; 534 } 535 if (p == 1) 536 break; 537 k += 2; 538 } 539 inaddr_hashp = k; 540} 541 542u_int32_t 543inaddr_hashval(u_int32_t key) 544{ 545 /* 546 * The hash index is the computed prime times the key modulo 547 * the hash size, as documented in "Introduction to Algorithms" 548 * (Cormen, Leiserson, Rivest). 
549 */ 550 if (inaddr_nhash > 1) 551 return ((key * inaddr_hashp) % inaddr_nhash); 552 else 553 return (0); 554} 555 556void 557ip_proto_dispatch_in_wrapper(struct mbuf *m, int hlen, u_int8_t proto) 558{ 559 ip_proto_dispatch_in(m, hlen, proto, 0); 560} 561 562__private_extern__ void 563ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto, 564 ipfilter_t inject_ipfref) 565{ 566 struct ipfilter *filter; 567 int seen = (inject_ipfref == NULL); 568 int changed_header = 0; 569 struct ip *ip; 570 void (*pr_input)(struct mbuf *, int len); 571 572 if (!TAILQ_EMPTY(&ipv4_filters)) { 573 ipf_ref(); 574 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { 575 if (seen == 0) { 576 if ((struct ipfilter *)inject_ipfref == filter) 577 seen = 1; 578 } else if (filter->ipf_filter.ipf_input) { 579 errno_t result; 580 581 if (changed_header == 0) { 582 /* 583 * Perform IP header alignment fixup, 584 * if needed, before passing packet 585 * into filter(s). 586 */ 587 IP_HDR_ALIGNMENT_FIXUP(m, 588 m->m_pkthdr.rcvif, ipf_unref()); 589 590 /* ipf_unref() already called */ 591 if (m == NULL) 592 return; 593 594 changed_header = 1; 595 ip = mtod(m, struct ip *); 596 ip->ip_len = htons(ip->ip_len + hlen); 597 ip->ip_off = htons(ip->ip_off); 598 ip->ip_sum = 0; 599 ip->ip_sum = ip_cksum_hdr_in(m, hlen); 600 } 601 result = filter->ipf_filter.ipf_input( 602 filter->ipf_filter.cookie, (mbuf_t *)&m, 603 hlen, proto); 604 if (result == EJUSTRETURN) { 605 ipf_unref(); 606 return; 607 } 608 if (result != 0) { 609 ipf_unref(); 610 m_freem(m); 611 return; 612 } 613 } 614 } 615 ipf_unref(); 616 } 617 618 /* Perform IP header alignment fixup (post-filters), if needed */ 619 IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return); 620 621 /* 622 * If there isn't a specific lock for the protocol 623 * we're about to call, use the generic lock for AF_INET. 
624 * otherwise let the protocol deal with its own locking 625 */ 626 ip = mtod(m, struct ip *); 627 628 if (changed_header) { 629 ip->ip_len = ntohs(ip->ip_len) - hlen; 630 ip->ip_off = ntohs(ip->ip_off); 631 } 632 633 if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) { 634 m_freem(m); 635 } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) { 636 lck_mtx_lock(inet_domain_mutex); 637 pr_input(m, hlen); 638 lck_mtx_unlock(inet_domain_mutex); 639 } else { 640 pr_input(m, hlen); 641 } 642} 643 644/* 645 * Ip input routine. Checksum and byte swap header. If fragmented 646 * try to reassemble. Process options. Pass to next level. 647 */ 648void 649ip_input(struct mbuf *m) 650{ 651 struct ip *ip; 652 struct in_ifaddr *ia = NULL; 653 unsigned int hlen, checkif; 654 u_short sum = 0; 655 struct in_addr pkt_dst; 656#if IPFIREWALL 657 int i; 658 u_int32_t div_info = 0; /* packet divert/tee info */ 659#endif 660#if IPFIREWALL || DUMMYNET 661 struct ip_fw_args args; 662 struct m_tag *tag; 663#endif 664 ipfilter_t inject_filter_ref = NULL; 665 struct ifnet *inifp; 666 667 /* Check if the mbuf is still valid after interface filter processing */ 668 MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); 669 inifp = m->m_pkthdr.rcvif; 670 VERIFY(inifp != NULL); 671 672 /* Perform IP header alignment fixup, if needed */ 673 IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad); 674 675 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED; 676 677#if IPFIREWALL || DUMMYNET 678 bzero(&args, sizeof (struct ip_fw_args)); 679 680 /* 681 * Don't bother searching for tag(s) if there's none. 
682 */ 683 if (SLIST_EMPTY(&m->m_pkthdr.tags)) 684 goto ipfw_tags_done; 685 686 /* Grab info from mtags prepended to the chain */ 687#if DUMMYNET 688 if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, 689 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { 690 struct dn_pkt_tag *dn_tag; 691 692 dn_tag = (struct dn_pkt_tag *)(tag+1); 693 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; 694 args.fwa_pf_rule = dn_tag->dn_pf_rule; 695 696 m_tag_delete(m, tag); 697 } 698#endif /* DUMMYNET */ 699 700#if IPDIVERT 701 if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, 702 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { 703 struct divert_tag *div_tag; 704 705 div_tag = (struct divert_tag *)(tag+1); 706 args.fwa_divert_rule = div_tag->cookie; 707 708 m_tag_delete(m, tag); 709 } 710#endif 711 712 if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, 713 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { 714 struct ip_fwd_tag *ipfwd_tag; 715 716 ipfwd_tag = (struct ip_fwd_tag *)(tag+1); 717 args.fwa_next_hop = ipfwd_tag->next_hop; 718 719 m_tag_delete(m, tag); 720 } 721 722#if DIAGNOSTIC 723 if (m == NULL || !(m->m_flags & M_PKTHDR)) 724 panic("ip_input no HDR"); 725#endif 726 727#if DUMMYNET 728 if (args.fwa_ipfw_rule || args.fwa_pf_rule) { 729 /* dummynet already filtered us */ 730 ip = mtod(m, struct ip *); 731 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 732 inject_filter_ref = ipf_get_inject_filter(m); 733#if IPFIREWALL 734 if (args.fwa_ipfw_rule) 735 goto iphack; 736#endif /* IPFIREWALL */ 737 if (args.fwa_pf_rule) 738 goto check_with_pf; 739 } 740#endif /* DUMMYNET */ 741ipfw_tags_done: 742#endif /* IPFIREWALL || DUMMYNET */ 743 744 /* 745 * No need to process packet twice if we've already seen it. 
746 */ 747 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) 748 inject_filter_ref = ipf_get_inject_filter(m); 749 if (inject_filter_ref != NULL) { 750 ip = mtod(m, struct ip *); 751 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 752 753 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, 754 struct ip *, ip, struct ifnet *, inifp, 755 struct ip *, ip, struct ip6_hdr *, NULL); 756 757 ip->ip_len = ntohs(ip->ip_len) - hlen; 758 ip->ip_off = ntohs(ip->ip_off); 759 ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref); 760 return; 761 } 762 763 OSAddAtomic(1, &ipstat.ips_total); 764 if (m->m_pkthdr.len < sizeof (struct ip)) 765 goto tooshort; 766 767 if (m->m_len < sizeof (struct ip) && 768 (m = m_pullup(m, sizeof (struct ip))) == NULL) { 769 OSAddAtomic(1, &ipstat.ips_toosmall); 770 return; 771 } 772 ip = mtod(m, struct ip *); 773 774 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, 775 ip->ip_p, ip->ip_off, ip->ip_len); 776 777 if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { 778 OSAddAtomic(1, &ipstat.ips_badvers); 779 goto bad; 780 } 781 782 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 783 if (hlen < sizeof (struct ip)) { /* minimum header length */ 784 OSAddAtomic(1, &ipstat.ips_badhlen); 785 goto bad; 786 } 787 if (hlen > m->m_len) { 788 if ((m = m_pullup(m, hlen)) == NULL) { 789 OSAddAtomic(1, &ipstat.ips_badhlen); 790 return; 791 } 792 ip = mtod(m, struct ip *); 793 } 794 795 /* 127/8 must not appear on wire - RFC1122 */ 796 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 797 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 798 /* 799 * Allow for the following exceptions: 800 * 801 * 1. If the packet was sent to loopback (i.e. rcvif 802 * would have been set earlier at output time.) 803 * 804 * 2. If the packet was sent out on loopback from a local 805 * source address which belongs to a non-loopback 806 * interface (i.e. rcvif may not necessarily be a 807 * loopback interface, hence the test for PKTF_LOOP.) 
808 * Unlike IPv6, there is no interface scope ID, and 809 * therefore we don't care so much about PKTF_IFINFO. 810 */ 811 if (!(inifp->if_flags & IFF_LOOPBACK) && 812 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) { 813 OSAddAtomic(1, &ipstat.ips_badaddr); 814 goto bad; 815 } 816 } 817 818 /* IPv4 Link-Local Addresses as defined in RFC3927 */ 819 if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || 820 IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) { 821 ip_linklocal_stat.iplls_in_total++; 822 if (ip->ip_ttl != MAXTTL) { 823 OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl); 824 /* Silently drop link local traffic with bad TTL */ 825 if (!ip_linklocal_in_allowbadttl) 826 goto bad; 827 } 828 } 829 830 sum = ip_cksum(m, hlen); 831 if (sum) { 832 goto bad; 833 } 834 835 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, 836 struct ip *, ip, struct ifnet *, inifp, 837 struct ip *, ip, struct ip6_hdr *, NULL); 838 839 /* 840 * Naively assume we can attribute inbound data to the route we would 841 * use to send to this destination. Asymetric routing breaks this 842 * assumption, but it still allows us to account for traffic from 843 * a remote node in the routing table. 844 * this has a very significant performance impact so we bypass 845 * if nstat_collect is disabled. We may also bypass if the 846 * protocol is tcp in the future because tcp will have a route that 847 * we can use to attribute the data to. That does mean we would not 848 * account for forwarded tcp traffic. 849 */ 850 if (nstat_collect) { 851 struct rtentry *rt = 852 ifnet_cached_rtlookup_inet(inifp, ip->ip_src); 853 if (rt != NULL) { 854 nstat_route_rx(rt, 1, m->m_pkthdr.len, 0); 855 rtfree(rt); 856 } 857 } 858 859 /* 860 * Convert fields to host representation. 
861 */ 862#if BYTE_ORDER != BIG_ENDIAN 863 NTOHS(ip->ip_len); 864#endif 865 866 if (ip->ip_len < hlen) { 867 OSAddAtomic(1, &ipstat.ips_badlen); 868 goto bad; 869 } 870 871#if BYTE_ORDER != BIG_ENDIAN 872 NTOHS(ip->ip_off); 873#endif 874 /* 875 * Check that the amount of data in the buffers 876 * is as at least much as the IP header would have us expect. 877 * Trim mbufs if longer than we expect. 878 * Drop packet if shorter than we expect. 879 */ 880 if (m->m_pkthdr.len < ip->ip_len) { 881tooshort: 882 OSAddAtomic(1, &ipstat.ips_tooshort); 883 goto bad; 884 } 885 if (m->m_pkthdr.len > ip->ip_len) { 886 /* 887 * Invalidate hardware checksum info if ip_adj_clear_hwcksum 888 * is set; useful to handle buggy drivers. Note that this 889 * should not be enabled by default, as we may get here due 890 * to link-layer padding. 891 */ 892 if (ip_adj_clear_hwcksum && 893 (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && 894 !(inifp->if_flags & IFF_LOOPBACK) && 895 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) { 896 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID; 897 m->m_pkthdr.csum_data = 0; 898 ipstat.ips_adj_hwcsum_clr++; 899 } 900 901 ipstat.ips_adj++; 902 if (m->m_len == m->m_pkthdr.len) { 903 m->m_len = ip->ip_len; 904 m->m_pkthdr.len = ip->ip_len; 905 } else 906 m_adj(m, ip->ip_len - m->m_pkthdr.len); 907 } 908 909 /* for consistency */ 910 m->m_pkthdr.pkt_proto = ip->ip_p; 911 912#if DUMMYNET 913check_with_pf: 914#endif 915#if PF 916 /* Invoke inbound packet filter */ 917 if (PF_IS_ENABLED) { 918 int error; 919#if DUMMYNET 920 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args); 921#else 922 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL); 923#endif /* DUMMYNET */ 924 if (error != 0 || m == NULL) { 925 if (m != NULL) { 926 panic("%s: unexpected packet %p\n", 927 __func__, m); 928 /* NOTREACHED */ 929 } 930 /* Already freed by callee */ 931 return; 932 } 933 ip = mtod(m, struct ip *); 934 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 935 } 936#endif /* PF */ 937 938#if IPSEC 939 
if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) 940 goto pass; 941#endif 942 943#if IPFIREWALL 944#if DUMMYNET 945iphack: 946#endif /* DUMMYNET */ 947 /* 948 * Check if we want to allow this packet to be processed. 949 * Consider it to be bad if not. 950 */ 951 if (fw_enable && IPFW_LOADED) { 952#if IPFIREWALL_FORWARD 953 /* 954 * If we've been forwarded from the output side, then 955 * skip the firewall a second time 956 */ 957 if (args.fwa_next_hop) 958 goto ours; 959#endif /* IPFIREWALL_FORWARD */ 960 961 args.fwa_m = m; 962 963 i = ip_fw_chk_ptr(&args); 964 m = args.fwa_m; 965 966 if ((i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */ 967 if (m) 968 m_freem(m); 969 return; 970 } 971 ip = mtod(m, struct ip *); /* just in case m changed */ 972 973 if (i == 0 && args.fwa_next_hop == NULL) { /* common case */ 974 goto pass; 975 } 976#if DUMMYNET 977 if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) { 978 /* Send packet to the appropriate pipe */ 979 ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args, 980 DN_CLIENT_IPFW); 981 return; 982 } 983#endif /* DUMMYNET */ 984#if IPDIVERT 985 if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { 986 /* Divert or tee packet */ 987 div_info = i; 988 goto ours; 989 } 990#endif 991#if IPFIREWALL_FORWARD 992 if (i == 0 && args.fwa_next_hop != NULL) { 993 goto pass; 994 } 995#endif 996 /* 997 * if we get here, the packet must be dropped 998 */ 999 m_freem(m); 1000 return; 1001 } 1002#endif /* IPFIREWALL */ 1003#if IPSEC | IPFIREWALL 1004pass: 1005#endif 1006 /* 1007 * Process options and, if not destined for us, 1008 * ship it on. ip_dooptions returns 1 when an 1009 * error was detected (causing an icmp message 1010 * to be sent and the original packet to be freed). 
1011 */ 1012 ip_nhops = 0; /* for source routed packets */ 1013#if IPFIREWALL 1014 if (hlen > sizeof (struct ip) && 1015 ip_dooptions(m, 0, args.fwa_next_hop)) { 1016#else /* !IPFIREWALL */ 1017 if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL)) { 1018#endif /* !IPFIREWALL */ 1019 return; 1020 } 1021 1022#if MROUTING 1023 /* 1024 * greedy RSVP, snatches any PATH packet of the RSVP protocol and no 1025 * matter if it is destined to another node, or whether it is 1026 * a multicast one, RSVP wants it! and prevents it from being forwarded 1027 * anywhere else. Also checks if the rsvp daemon is running before 1028 * grabbing the packet. 1029 */ 1030 if (rsvp_on && ip->ip_p == IPPROTO_RSVP) { 1031 ip_setdstifaddr_info(m, inifp->if_index, NULL); 1032 goto ours; 1033 } 1034#endif /* MROUTING */ 1035 1036 /* 1037 * Check our list of addresses, to see if the packet is for us. 1038 * If we don't have any addresses, assume any unicast packet 1039 * we receive might be for us (and let the upper layers deal 1040 * with it). 1041 */ 1042 if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST|M_BCAST))) { 1043 ip_setdstifaddr_info(m, inifp->if_index, NULL); 1044 goto ours; 1045 } 1046 1047 /* 1048 * Cache the destination address of the packet; this may be 1049 * changed by use of 'ipfw fwd'. 1050 */ 1051#if IPFIREWALL 1052 pkt_dst = args.fwa_next_hop == NULL ? 1053 ip->ip_dst : args.fwa_next_hop->sin_addr; 1054#else /* !IPFIREWALL */ 1055 pkt_dst = ip->ip_dst; 1056#endif /* !IPFIREWALL */ 1057 1058 /* 1059 * Enable a consistency check between the destination address 1060 * and the arrival interface for a unicast packet (the RFC 1122 1061 * strong ES model) if IP forwarding is disabled and the packet 1062 * is not locally generated and the packet is not subject to 1063 * 'ipfw fwd'. 1064 * 1065 * XXX - Checking also should be disabled if the destination 1066 * address is ipnat'ed to a different interface. 
1067 * 1068 * XXX - Checking is incompatible with IP aliases added 1069 * to the loopback interface instead of the interface where 1070 * the packets are received. 1071 */ 1072 checkif = ip_checkinterface && (ipforwarding == 0) && 1073 !(inifp->if_flags & IFF_LOOPBACK) && 1074 !(m->m_pkthdr.pkt_flags & PKTF_LOOP) 1075#if IPFIREWALL 1076 && (args.fwa_next_hop == NULL); 1077#else /* !IPFIREWALL */ 1078 ; 1079#endif /* !IPFIREWALL */ 1080 1081 /* 1082 * Check for exact addresses in the hash bucket. 1083 */ 1084 lck_rw_lock_shared(in_ifaddr_rwlock); 1085 TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) { 1086 /* 1087 * If the address matches, verify that the packet 1088 * arrived via the correct interface if checking is 1089 * enabled. 1090 */ 1091 if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && 1092 (!checkif || ia->ia_ifp == inifp)) { 1093 ip_setdstifaddr_info(m, 0, ia); 1094 lck_rw_done(in_ifaddr_rwlock); 1095 goto ours; 1096 } 1097 } 1098 lck_rw_done(in_ifaddr_rwlock); 1099 1100 /* 1101 * Check for broadcast addresses. 1102 * 1103 * Only accept broadcast packets that arrive via the matching 1104 * interface. Reception of forwarded directed broadcasts would be 1105 * handled via ip_forward() and ether_frameout() with the loopback 1106 * into the stack for SIMPLEX interfaces handled by ether_frameout(). 
1107 */ 1108 if (inifp->if_flags & IFF_BROADCAST) { 1109 struct ifaddr *ifa; 1110 1111 ifnet_lock_shared(inifp); 1112 TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) { 1113 if (ifa->ifa_addr->sa_family != AF_INET) { 1114 continue; 1115 } 1116 ia = ifatoia(ifa); 1117 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == 1118 pkt_dst.s_addr || ia->ia_netbroadcast.s_addr == 1119 pkt_dst.s_addr) { 1120 ip_setdstifaddr_info(m, 0, ia); 1121 ifnet_lock_done(inifp); 1122 goto ours; 1123 } 1124 } 1125 ifnet_lock_done(inifp); 1126 } 1127 1128 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 1129 struct in_multi *inm; 1130#if MROUTING 1131 if (ip_mrouter) { 1132 /* 1133 * If we are acting as a multicast router, all 1134 * incoming multicast packets are passed to the 1135 * kernel-level multicast forwarding function. 1136 * The packet is returned (relatively) intact; if 1137 * ip_mforward() returns a non-zero value, the packet 1138 * must be discarded, else it may be accepted below. 1139 */ 1140 if (ip_mforward && ip_mforward(ip, inifp, m, 0) != 0) { 1141 OSAddAtomic(1, &ipstat.ips_cantforward); 1142 m_freem(m); 1143 return; 1144 } 1145 1146 /* 1147 * The process-level routing daemon needs to receive 1148 * all multicast IGMP packets, whether or not this 1149 * host belongs to their destination groups. 1150 */ 1151 if (ip->ip_p == IPPROTO_IGMP) { 1152 ip_setdstifaddr_info(m, inifp->if_index, NULL); 1153 goto ours; 1154 } 1155 OSAddAtomic(1, &ipstat.ips_forward); 1156 } 1157#endif /* MROUTING */ 1158 /* 1159 * See if we belong to the destination multicast group on the 1160 * arrival interface. 
1161 */ 1162 in_multihead_lock_shared(); 1163 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm); 1164 in_multihead_lock_done(); 1165 if (inm == NULL) { 1166 OSAddAtomic(1, &ipstat.ips_notmember); 1167 m_freem(m); 1168 return; 1169 } 1170 ip_setdstifaddr_info(m, inifp->if_index, NULL); 1171 INM_REMREF(inm); 1172 goto ours; 1173 } 1174 if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST || 1175 ip->ip_dst.s_addr == INADDR_ANY) { 1176 ip_setdstifaddr_info(m, inifp->if_index, NULL); 1177 goto ours; 1178 } 1179 1180 /* Allow DHCP/BootP responses through */ 1181 if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) && 1182 hlen == sizeof (struct ip) && ip->ip_p == IPPROTO_UDP) { 1183 struct udpiphdr *ui; 1184 1185 if (m->m_len < sizeof (struct udpiphdr) && 1186 (m = m_pullup(m, sizeof (struct udpiphdr))) == NULL) { 1187 OSAddAtomic(1, &udpstat.udps_hdrops); 1188 return; 1189 } 1190 ui = mtod(m, struct udpiphdr *); 1191 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) { 1192 ip_setdstifaddr_info(m, inifp->if_index, NULL); 1193 goto ours; 1194 } 1195 ip = mtod(m, struct ip *); /* in case it changed */ 1196 } 1197 1198 /* 1199 * Not for us; forward if possible and desirable. 1200 */ 1201 if (ipforwarding == 0) { 1202 OSAddAtomic(1, &ipstat.ips_cantforward); 1203 m_freem(m); 1204 } else { 1205#if IPFIREWALL 1206 ip_forward(m, 0, args.fwa_next_hop); 1207#else 1208 ip_forward(m, 0, NULL); 1209#endif 1210 } 1211 return; 1212 1213ours: 1214 /* 1215 * If offset or IP_MF are set, must reassemble. 1216 */ 1217 if (ip->ip_off & ~(IP_DF | IP_RF)) { 1218 /* 1219 * ip_reass() will return a different mbuf, and update 1220 * the divert info in div_info and args.fwa_divert_rule. 
1221 */ 1222#if IPDIVERT 1223 m = ip_reass(m, (u_int16_t *)&div_info, &args.fwa_divert_rule); 1224#else 1225 m = ip_reass(m); 1226#endif 1227 if (m == NULL) 1228 return; 1229 ip = mtod(m, struct ip *); 1230 /* Get the header length of the reassembled packet */ 1231 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 1232#if IPDIVERT 1233 /* Restore original checksum before diverting packet */ 1234 if (div_info != 0) { 1235#if BYTE_ORDER != BIG_ENDIAN 1236 HTONS(ip->ip_len); 1237 HTONS(ip->ip_off); 1238#endif 1239 ip->ip_sum = 0; 1240 ip->ip_sum = ip_cksum_hdr_in(m, hlen); 1241#if BYTE_ORDER != BIG_ENDIAN 1242 NTOHS(ip->ip_off); 1243 NTOHS(ip->ip_len); 1244#endif 1245 } 1246#endif 1247 } 1248 1249 /* 1250 * Further protocols expect the packet length to be w/o the 1251 * IP header. 1252 */ 1253 ip->ip_len -= hlen; 1254 1255#if IPDIVERT 1256 /* 1257 * Divert or tee packet to the divert protocol if required. 1258 * 1259 * If div_info is zero then cookie should be too, so we shouldn't 1260 * need to clear them here. Assume divert_packet() does so also. 1261 */ 1262 if (div_info != 0) { 1263 struct mbuf *clone = NULL; 1264 1265 /* Clone packet if we're doing a 'tee' */ 1266 if (div_info & IP_FW_PORT_TEE_FLAG) 1267 clone = m_dup(m, M_DONTWAIT); 1268 1269 /* Restore packet header fields to original values */ 1270 ip->ip_len += hlen; 1271 1272#if BYTE_ORDER != BIG_ENDIAN 1273 HTONS(ip->ip_len); 1274 HTONS(ip->ip_off); 1275#endif 1276 /* Deliver packet to divert input routine */ 1277 OSAddAtomic(1, &ipstat.ips_delivered); 1278 divert_packet(m, 1, div_info & 0xffff, args.fwa_divert_rule); 1279 1280 /* If 'tee', continue with original packet */ 1281 if (clone == NULL) { 1282 return; 1283 } 1284 m = clone; 1285 ip = mtod(m, struct ip *); 1286 } 1287#endif 1288 1289#if IPSEC 1290 /* 1291 * enforce IPsec policy checking if we are seeing last header. 1292 * note that we do not visit this with protocols with pcb layer 1293 * code - like udp/tcp/raw ip. 
1294 */ 1295 if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) { 1296 if (ipsec4_in_reject(m, NULL)) { 1297 IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); 1298 goto bad; 1299 } 1300 } 1301#endif /* IPSEC */ 1302 1303 /* 1304 * Switch out to protocol's input routine. 1305 */ 1306 OSAddAtomic(1, &ipstat.ips_delivered); 1307 1308#if IPFIREWALL 1309 if (args.fwa_next_hop && ip->ip_p == IPPROTO_TCP) { 1310 /* TCP needs IPFORWARD info if available */ 1311 struct m_tag *fwd_tag; 1312 struct ip_fwd_tag *ipfwd_tag; 1313 1314 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, 1315 KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag), 1316 M_NOWAIT, m); 1317 if (fwd_tag == NULL) 1318 goto bad; 1319 1320 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); 1321 ipfwd_tag->next_hop = args.fwa_next_hop; 1322 1323 m_tag_prepend(m, fwd_tag); 1324 1325 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, 1326 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); 1327 1328 /* TCP deals with its own locking */ 1329 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); 1330 } else { 1331 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, 1332 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); 1333 1334 if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) { 1335 m = tcp_lro(m, hlen); 1336 if (m == NULL) 1337 return; 1338 } 1339 1340 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); 1341 } 1342#else /* !IPFIREWALL */ 1343 if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) { 1344 m = tcp_lro(m, hlen); 1345 if (m == NULL) 1346 return; 1347 } 1348 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); 1349#endif /* !IPFIREWALL */ 1350 return; 1351 1352bad: 1353 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); 1354 m_freem(m); 1355} 1356 1357static void 1358ipq_updateparams(void) 1359{ 1360 lck_mtx_assert(&ipqlock, LCK_MTX_ASSERT_OWNED); 1361 /* 1362 * -1 for unlimited allocation. 1363 */ 1364 if (maxnipq < 0) 1365 ipq_limit = 0; 1366 /* 1367 * Positive number for specific bound. 
1368 */ 1369 if (maxnipq > 0) 1370 ipq_limit = maxnipq; 1371 /* 1372 * Zero specifies no further fragment queue allocation -- set the 1373 * bound very low, but rely on implementation elsewhere to actually 1374 * prevent allocation and reclaim current queues. 1375 */ 1376 if (maxnipq == 0) 1377 ipq_limit = 1; 1378 /* 1379 * Arm the purge timer if not already and if there's work to do 1380 */ 1381 frag_sched_timeout(); 1382} 1383 1384static int 1385sysctl_maxnipq SYSCTL_HANDLER_ARGS 1386{ 1387#pragma unused(arg1, arg2) 1388 int error, i; 1389 1390 lck_mtx_lock(&ipqlock); 1391 i = maxnipq; 1392 error = sysctl_handle_int(oidp, &i, 0, req); 1393 if (error || req->newptr == USER_ADDR_NULL) 1394 goto done; 1395 /* impose bounds */ 1396 if (i < -1 || i > (nmbclusters / 4)) { 1397 error = EINVAL; 1398 goto done; 1399 } 1400 maxnipq = i; 1401 ipq_updateparams(); 1402done: 1403 lck_mtx_unlock(&ipqlock); 1404 return (error); 1405} 1406 1407static int 1408sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS 1409{ 1410#pragma unused(arg1, arg2) 1411 int error, i; 1412 1413 lck_mtx_lock(&ipqlock); 1414 i = maxfragsperpacket; 1415 error = sysctl_handle_int(oidp, &i, 0, req); 1416 if (error || req->newptr == USER_ADDR_NULL) 1417 goto done; 1418 maxfragsperpacket = i; 1419 ipq_updateparams(); /* see if we need to arm timer */ 1420done: 1421 lck_mtx_unlock(&ipqlock); 1422 return (error); 1423} 1424 1425/* 1426 * Take incoming datagram fragment and try to reassemble it into 1427 * whole datagram. If a chain for reassembly of this datagram already 1428 * exists, then it is given as fp; otherwise have to make a chain. 1429 * 1430 * When IPDIVERT enabled, keep additional state with each packet that 1431 * tells us if we need to divert or tee the packet we're building. 1432 * 1433 * The IP header is *NOT* adjusted out of iplen. 
 */
static struct mbuf *
#if IPDIVERT
ip_reass(struct mbuf *m,
#ifdef IPDIVERT_44
    u_int32_t *divinfo,
#else /* IPDIVERT_44 */
    u_int16_t *divinfo,
#endif /* IPDIVERT_44 */
    u_int16_t *divcookie)
#else /* IPDIVERT */
ip_reass(struct mbuf *m)
#endif /* IPDIVERT */
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp = NULL;
	struct ipqhead *head;
	int i, hlen, next;
	u_int8_t ecn, ecn0;
	uint32_t csum, csum_flags;
	uint16_t hash;
	struct fq_head dfq;

	/*
	 * Contract, as implemented below: the fragment 'm' is always
	 * consumed.  It is either freed, linked into a reassembly queue,
	 * or merged into the datagram that is returned.  The return value
	 * is the reassembled datagram when this fragment completes one,
	 * and NULL in every other case.
	 *
	 * Fragments that get displaced while ipqlock is held are parked on
	 * 'dfq' and only released after the lock has been dropped.
	 */
	MBUFQ_INIT(&dfq);	/* for deferred frees */

	/* If maxnipq or maxfragsperpacket is 0, never accept fragments. */
	if (maxnipq == 0 || maxfragsperpacket == 0) {
		ipstat.ips_fragments++;
		ipstat.ips_fragdropped++;
		m_freem(m);
		/* nipq is sampled without ipqlock here; lock only if needed */
		if (nipq > 0) {
			lck_mtx_lock(&ipqlock);
			frag_sched_timeout();	/* purge stale fragments */
			lck_mtx_unlock(&ipqlock);
		}
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = IP_VHL_HL(ip->ip_vhl) << 2;

	/* ipqlock is held from here until one of the three exits below */
	lck_mtx_lock(&ipqlock);

	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
	head = &ipq[hash];

	/*
	 * Look for queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list) {
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#if CONFIG_MACF_NET
		    mac_ipq_label_compare(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			goto found;
	}

	/* No matching queue; fp == NULL means "create one" further down */
	fp = NULL;

	/*
	 * Attempt to trim the number of allocated fragment queues if it
	 * exceeds the administrative limit.
	 */
	if ((nipq > (unsigned)maxnipq) && (maxnipq > 0)) {
		/*
		 * drop something from the tail of the current queue
		 * before proceeding further
		 */
		struct ipq *fq = TAILQ_LAST(head, ipqhead);
		if (fq == NULL) {   /* gak */
			/*
			 * This bucket is empty; evict the tail entry of the
			 * first non-empty bucket instead.
			 */
			for (i = 0; i < IPREASS_NHASH; i++) {
				struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
				if (r) {
					ipstat.ips_fragtimeout += r->ipq_nfrags;
					frag_freef(&ipq[i], r);
					break;
				}
			}
		} else {
			ipstat.ips_fragtimeout += fq->ipq_nfrags;
			frag_freef(head, fq);
		}
	}

found:
	/*
	 * Leverage partial checksum offload for IP fragments.  Narrow down
	 * the scope to cover only UDP without IP options, as that is the
	 * most common case.
	 *
	 * Perform 1's complement adjustment of octets that got included/
	 * excluded in the hardware-calculated checksum value.  Ignore cases
	 * where the value includes or excludes the IP header span, as the
	 * sum for those octets would already be 0xffff and thus no-op.
	 */
	if (ip->ip_p == IPPROTO_UDP && hlen == sizeof (struct ip) &&
	    (m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t start;

		start = m->m_pkthdr.csum_rx_start;
		csum = m->m_pkthdr.csum_rx_val;

		if (start != 0 && start != hlen) {
#if BYTE_ORDER != BIG_ENDIAN
			/*
			 * When the sum starts inside the IP header, the
			 * header fields are temporarily swapped with HTONS
			 * for the adjustment and swapped back afterwards.
			 */
			if (start < hlen) {
				HTONS(ip->ip_len);
				HTONS(ip->ip_off);
			}
#endif
			/* callee folds in sum */
			csum = m_adj_sum16(m, start, hlen, csum);
#if BYTE_ORDER != BIG_ENDIAN
			if (start < hlen) {
				NTOHS(ip->ip_off);
				NTOHS(ip->ip_len);
			}
#endif
		}
		csum_flags = m->m_pkthdr.csum_flags;
	} else {
		/* not eligible; zero flags mark "no offload info" below */
		csum = 0;
		csum_flags = 0;
	}

	/* Invalidate checksum */
	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;

	ipstat.ips_fragments++;

	/*
	 * Adjust ip_len to not reflect header,
	 * convert offset of this to bytes.
	 */
	ip->ip_len -= hlen;
	if (ip->ip_off & IP_MF) {
		/*
		 * Make sure that fragments have a data length
		 * that's a non-zero multiple of 8 bytes.
		 */
		if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
			OSAddAtomic(1, &ipstat.ips_toosmall);
			/*
			 * Reassembly queue may have been found if previous
			 * fragments were valid; given that this one is bad,
			 * we need to drop it.  Make sure to set fp to NULL
			 * if not already, since we don't want to decrement
			 * ipq_nfrags as it doesn't include this packet.
			 */
			fp = NULL;
			goto dropfrag;
		}
		/* M_FRAG on the mbuf records "more fragments follow" */
		m->m_flags |= M_FRAG;
	} else {
		/* Clear the flag in case packet comes from loopback */
		m->m_flags &= ~M_FRAG;
	}
	ip->ip_off <<= 3;

	/*
	 * Remember where the IP header is; GETIP() below reads it back
	 * through pkt_hdr once m_data has been advanced past the header.
	 */
	m->m_pkthdr.pkt_hdr = ip;

	/* Previous ip_reass() started here. */
	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		fp = ipq_alloc(M_DONTWAIT);
		if (fp == NULL)
			goto dropfrag;
#if CONFIG_MACF_NET
		if (mac_ipq_label_init(fp, M_NOWAIT) != 0) {
			ipq_free(fp);
			/* keep dropfrag from touching the freed entry */
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_label_associate(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		nipq++;
		fp->ipq_nfrags = 1;
		fp->ipq_ttl = IPFRAGTTL;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
		m->m_nextpkt = NULL;
		/*
		 * If the first fragment has valid checksum offload
		 * info, the rest of fragments are eligible as well.
		 */
		if (csum_flags != 0) {
			fp->ipq_csum = csum;
			fp->ipq_csum_flags = csum_flags;
		}
#if IPDIVERT
		/*
		 * Transfer firewall instructions to the fragment structure.
		 * Only trust info in the fragment at offset 0.
		 */
		if (ip->ip_off == 0) {
#ifdef IPDIVERT_44
			fp->ipq_div_info = *divinfo;
#else
			fp->ipq_divert = *divinfo;
#endif
			fp->ipq_div_cookie = *divcookie;
		}
		*divinfo = 0;
		*divcookie = 0;
#endif /* IPDIVERT */
		/* the fragment now belongs to the queue */
		m = NULL;	/* nothing to return */
		goto done;
	} else {
		/*
		 * Counted up front; the dropfrag exit undoes this if the
		 * fragment ends up being rejected.
		 */
		fp->ipq_nfrags++;
#if CONFIG_MACF_NET
		mac_ipq_label_update(m, fp);
#endif
	}

/* IP header of a queued fragment, as saved in pkt_hdr above */
#define	GETIP(m)	((struct ip *)((m)->m_pkthdr.pkt_hdr))

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 * On exit, p is the predecessor (or NULL) and q the successor
	 * (or NULL) of the insertion point.
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (GETIP(q)->ip_off > ip->ip_off)
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick new segment in the proper place.
	 *
	 * If some of the data is dropped from the preceding
	 * segment, then its checksum is invalidated.
	 */
	if (p) {
		/* i = number of leading bytes of ours that p already has */
		i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
		if (i > 0) {
			if (i >= ip->ip_len)
				goto dropfrag;
			m_adj(m, i);
			fp->ipq_csum_flags = 0;
			ip->ip_off += i;
			ip->ip_len -= i;
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		/* no predecessor: this fragment becomes the list head */
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
	    q = nq) {
		/* i = number of leading bytes of q that we cover */
		i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
		if (i < GETIP(q)->ip_len) {
			/* partial overlap: trim the front of q and stop */
			GETIP(q)->ip_len -= i;
			GETIP(q)->ip_off += i;
			m_adj(q, i);
			fp->ipq_csum_flags = 0;
			break;
		}
		/* q is fully covered: unlink it and keep scanning */
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		ipstat.ips_fragdropped++;
		fp->ipq_nfrags--;
		/* defer freeing until after lock is dropped */
		MBUFQ_ENQUEUE(&dfq, q);
	}

	/*
	 * If this fragment contains similar checksum offload info
	 * as that of the existing ones, accumulate checksum.  Otherwise,
	 * invalidate checksum offload info for the entire datagram.
	 */
	if (csum_flags != 0 && csum_flags == fp->ipq_csum_flags)
		fp->ipq_csum += csum;
	else if (fp->ipq_csum_flags != 0)
		fp->ipq_csum_flags = 0;

#if IPDIVERT
	/*
	 * Transfer firewall instructions to the fragment structure.
	 * Only trust info in the fragment at offset 0.
	 */
	if (ip->ip_off == 0) {
#ifdef IPDIVERT_44
		fp->ipq_div_info = *divinfo;
#else
		fp->ipq_divert = *divinfo;
#endif
		fp->ipq_div_cookie = *divcookie;
	}
	*divinfo = 0;
	*divcookie = 0;
#endif /* IPDIVERT */

	/*
	 * Check for complete reassembly and perform frag per packet
	 * limiting.
	 *
	 * Frag limiting is performed here so that the nth frag has
	 * a chance to complete the packet before we drop the packet.
	 * As a result, n+1 frags are actually allowed per packet, but
	 * only n will ever be stored. (n = maxfragsperpacket.)
	 *
	 */
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (GETIP(q)->ip_off != next) {
			/* hole in the data: the datagram is not complete */
			if (fp->ipq_nfrags > maxfragsperpacket) {
				ipstat.ips_fragdropped += fp->ipq_nfrags;
				frag_freef(head, fp);
			}
			m = NULL;	/* nothing to return */
			goto done;
		}
		next += GETIP(q)->ip_len;
	}
	/*
	 * Make sure the last packet didn't have the IP_MF flag.
	 * p is the last fragment on the list here; the list holds at
	 * least the fragment that was just inserted.
	 */
	if (p->m_flags & M_FRAG) {
		if (fp->ipq_nfrags > maxfragsperpacket) {
			ipstat.ips_fragdropped += fp->ipq_nfrags;
			frag_freef(head, fp);
		}
		m = NULL;	/* nothing to return */
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
		ipstat.ips_toolong++;
		ipstat.ips_fragdropped += fp->ipq_nfrags;
		frag_freef(head, fp);
		m = NULL;	/* nothing to return */
		goto done;
	}

	/*
	 * Concatenate fragments.
	 * The first fragment becomes the returned chain; every following
	 * fragment is unlinked from the m_nextpkt list and appended.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m_cat(m, q);
	}

	/*
	 * Store partial hardware checksum info from the fragment queue;
	 * the receive start offset is set to 20 bytes (see code at the
	 * top of this routine.)
	 */
	if (fp->ipq_csum_flags != 0) {
		csum = fp->ipq_csum;

		ADDCARRY(csum);

		m->m_pkthdr.csum_rx_val = csum;
		m->m_pkthdr.csum_rx_start = sizeof (struct ip);
		m->m_pkthdr.csum_flags = fp->ipq_csum_flags;
	} else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
	    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
		/* loopback checksums are always OK */
		m->m_pkthdr.csum_data = 0xffff;
		/*
		 * NOTE(review): the mask on the next line is overwritten
		 * by the plain assignment that follows it.
		 */
		m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
		m->m_pkthdr.csum_flags =
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID;
	}

#if IPDIVERT
	/*
	 * Extract firewall instructions from the fragment structure.
	 */
#ifdef IPDIVERT_44
	*divinfo = fp->ipq_div_info;
#else
	*divinfo = fp->ipq_divert;
#endif
	*divcookie = fp->ipq_div_cookie;
#endif /* IPDIVERT */

#if CONFIG_MACF_NET
	mac_mbuf_label_associate_ipq(fp, m);
	mac_ipq_label_destroy(fp);
#endif
	/*
	 * Create header for new ip packet by modifying header of first
	 * packet; dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = (IP_VHL_HL(ip->ip_vhl) << 2) + next;
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;

	/* detach the chain first so frag_freef() does not free it */
	fp->ipq_frags = NULL;	/* return to caller as 'm' */
	frag_freef(head, fp);
	fp = NULL;

	/* undo the m_data/m_len adjustment made when 'm' was queued */
	m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
	m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR)	/* XXX this should be done elsewhere */
		m_fixhdr(m);
	ipstat.ips_reassembled++;

	/* arm the purge timer if not already and if there's work to do */
	frag_sched_timeout();
	lck_mtx_unlock(&ipqlock);
	/* perform deferred free (if needed) now that lock is dropped */
	if (!MBUFQ_EMPTY(&dfq))
		MBUFQ_DRAIN(&dfq);
	VERIFY(MBUFQ_EMPTY(&dfq));
	return (m);

done:
	/*
	 * Exit for "fragment accepted, datagram not (yet) returned":
	 * the fragment is either on a queue or was freed with its queue.
	 */
	VERIFY(m == NULL);
	/* arm the purge timer if not already and if there's work to do */
	frag_sched_timeout();
	lck_mtx_unlock(&ipqlock);
	/* perform deferred free (if needed) */
	if (!MBUFQ_EMPTY(&dfq))
		MBUFQ_DRAIN(&dfq);
	VERIFY(MBUFQ_EMPTY(&dfq));
	return (NULL);

dropfrag:
	/*
	 * Exit for "fragment rejected": 'm' is still owned by this
	 * routine and is freed here, after the lock is released.
	 */
#if IPDIVERT
	*divinfo = 0;
	*divcookie = 0;
#endif /* IPDIVERT */
	ipstat.ips_fragdropped++;
	/* roll back the ipq_nfrags++ done when the queue was found */
	if (fp != NULL)
		fp->ipq_nfrags--;
	/* arm the purge timer if not already and if there's work to do */
	frag_sched_timeout();
	lck_mtx_unlock(&ipqlock);
	m_freem(m);
	/* perform deferred free (if needed) */
	if (!MBUFQ_EMPTY(&dfq))
		MBUFQ_DRAIN(&dfq);
	VERIFY(MBUFQ_EMPTY(&dfq));
	return (NULL);
#undef GETIP
}

/*
 * Free a fragment reassembly header and all
 * associated datagrams.
1929 */ 1930static void 1931frag_freef(struct ipqhead *fhp, struct ipq *fp) 1932{ 1933 lck_mtx_assert(&ipqlock, LCK_MTX_ASSERT_OWNED); 1934 1935 fp->ipq_nfrags = 0; 1936 if (fp->ipq_frags != NULL) { 1937 m_freem_list(fp->ipq_frags); 1938 fp->ipq_frags = NULL; 1939 } 1940 TAILQ_REMOVE(fhp, fp, ipq_list); 1941 nipq--; 1942 ipq_free(fp); 1943} 1944 1945/* 1946 * IP reassembly timer processing 1947 */ 1948static void 1949frag_timeout(void *arg) 1950{ 1951#pragma unused(arg) 1952 struct ipq *fp; 1953 int i; 1954 1955 /* 1956 * Update coarse-grained networking timestamp (in sec.); the idea 1957 * is to piggy-back on the timeout callout to update the counter 1958 * returnable via net_uptime(). 1959 */ 1960 net_update_uptime(); 1961 1962 lck_mtx_lock(&ipqlock); 1963 for (i = 0; i < IPREASS_NHASH; i++) { 1964 for (fp = TAILQ_FIRST(&ipq[i]); fp; ) { 1965 struct ipq *fpp; 1966 1967 fpp = fp; 1968 fp = TAILQ_NEXT(fp, ipq_list); 1969 if (--fpp->ipq_ttl == 0) { 1970 ipstat.ips_fragtimeout += fpp->ipq_nfrags; 1971 frag_freef(&ipq[i], fpp); 1972 } 1973 } 1974 } 1975 /* 1976 * If we are over the maximum number of fragments 1977 * (due to the limit being lowered), drain off 1978 * enough to get down to the new limit. 1979 */ 1980 if (maxnipq >= 0 && nipq > (unsigned)maxnipq) { 1981 for (i = 0; i < IPREASS_NHASH; i++) { 1982 while (nipq > (unsigned)maxnipq && 1983 !TAILQ_EMPTY(&ipq[i])) { 1984 ipstat.ips_fragdropped += 1985 TAILQ_FIRST(&ipq[i])->ipq_nfrags; 1986 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); 1987 } 1988 } 1989 } 1990 /* re-arm the purge timer if there's work to do */ 1991 frag_timeout_run = 0; 1992 frag_sched_timeout(); 1993 lck_mtx_unlock(&ipqlock); 1994} 1995 1996static void 1997frag_sched_timeout(void) 1998{ 1999 lck_mtx_assert(&ipqlock, LCK_MTX_ASSERT_OWNED); 2000 2001 if (!frag_timeout_run && nipq > 0) { 2002 frag_timeout_run = 1; 2003 timeout(frag_timeout, NULL, hz); 2004 } 2005} 2006 2007/* 2008 * Drain off all datagram fragments. 
2009 */ 2010static void 2011frag_drain(void) 2012{ 2013 int i; 2014 2015 lck_mtx_lock(&ipqlock); 2016 for (i = 0; i < IPREASS_NHASH; i++) { 2017 while (!TAILQ_EMPTY(&ipq[i])) { 2018 ipstat.ips_fragdropped += 2019 TAILQ_FIRST(&ipq[i])->ipq_nfrags; 2020 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); 2021 } 2022 } 2023 lck_mtx_unlock(&ipqlock); 2024} 2025 2026static struct ipq * 2027ipq_alloc(int how) 2028{ 2029 struct mbuf *t; 2030 struct ipq *fp; 2031 2032 /* 2033 * See comments in ipq_updateparams(). Keep the count separate 2034 * from nipq since the latter represents the elements already 2035 * in the reassembly queues. 2036 */ 2037 if (ipq_limit > 0 && ipq_count > ipq_limit) 2038 return (NULL); 2039 2040 t = m_get(how, MT_FTABLE); 2041 if (t != NULL) { 2042 atomic_add_32(&ipq_count, 1); 2043 fp = mtod(t, struct ipq *); 2044 bzero(fp, sizeof (*fp)); 2045 } else { 2046 fp = NULL; 2047 } 2048 return (fp); 2049} 2050 2051static void 2052ipq_free(struct ipq *fp) 2053{ 2054 (void) m_free(dtom(fp)); 2055 atomic_add_32(&ipq_count, -1); 2056} 2057 2058/* 2059 * Drain callback 2060 */ 2061void 2062ip_drain(void) 2063{ 2064 frag_drain(); /* fragments */ 2065 in_rtqdrain(); /* protocol cloned routes */ 2066 in_arpdrain(NULL); /* cloned routes: ARP */ 2067} 2068 2069/* 2070 * Do option processing on a datagram, 2071 * possibly discarding it if bad options are encountered, 2072 * or forwarding it if source-routed. 2073 * The pass argument is used when operating in the IPSTEALTH 2074 * mode to tell what options to process: 2075 * [LS]SRR (pass 0) or the others (pass 1). 2076 * The reason for as many as two passes is that when doing IPSTEALTH, 2077 * non-routing options should be processed only if the packet is for us. 2078 * Returns 1 if packet has been forwarded/freed, 2079 * 0 if the packet should be processed further. 
*/
static int
ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
{
#pragma unused(pass)
	struct ip *ip = mtod(m, struct ip *);
	u_char *cp;
	struct ip_timestamp *ipt;
	struct in_ifaddr *ia;
	/*
	 * type/code select the ICMP error generated at the "bad" label;
	 * for ICMP_PARAMPROB, code is the byte offset of the offending
	 * octet from the start of the IP header.  forward is set once a
	 * source-route option has rewritten ip_dst.
	 */
	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
	struct in_addr *sin, dst;
	n_time ntime;
	struct sockaddr_in ipaddr = {
	    sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } };

	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/* Snapshot the destination; ip_dst may be rewritten by LSRR/SSRR */
	dst = ip->ip_dst;
	/* Options follow the fixed header; cnt is the option bytes left */
	cp = (u_char *)(ip + 1);
	cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL)
			break;
		if (opt == IPOPT_NOP)
			optlen = 1;
		else {
			/* There must be room for the length octet itself */
			if (cnt < IPOPT_OLEN + sizeof (*cp)) {
				code = &cp[IPOPT_OLEN] - (u_char *)ip;
				goto bad;
			}
			optlen = cp[IPOPT_OLEN];
			/*
			 * The length must cover at least type+length and
			 * must not run past the remaining option space.
			 */
			if (optlen < IPOPT_OLEN + sizeof (*cp) ||
			    optlen > cnt) {
				code = &cp[IPOPT_OLEN] - (u_char *)ip;
				goto bad;
			}
		}
		switch (opt) {

		default:
			break;

		/*
		 * Source routing with record.
		 * Find interface with current destination address.
		 * If none on this machine then drop if strictly routed,
		 * or do nothing if loosely routed.
		 * Record interface address and bring up next address
		 * component.  If strictly routed make sure next
		 * address is on directly accessible net.
		 */
		case IPOPT_LSRR:
		case IPOPT_SSRR:
			if (optlen < IPOPT_OFFSET + sizeof (*cp)) {
				code = &cp[IPOPT_OLEN] - (u_char *)ip;
				goto bad;
			}
			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
				goto bad;
			}
			/* Is the current destination one of our addresses? */
			ipaddr.sin_addr = ip->ip_dst;
			ia = (struct in_ifaddr *)ifa_ifwithaddr(SA(&ipaddr));
			if (ia == NULL) {
				if (opt == IPOPT_SSRR) {
					type = ICMP_UNREACH;
					code = ICMP_UNREACH_SRCFAIL;
					goto bad;
				}
				if (!ip_dosourceroute)
					goto nosourcerouting;
				/*
				 * Loose routing, and not at next destination
				 * yet; nothing to do except forward.
				 */
				break;
			} else {
				/* Only existence mattered; drop the reference */
				IFA_REMREF(&ia->ia_ifa);
				ia = NULL;
			}
			off--;			/* 0 origin */
			if (off > optlen - (int)sizeof (struct in_addr)) {
				/*
				 * End of source route.  Should be for us.
				 */
				if (!ip_acceptsourceroute)
					goto nosourcerouting;
				save_rte(cp, ip->ip_src);
				break;
			}

			if (!ip_dosourceroute) {
				if (ipforwarding) {
					char buf[MAX_IPv4_STR_LEN];
					char buf2[MAX_IPv4_STR_LEN];
					/*
					 * Acting as a router, so generate ICMP
					 */
					/*
					 * The label below is also the target
					 * of the gotos above, which therefore
					 * log and reject regardless of the
					 * ipforwarding setting.
					 */
nosourcerouting:
					log(LOG_WARNING,
					    "attempted source route from %s "
					    "to %s\n",
					    inet_ntop(AF_INET, &ip->ip_src,
					    buf, sizeof (buf)),
					    inet_ntop(AF_INET, &ip->ip_dst,
					    buf2, sizeof (buf2)));
					type = ICMP_UNREACH;
					code = ICMP_UNREACH_SRCFAIL;
					goto bad;
				} else {
					/*
					 * Not acting as a router,
					 * so silently drop.
					 */
					OSAddAtomic(1, &ipstat.ips_cantforward);
					m_freem(m);
					return (1);
				}
			}

			/*
			 * locate outgoing interface
			 */
			(void) memcpy(&ipaddr.sin_addr, cp + off,
			    sizeof (ipaddr.sin_addr));

			if (opt == IPOPT_SSRR) {
#define	INA	struct in_ifaddr *
				if ((ia = (INA)ifa_ifwithdstaddr(
				    SA(&ipaddr))) == NULL) {
					ia = (INA)ifa_ifwithnet(SA(&ipaddr));
				}
			} else {
				ia = ip_rtaddr(ipaddr.sin_addr);
			}
			if (ia == NULL) {
				type = ICMP_UNREACH;
				code = ICMP_UNREACH_SRCFAIL;
				goto bad;
			}
			/*
			 * Make the next hop the new destination and record
			 * the chosen interface's address in the slot it
			 * occupied, then advance the option pointer.
			 */
			ip->ip_dst = ipaddr.sin_addr;
			IFA_LOCK(&ia->ia_ifa);
			(void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
			    sizeof (struct in_addr));
			IFA_UNLOCK(&ia->ia_ifa);
			IFA_REMREF(&ia->ia_ifa);
			ia = NULL;
			cp[IPOPT_OFFSET] += sizeof (struct in_addr);
			/*
			 * Let ip_intr's mcast routing check handle mcast pkts
			 */
			forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
			break;

		case IPOPT_RR:
			if (optlen < IPOPT_OFFSET + sizeof (*cp)) {
				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
				goto bad;
			}
			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
				goto bad;
			}
			/*
			 * If no space remains, ignore.
			 */
			off--;			/* 0 origin */
			if (off > optlen - (int)sizeof (struct in_addr))
				break;
			(void) memcpy(&ipaddr.sin_addr, &ip->ip_dst,
			    sizeof (ipaddr.sin_addr));
			/*
			 * locate outgoing interface; if we're the destination,
			 * use the incoming interface (should be same).
			 */
			if ((ia = (INA)ifa_ifwithaddr(SA(&ipaddr))) == NULL) {
				if ((ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
					type = ICMP_UNREACH;
					code = ICMP_UNREACH_HOST;
					goto bad;
				}
			}
			/* Record our address in the next free slot */
			IFA_LOCK(&ia->ia_ifa);
			(void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
			    sizeof (struct in_addr));
			IFA_UNLOCK(&ia->ia_ifa);
			IFA_REMREF(&ia->ia_ifa);
			ia = NULL;
			cp[IPOPT_OFFSET] += sizeof (struct in_addr);
			break;

		case IPOPT_TS:
			code = cp - (u_char *)ip;
			ipt = (struct ip_timestamp *)(void *)cp;
			if (ipt->ipt_len < 4 || ipt->ipt_len > 40) {
				code = (u_char *)&ipt->ipt_len - (u_char *)ip;
				goto bad;
			}
			if (ipt->ipt_ptr < 5) {
				code = (u_char *)&ipt->ipt_ptr - (u_char *)ip;
				goto bad;
			}
			/*
			 * No room left for another timestamp: bump the
			 * overflow count, and reject the packet only when
			 * that count itself wraps back to zero.
			 */
			if (ipt->ipt_ptr >
			    ipt->ipt_len - (int)sizeof (int32_t)) {
				if (++ipt->ipt_oflw == 0) {
					code = (u_char *)&ipt->ipt_ptr -
					    (u_char *)ip;
					goto bad;
				}
				break;
			}
			/* sin addresses the next free slot (ipt_ptr is 1-based) */
			sin = (struct in_addr *)(void *)(cp + ipt->ipt_ptr - 1);
			switch (ipt->ipt_flg) {

			case IPOPT_TS_TSONLY:
				break;

			case IPOPT_TS_TSANDADDR:
				if (ipt->ipt_ptr - 1 + sizeof (n_time) +
				    sizeof (struct in_addr) > ipt->ipt_len) {
					code = (u_char *)&ipt->ipt_ptr -
					    (u_char *)ip;
					goto bad;
				}
				ipaddr.sin_addr = dst;
				ia = (INA)ifaof_ifpforaddr(SA(&ipaddr),
				    m->m_pkthdr.rcvif);
				/*
				 * "continue" resumes the enclosing for loop,
				 * i.e. moves on to the next option without
				 * recording anything for this one.
				 */
				if (ia == NULL)
					continue;
				IFA_LOCK(&ia->ia_ifa);
				(void) memcpy(sin, &IA_SIN(ia)->sin_addr,
				    sizeof (struct in_addr));
				IFA_UNLOCK(&ia->ia_ifa);
				ipt->ipt_ptr += sizeof (struct in_addr);
				IFA_REMREF(&ia->ia_ifa);
				ia = NULL;
				break;

			case IPOPT_TS_PRESPEC:
				if (ipt->ipt_ptr - 1 + sizeof (n_time) +
				    sizeof (struct in_addr) > ipt->ipt_len) {
					code = (u_char *)&ipt->ipt_ptr -
					    (u_char *)ip;
					goto bad;
				}
				/*
				 * Only stamp the entry if the address already
				 * present in the slot is one of ours.
				 */
				(void) memcpy(&ipaddr.sin_addr, sin,
				    sizeof (struct in_addr));
				if ((ia = (struct in_ifaddr *)ifa_ifwithaddr(
				    SA(&ipaddr))) == NULL)
					continue;
				IFA_REMREF(&ia->ia_ifa);
				ia = NULL;
				ipt->ipt_ptr += sizeof (struct in_addr);
				break;

			default:
				/* XXX can't take &ipt->ipt_flg */
				code = (u_char *)&ipt->ipt_ptr -
				    (u_char *)ip + 1;
				goto bad;
			}
			/* Append the timestamp and advance past it */
			ntime = iptime();
			(void) memcpy(cp + ipt->ipt_ptr - 1, &ntime,
			    sizeof (n_time));
			ipt->ipt_ptr += sizeof (n_time);
		}
	}
	/* A source-route option redirected the packet: hand it off */
	if (forward && ipforwarding) {
		ip_forward(m, 1, next_hop);
		return (1);
	}
	return (0);
bad:
	/* XXX icmp_error adds in hdr length */
	ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2;
	icmp_error(m, type, code, 0, 0);
	OSAddAtomic(1, &ipstat.ips_badoptions);
	return (1);
}

/*
 * Check for the presence of the IP Router Alert option [RFC2113]
 * in the header of an IPv4 datagram.
 *
 * This call is not intended for use from the forwarding path; it is here
 * so that protocol domains may check for the presence of the option.
 * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
 * option does not have much relevance to the implementation, though this
 * may change in future.
 * Router alert options SHOULD be passed if running in IPSTEALTH mode and
 * we are not the endpoint.
 * Length checks on individual options should already have been performed
 * by ip_dooptions() therefore they are folded under DIAGNOSTIC here.
 *
 * Return zero if not present or options are invalid, non-zero if present.
2377 */ 2378int 2379ip_checkrouteralert(struct mbuf *m) 2380{ 2381 struct ip *ip = mtod(m, struct ip *); 2382 u_char *cp; 2383 int opt, optlen, cnt, found_ra; 2384 2385 found_ra = 0; 2386 cp = (u_char *)(ip + 1); 2387 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); 2388 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2389 opt = cp[IPOPT_OPTVAL]; 2390 if (opt == IPOPT_EOL) 2391 break; 2392 if (opt == IPOPT_NOP) 2393 optlen = 1; 2394 else { 2395#ifdef DIAGNOSTIC 2396 if (cnt < IPOPT_OLEN + sizeof (*cp)) 2397 break; 2398#endif 2399 optlen = cp[IPOPT_OLEN]; 2400#ifdef DIAGNOSTIC 2401 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) 2402 break; 2403#endif 2404 } 2405 switch (opt) { 2406 case IPOPT_RA: 2407#ifdef DIAGNOSTIC 2408 if (optlen != IPOPT_OFFSET + sizeof (uint16_t) || 2409 (*((uint16_t *)(void *)&cp[IPOPT_OFFSET]) != 0)) 2410 break; 2411 else 2412#endif 2413 found_ra = 1; 2414 break; 2415 default: 2416 break; 2417 } 2418 } 2419 2420 return (found_ra); 2421} 2422 2423/* 2424 * Given address of next destination (final or next hop), 2425 * return internet address info of interface to be used to get there. 2426 */ 2427struct in_ifaddr * 2428ip_rtaddr(struct in_addr dst) 2429{ 2430 struct sockaddr_in *sin; 2431 struct ifaddr *rt_ifa; 2432 struct route ro; 2433 2434 bzero(&ro, sizeof (ro)); 2435 sin = SIN(&ro.ro_dst); 2436 sin->sin_family = AF_INET; 2437 sin->sin_len = sizeof (*sin); 2438 sin->sin_addr = dst; 2439 2440 rtalloc_ign(&ro, RTF_PRCLONING); 2441 if (ro.ro_rt == NULL) { 2442 ROUTE_RELEASE(&ro); 2443 return (NULL); 2444 } 2445 2446 RT_LOCK(ro.ro_rt); 2447 if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) 2448 IFA_ADDREF(rt_ifa); 2449 RT_UNLOCK(ro.ro_rt); 2450 ROUTE_RELEASE(&ro); 2451 2452 return ((struct in_ifaddr *)rt_ifa); 2453} 2454 2455/* 2456 * Save incoming source route for use in replies, 2457 * to be picked up later by ip_srcroute if the receiver is interested. 
2458 */ 2459void 2460save_rte(u_char *option, struct in_addr dst) 2461{ 2462 unsigned olen; 2463 2464 olen = option[IPOPT_OLEN]; 2465#if DIAGNOSTIC 2466 if (ipprintfs) 2467 printf("save_rte: olen %d\n", olen); 2468#endif 2469 if (olen > sizeof (ip_srcrt) - (1 + sizeof (dst))) 2470 return; 2471 bcopy(option, ip_srcrt.srcopt, olen); 2472 ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof (struct in_addr); 2473 ip_srcrt.dst = dst; 2474} 2475 2476/* 2477 * Retrieve incoming source route for use in replies, 2478 * in the same form used by setsockopt. 2479 * The first hop is placed before the options, will be removed later. 2480 */ 2481struct mbuf * 2482ip_srcroute(void) 2483{ 2484 struct in_addr *p, *q; 2485 struct mbuf *m; 2486 2487 if (ip_nhops == 0) 2488 return (NULL); 2489 2490 m = m_get(M_DONTWAIT, MT_HEADER); 2491 if (m == NULL) 2492 return (NULL); 2493 2494#define OPTSIZ (sizeof (ip_srcrt.nop) + sizeof (ip_srcrt.srcopt)) 2495 2496 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ 2497 m->m_len = ip_nhops * sizeof (struct in_addr) + 2498 sizeof (struct in_addr) + OPTSIZ; 2499#if DIAGNOSTIC 2500 if (ipprintfs) 2501 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); 2502#endif 2503 2504 /* 2505 * First save first hop for return route 2506 */ 2507 p = &ip_srcrt.route[ip_nhops - 1]; 2508 *(mtod(m, struct in_addr *)) = *p--; 2509#if DIAGNOSTIC 2510 if (ipprintfs) 2511 printf(" hops %lx", 2512 (u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr)); 2513#endif 2514 2515 /* 2516 * Copy option fields and padding (nop) to mbuf. 2517 */ 2518 ip_srcrt.nop = IPOPT_NOP; 2519 ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; 2520 (void) memcpy(mtod(m, caddr_t) + sizeof (struct in_addr), 2521 &ip_srcrt.nop, OPTSIZ); 2522 q = (struct in_addr *)(void *)(mtod(m, caddr_t) + 2523 sizeof (struct in_addr) + OPTSIZ); 2524#undef OPTSIZ 2525 /* 2526 * Record return path as an IP source route, 2527 * reversing the path (pointers are now aligned). 
2528 */ 2529 while (p >= ip_srcrt.route) { 2530#if DIAGNOSTIC 2531 if (ipprintfs) 2532 printf(" %lx", (u_int32_t)ntohl(q->s_addr)); 2533#endif 2534 *q++ = *p--; 2535 } 2536 /* 2537 * Last hop goes to final destination. 2538 */ 2539 *q = ip_srcrt.dst; 2540#if DIAGNOSTIC 2541 if (ipprintfs) 2542 printf(" %lx\n", (u_int32_t)ntohl(q->s_addr)); 2543#endif 2544 return (m); 2545} 2546 2547/* 2548 * Strip out IP options, at higher 2549 * level protocol in the kernel. 2550 * Second argument is buffer to which options 2551 * will be moved, and return value is their length. 2552 * XXX should be deleted; last arg currently ignored. 2553 */ 2554void 2555ip_stripoptions(struct mbuf *m, struct mbuf *mopt) 2556{ 2557#pragma unused(mopt) 2558 int i; 2559 struct ip *ip = mtod(m, struct ip *); 2560 caddr_t opts; 2561 int olen; 2562 2563 /* Expect 32-bit aligned data pointer on strict-align platforms */ 2564 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); 2565 2566 olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); 2567 opts = (caddr_t)(ip + 1); 2568 i = m->m_len - (sizeof (struct ip) + olen); 2569 bcopy(opts + olen, opts, (unsigned)i); 2570 m->m_len -= olen; 2571 if (m->m_flags & M_PKTHDR) 2572 m->m_pkthdr.len -= olen; 2573 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof (struct ip) >> 2); 2574} 2575 2576u_char inetctlerrmap[PRC_NCMDS] = { 2577 0, 0, 0, 0, 2578 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 2579 ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 2580 EMSGSIZE, EHOSTUNREACH, 0, 0, 2581 0, 0, 0, 0, 2582 ENOPROTOOPT, ECONNREFUSED 2583}; 2584 2585static int 2586sysctl_ipforwarding SYSCTL_HANDLER_ARGS 2587{ 2588#pragma unused(arg1, arg2) 2589 int i, was_ipforwarding = ipforwarding; 2590 2591 i = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 2592 if (i != 0 || req->newptr == USER_ADDR_NULL) 2593 return (i); 2594 2595 if (was_ipforwarding && !ipforwarding) { 2596 /* clean up IPv4 forwarding cached routes */ 2597 ifnet_head_lock_shared(); 2598 for (i = 0; i <= if_index; 
i++) { 2599 struct ifnet *ifp = ifindex2ifnet[i]; 2600 if (ifp != NULL) { 2601 lck_mtx_lock(&ifp->if_cached_route_lock); 2602 ROUTE_RELEASE(&ifp->if_fwd_route); 2603 bzero(&ifp->if_fwd_route, 2604 sizeof (ifp->if_fwd_route)); 2605 lck_mtx_unlock(&ifp->if_cached_route_lock); 2606 } 2607 } 2608 ifnet_head_done(); 2609 } 2610 2611 return (0); 2612} 2613 2614/* 2615 * Similar to inp_route_{copyout,copyin} routines except that these copy 2616 * out the cached IPv4 forwarding route from struct ifnet instead of the 2617 * inpcb. See comments for those routines for explanations. 2618 */ 2619static void 2620ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst) 2621{ 2622 struct route *src = &ifp->if_fwd_route; 2623 2624 lck_mtx_lock_spin(&ifp->if_cached_route_lock); 2625 lck_mtx_convert_spin(&ifp->if_cached_route_lock); 2626 2627 /* Minor sanity check */ 2628 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) 2629 panic("%s: wrong or corrupted route: %p", __func__, src); 2630 2631 route_copyout(dst, src, sizeof (*dst)); 2632 2633 lck_mtx_unlock(&ifp->if_cached_route_lock); 2634} 2635 2636static void 2637ip_fwd_route_copyin(struct ifnet *ifp, struct route *src) 2638{ 2639 struct route *dst = &ifp->if_fwd_route; 2640 2641 lck_mtx_lock_spin(&ifp->if_cached_route_lock); 2642 lck_mtx_convert_spin(&ifp->if_cached_route_lock); 2643 2644 /* Minor sanity check */ 2645 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) 2646 panic("%s: wrong or corrupted route: %p", __func__, src); 2647 2648 if (ifp->if_fwd_cacheok) 2649 route_copyin(src, dst, sizeof (*src)); 2650 2651 lck_mtx_unlock(&ifp->if_cached_route_lock); 2652} 2653 2654/* 2655 * Forward a packet. If some error occurs return the sender 2656 * an icmp packet. Note we can't always generate a meaningful 2657 * icmp message because icmp doesn't have a large enough repertoire 2658 * of codes and types. 2659 * 2660 * If not forwarding, just drop the packet. 
This could be confusing 2661 * if ipforwarding was zero but some routing protocol was advancing 2662 * us as a gateway to somewhere. However, we must let the routing 2663 * protocol deal with that. 2664 * 2665 * The srcrt parameter indicates whether the packet is being forwarded 2666 * via a source route. 2667 */ 2668static void 2669ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) 2670{ 2671#if !IPFIREWALL 2672#pragma unused(next_hop) 2673#endif 2674 struct ip *ip = mtod(m, struct ip *); 2675 struct sockaddr_in *sin; 2676 struct rtentry *rt; 2677 struct route fwd_rt; 2678 int error, type = 0, code = 0; 2679 struct mbuf *mcopy; 2680 n_long dest; 2681 struct in_addr pkt_dst; 2682 u_int32_t nextmtu = 0, len; 2683 struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0 }; 2684 struct ifnet *rcvifp = m->m_pkthdr.rcvif; 2685#if IPSEC 2686 struct secpolicy *sp = NULL; 2687 int ipsecerror; 2688#endif /* IPSEC */ 2689#if PF 2690 struct pf_mtag *pf_mtag; 2691#endif /* PF */ 2692 2693 dest = 0; 2694#if IPFIREWALL 2695 /* 2696 * Cache the destination address of the packet; this may be 2697 * changed by use of 'ipfw fwd'. 2698 */ 2699 pkt_dst = ((next_hop != NULL) ? 
next_hop->sin_addr : ip->ip_dst); 2700#else /* !IPFIREWALL */ 2701 pkt_dst = ip->ip_dst; 2702#endif /* !IPFIREWALL */ 2703 2704#if DIAGNOSTIC 2705 if (ipprintfs) 2706 printf("forward: src %lx dst %lx ttl %x\n", 2707 (u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr, 2708 ip->ip_ttl); 2709#endif 2710 2711 if (m->m_flags & (M_BCAST|M_MCAST) || !in_canforward(pkt_dst)) { 2712 OSAddAtomic(1, &ipstat.ips_cantforward); 2713 m_freem(m); 2714 return; 2715 } 2716#if IPSTEALTH 2717 if (!ipstealth) { 2718#endif /* IPSTEALTH */ 2719 if (ip->ip_ttl <= IPTTLDEC) { 2720 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 2721 dest, 0); 2722 return; 2723 } 2724#if IPSTEALTH 2725 } 2726#endif /* IPSTEALTH */ 2727 2728#if PF 2729 pf_mtag = pf_find_mtag(m); 2730 if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) { 2731 ipoa.ipoa_boundif = pf_mtag->pftag_rtableid; 2732 ipoa.ipoa_flags |= IPOAF_BOUND_IF; 2733 } 2734#endif /* PF */ 2735 2736 ip_fwd_route_copyout(rcvifp, &fwd_rt); 2737 2738 sin = SIN(&fwd_rt.ro_dst); 2739 if (ROUTE_UNUSABLE(&fwd_rt) || pkt_dst.s_addr != sin->sin_addr.s_addr) { 2740 ROUTE_RELEASE(&fwd_rt); 2741 2742 sin->sin_family = AF_INET; 2743 sin->sin_len = sizeof (*sin); 2744 sin->sin_addr = pkt_dst; 2745 2746 rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif); 2747 if (fwd_rt.ro_rt == NULL) { 2748 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); 2749 goto done; 2750 } 2751 } 2752 rt = fwd_rt.ro_rt; 2753 2754 /* 2755 * Save the IP header and at most 8 bytes of the payload, 2756 * in case we need to generate an ICMP message to the src. 2757 * 2758 * We don't use m_copy() because it might return a reference 2759 * to a shared cluster. Both this function and ip_output() 2760 * assume exclusive access to the IP header in `m', so any 2761 * data in a cluster may change before we reach icmp_error(). 
2762 */ 2763 MGET(mcopy, M_DONTWAIT, m->m_type); 2764 if (mcopy != NULL) { 2765 M_COPY_PKTHDR(mcopy, m); 2766 mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8, 2767 (int)ip->ip_len); 2768 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); 2769 } 2770 2771#if IPSTEALTH 2772 if (!ipstealth) { 2773#endif /* IPSTEALTH */ 2774 ip->ip_ttl -= IPTTLDEC; 2775#if IPSTEALTH 2776 } 2777#endif /* IPSTEALTH */ 2778 2779 /* 2780 * If forwarding packet using same interface that it came in on, 2781 * perhaps should send a redirect to sender to shortcut a hop. 2782 * Only send redirect if source is sending directly to us, 2783 * and if packet was not source routed (or has any options). 2784 * Also, don't send redirect if forwarding using a default route 2785 * or a route modified by a redirect. 2786 */ 2787 RT_LOCK_SPIN(rt); 2788 if (rt->rt_ifp == m->m_pkthdr.rcvif && 2789 !(rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) && 2790 satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY && 2791 ipsendredirects && !srcrt && rt->rt_ifa != NULL) { 2792 struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa; 2793 u_int32_t src = ntohl(ip->ip_src.s_addr); 2794 2795 /* Become a regular mutex */ 2796 RT_CONVERT_LOCK(rt); 2797 IFA_LOCK_SPIN(&ia->ia_ifa); 2798 if ((src & ia->ia_subnetmask) == ia->ia_subnet) { 2799 if (rt->rt_flags & RTF_GATEWAY) 2800 dest = satosin(rt->rt_gateway)->sin_addr.s_addr; 2801 else 2802 dest = pkt_dst.s_addr; 2803 /* 2804 * Router requirements says to only send 2805 * host redirects. 
2806 */ 2807 type = ICMP_REDIRECT; 2808 code = ICMP_REDIRECT_HOST; 2809#if DIAGNOSTIC 2810 if (ipprintfs) 2811 printf("redirect (%d) to %lx\n", code, 2812 (u_int32_t)dest); 2813#endif 2814 } 2815 IFA_UNLOCK(&ia->ia_ifa); 2816 } 2817 RT_UNLOCK(rt); 2818 2819#if IPFIREWALL 2820 if (next_hop != NULL) { 2821 /* Pass IPFORWARD info if available */ 2822 struct m_tag *tag; 2823 struct ip_fwd_tag *ipfwd_tag; 2824 2825 tag = m_tag_create(KERNEL_MODULE_TAG_ID, 2826 KERNEL_TAG_TYPE_IPFORWARD, 2827 sizeof (*ipfwd_tag), M_NOWAIT, m); 2828 if (tag == NULL) { 2829 error = ENOBUFS; 2830 m_freem(m); 2831 goto done; 2832 } 2833 2834 ipfwd_tag = (struct ip_fwd_tag *)(tag+1); 2835 ipfwd_tag->next_hop = next_hop; 2836 2837 m_tag_prepend(m, tag); 2838 } 2839#endif /* IPFIREWALL */ 2840 2841 /* Mark this packet as being forwarded from another interface */ 2842 m->m_pkthdr.pkt_flags |= PKTF_FORWARDED; 2843 len = m_pktlen(m); 2844 2845 error = ip_output(m, NULL, &fwd_rt, IP_FORWARDING | IP_OUTARGS, 2846 NULL, &ipoa); 2847 2848 /* Refresh rt since the route could have changed while in IP */ 2849 rt = fwd_rt.ro_rt; 2850 2851 if (error != 0) { 2852 OSAddAtomic(1, &ipstat.ips_cantforward); 2853 } else { 2854 /* 2855 * Increment stats on the source interface; the ones 2856 * for destination interface has been taken care of 2857 * during output above by virtue of PKTF_FORWARDED. 2858 */ 2859 rcvifp->if_fpackets++; 2860 rcvifp->if_fbytes += len; 2861 2862 OSAddAtomic(1, &ipstat.ips_forward); 2863 if (type != 0) { 2864 OSAddAtomic(1, &ipstat.ips_redirectsent); 2865 } else { 2866 if (mcopy != NULL) { 2867 /* 2868 * If we didn't have to go thru ipflow and 2869 * the packet was successfully consumed by 2870 * ip_output, the mcopy is rather a waste; 2871 * this could be further optimized. 
2872 */ 2873 m_freem(mcopy); 2874 } 2875 goto done; 2876 } 2877 } 2878 if (mcopy == NULL) 2879 goto done; 2880 2881 switch (error) { 2882 case 0: /* forwarded, but need redirect */ 2883 /* type, code set above */ 2884 break; 2885 2886 case ENETUNREACH: /* shouldn't happen, checked above */ 2887 case EHOSTUNREACH: 2888 case ENETDOWN: 2889 case EHOSTDOWN: 2890 default: 2891 type = ICMP_UNREACH; 2892 code = ICMP_UNREACH_HOST; 2893 break; 2894 2895 case EMSGSIZE: 2896 type = ICMP_UNREACH; 2897 code = ICMP_UNREACH_NEEDFRAG; 2898 2899 if (rt == NULL) { 2900 break; 2901 } else { 2902 RT_LOCK_SPIN(rt); 2903 if (rt->rt_ifp != NULL) 2904 nextmtu = rt->rt_ifp->if_mtu; 2905 RT_UNLOCK(rt); 2906 } 2907#ifdef IPSEC 2908 if (ipsec_bypass) 2909 break; 2910 2911 /* 2912 * If the packet is routed over IPsec tunnel, tell the 2913 * originator the tunnel MTU. 2914 * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz 2915 * XXX quickhack!!! 2916 */ 2917 sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, 2918 IP_FORWARDING, &ipsecerror); 2919 2920 if (sp == NULL) 2921 break; 2922 2923 /* 2924 * find the correct route for outer IPv4 2925 * header, compute tunnel MTU. 
2926 */ 2927 nextmtu = 0; 2928 2929 if (sp->req != NULL && 2930 sp->req->saidx.mode == IPSEC_MODE_TUNNEL) { 2931 struct secasindex saidx; 2932 struct secasvar *sav; 2933 struct route *ro; 2934 struct ip *ipm; 2935 int ipsechdr; 2936 2937 /* count IPsec header size */ 2938 ipsechdr = ipsec_hdrsiz(sp); 2939 2940 ipm = mtod(mcopy, struct ip *); 2941 bcopy(&sp->req->saidx, &saidx, sizeof (saidx)); 2942 saidx.mode = sp->req->saidx.mode; 2943 saidx.reqid = sp->req->saidx.reqid; 2944 sin = SIN(&saidx.src); 2945 if (sin->sin_len == 0) { 2946 sin->sin_len = sizeof (*sin); 2947 sin->sin_family = AF_INET; 2948 sin->sin_port = IPSEC_PORT_ANY; 2949 bcopy(&ipm->ip_src, &sin->sin_addr, 2950 sizeof (sin->sin_addr)); 2951 } 2952 sin = SIN(&saidx.dst); 2953 if (sin->sin_len == 0) { 2954 sin->sin_len = sizeof (*sin); 2955 sin->sin_family = AF_INET; 2956 sin->sin_port = IPSEC_PORT_ANY; 2957 bcopy(&ipm->ip_dst, &sin->sin_addr, 2958 sizeof (sin->sin_addr)); 2959 } 2960 sav = key_allocsa_policy(&saidx); 2961 if (sav != NULL) { 2962 lck_mtx_lock(sadb_mutex); 2963 if (sav->sah != NULL) { 2964 ro = &sav->sah->sa_route; 2965 if (ro->ro_rt != NULL) { 2966 RT_LOCK(ro->ro_rt); 2967 if (ro->ro_rt->rt_ifp != NULL) { 2968 nextmtu = ro->ro_rt-> 2969 rt_ifp->if_mtu; 2970 nextmtu -= ipsechdr; 2971 } 2972 RT_UNLOCK(ro->ro_rt); 2973 } 2974 } 2975 key_freesav(sav, KEY_SADB_LOCKED); 2976 lck_mtx_unlock(sadb_mutex); 2977 } 2978 } 2979 key_freesp(sp, KEY_SADB_UNLOCKED); 2980#endif /* IPSEC */ 2981 break; 2982 2983 case ENOBUFS: 2984 /* 2985 * A router should not generate ICMP_SOURCEQUENCH as 2986 * required in RFC1812 Requirements for IP Version 4 Routers. 2987 * Source quench could be a big problem under DoS attacks, 2988 * or if the underlying interface is rate-limited. 2989 * Those who need source quench packets may re-enable them 2990 * via the net.inet.ip.sendsourcequench sysctl. 
2991 */ 2992 if (ip_sendsourcequench == 0) { 2993 m_freem(mcopy); 2994 goto done; 2995 } else { 2996 type = ICMP_SOURCEQUENCH; 2997 code = 0; 2998 } 2999 break; 3000 3001 case EACCES: /* ipfw denied packet */ 3002 m_freem(mcopy); 3003 goto done; 3004 } 3005 3006 if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG) 3007 OSAddAtomic(1, &ipstat.ips_cantfrag); 3008 3009 icmp_error(mcopy, type, code, dest, nextmtu); 3010done: 3011 ip_fwd_route_copyin(rcvifp, &fwd_rt); 3012} 3013 3014int 3015ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 3016 struct mbuf *m) 3017{ 3018 *mp = NULL; 3019 if (inp->inp_socket->so_options & SO_TIMESTAMP) { 3020 struct timeval tv; 3021 3022 getmicrotime(&tv); 3023 mp = sbcreatecontrol_mbuf((caddr_t)&tv, sizeof (tv), 3024 SCM_TIMESTAMP, SOL_SOCKET, mp); 3025 if (*mp == NULL) { 3026 goto no_mbufs; 3027 } 3028 } 3029 if (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) { 3030 uint64_t time; 3031 3032 time = mach_absolute_time(); 3033 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof (time), 3034 SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp); 3035 if (*mp == NULL) { 3036 goto no_mbufs; 3037 } 3038 } 3039 if (inp->inp_flags & INP_RECVDSTADDR) { 3040 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst, 3041 sizeof (struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp); 3042 if (*mp == NULL) { 3043 goto no_mbufs; 3044 } 3045 } 3046#ifdef notyet 3047 /* 3048 * XXX 3049 * Moving these out of udp_input() made them even more broken 3050 * than they already were. 
3051 */ 3052 /* options were tossed already */ 3053 if (inp->inp_flags & INP_RECVOPTS) { 3054 mp = sbcreatecontrol_mbuf((caddr_t)opts_deleted_above, 3055 sizeof (struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp); 3056 if (*mp == NULL) { 3057 goto no_mbufs; 3058 } 3059 } 3060 /* ip_srcroute doesn't do what we want here, need to fix */ 3061 if (inp->inp_flags & INP_RECVRETOPTS) { 3062 mp = sbcreatecontrol_mbuf((caddr_t)ip_srcroute(), 3063 sizeof (struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp); 3064 if (*mp == NULL) { 3065 goto no_mbufs; 3066 } 3067 } 3068#endif /* notyet */ 3069 if (inp->inp_flags & INP_RECVIF) { 3070 struct ifnet *ifp; 3071 uint8_t sdlbuf[SOCK_MAXADDRLEN + 1]; 3072 struct sockaddr_dl *sdl2 = SDL(&sdlbuf); 3073 3074 /* 3075 * Make sure to accomodate the largest possible 3076 * size of SA(if_lladdr)->sa_len. 3077 */ 3078 _CASSERT(sizeof (sdlbuf) == (SOCK_MAXADDRLEN + 1)); 3079 3080 ifnet_head_lock_shared(); 3081 if ((ifp = m->m_pkthdr.rcvif) != NULL && 3082 ifp->if_index && (ifp->if_index <= if_index)) { 3083 struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1]; 3084 struct sockaddr_dl *sdp; 3085 3086 if (!ifa || !ifa->ifa_addr) 3087 goto makedummy; 3088 3089 IFA_LOCK_SPIN(ifa); 3090 sdp = SDL(ifa->ifa_addr); 3091 /* 3092 * Change our mind and don't try copy. 
3093 */ 3094 if (sdp->sdl_family != AF_LINK) { 3095 IFA_UNLOCK(ifa); 3096 goto makedummy; 3097 } 3098 /* the above _CASSERT ensures sdl_len fits in sdlbuf */ 3099 bcopy(sdp, sdl2, sdp->sdl_len); 3100 IFA_UNLOCK(ifa); 3101 } else { 3102makedummy: 3103 sdl2->sdl_len = 3104 offsetof(struct sockaddr_dl, sdl_data[0]); 3105 sdl2->sdl_family = AF_LINK; 3106 sdl2->sdl_index = 0; 3107 sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; 3108 } 3109 ifnet_head_done(); 3110 mp = sbcreatecontrol_mbuf((caddr_t)sdl2, sdl2->sdl_len, 3111 IP_RECVIF, IPPROTO_IP, mp); 3112 if (*mp == NULL) { 3113 goto no_mbufs; 3114 } 3115 } 3116 if (inp->inp_flags & INP_RECVTTL) { 3117 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl, 3118 sizeof (ip->ip_ttl), IP_RECVTTL, IPPROTO_IP, mp); 3119 if (*mp == NULL) { 3120 goto no_mbufs; 3121 } 3122 } 3123 if (inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) { 3124 int tc = m_get_traffic_class(m); 3125 3126 mp = sbcreatecontrol_mbuf((caddr_t)&tc, sizeof (tc), 3127 SO_TRAFFIC_CLASS, SOL_SOCKET, mp); 3128 if (*mp == NULL) { 3129 goto no_mbufs; 3130 } 3131 } 3132 if (inp->inp_flags & INP_PKTINFO) { 3133 struct in_pktinfo pi; 3134 3135 bzero(&pi, sizeof (struct in_pktinfo)); 3136 bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof (struct in_addr)); 3137 pi.ipi_ifindex = (m != NULL && m->m_pkthdr.rcvif != NULL) ? 
3138 m->m_pkthdr.rcvif->if_index : 0; 3139 3140 mp = sbcreatecontrol_mbuf((caddr_t)&pi, 3141 sizeof (struct in_pktinfo), IP_RECVPKTINFO, IPPROTO_IP, mp); 3142 if (*mp == NULL) { 3143 goto no_mbufs; 3144 } 3145 } 3146 return (0); 3147 3148no_mbufs: 3149 ipstat.ips_pktdropcntrl++; 3150 return (ENOBUFS); 3151} 3152 3153#if MROUTING 3154int 3155ip_rsvp_init(struct socket *so) 3156{ 3157 if (so->so_type != SOCK_RAW || SOCK_PROTO(so) != IPPROTO_RSVP) 3158 return (EOPNOTSUPP); 3159 3160 if (ip_rsvpd != NULL) 3161 return (EADDRINUSE); 3162 3163 ip_rsvpd = so; 3164 /* 3165 * This may seem silly, but we need to be sure we don't over-increment 3166 * the RSVP counter, in case something slips up. 3167 */ 3168 if (!ip_rsvp_on) { 3169 ip_rsvp_on = 1; 3170 rsvp_on++; 3171 } 3172 3173 return (0); 3174} 3175 3176int 3177ip_rsvp_done(void) 3178{ 3179 ip_rsvpd = NULL; 3180 /* 3181 * This may seem silly, but we need to be sure we don't over-decrement 3182 * the RSVP counter, in case something slips up. 3183 */ 3184 if (ip_rsvp_on) { 3185 ip_rsvp_on = 0; 3186 rsvp_on--; 3187 } 3188 return (0); 3189} 3190#endif /* MROUTING */ 3191 3192static inline u_short 3193ip_cksum(struct mbuf *m, int hlen) 3194{ 3195 u_short sum; 3196 3197 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 3198 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); 3199 } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) && 3200 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) { 3201 /* 3202 * The packet arrived on an interface which isn't capable 3203 * of performing IP header checksum; compute it now. 
3204 */ 3205 sum = ip_cksum_hdr_in(m, hlen); 3206 } else { 3207 sum = 0; 3208 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | 3209 CSUM_IP_CHECKED | CSUM_IP_VALID); 3210 m->m_pkthdr.csum_data = 0xffff; 3211 } 3212 3213 if (sum != 0) 3214 OSAddAtomic(1, &ipstat.ips_badsum); 3215 3216 return (sum); 3217} 3218 3219static int 3220ip_getstat SYSCTL_HANDLER_ARGS 3221{ 3222#pragma unused(oidp, arg1, arg2) 3223 if (req->oldptr == USER_ADDR_NULL) 3224 req->oldlen = (size_t)sizeof (struct ipstat); 3225 3226 return (SYSCTL_OUT(req, &ipstat, MIN(sizeof (ipstat), req->oldlen))); 3227} 3228 3229void 3230ip_setsrcifaddr_info(struct mbuf *m, uint32_t src_idx, struct in_ifaddr *ia) 3231{ 3232 VERIFY(m->m_flags & M_PKTHDR); 3233 3234 /* 3235 * If the source ifaddr is specified, pick up the information 3236 * from there; otherwise just grab the passed-in ifindex as the 3237 * caller may not have the ifaddr available. 3238 */ 3239 if (ia != NULL) { 3240 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO; 3241 m->m_pkthdr.src_ifindex = ia->ia_ifp->if_index; 3242 } else { 3243 m->m_pkthdr.src_ifindex = src_idx; 3244 if (src_idx != 0) 3245 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO; 3246 } 3247} 3248 3249void 3250ip_setdstifaddr_info(struct mbuf *m, uint32_t dst_idx, struct in_ifaddr *ia) 3251{ 3252 VERIFY(m->m_flags & M_PKTHDR); 3253 3254 /* 3255 * If the destination ifaddr is specified, pick up the information 3256 * from there; otherwise just grab the passed-in ifindex as the 3257 * caller may not have the ifaddr available. 
3258 */ 3259 if (ia != NULL) { 3260 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO; 3261 m->m_pkthdr.dst_ifindex = ia->ia_ifp->if_index; 3262 } else { 3263 m->m_pkthdr.dst_ifindex = dst_idx; 3264 if (dst_idx != 0) 3265 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO; 3266 } 3267} 3268 3269int 3270ip_getsrcifaddr_info(struct mbuf *m, uint32_t *src_idx, uint32_t *iaf) 3271{ 3272 VERIFY(m->m_flags & M_PKTHDR); 3273 3274 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) 3275 return (-1); 3276 3277 if (src_idx != NULL) 3278 *src_idx = m->m_pkthdr.src_ifindex; 3279 3280 if (iaf != NULL) 3281 *iaf = 0; 3282 3283 return (0); 3284} 3285 3286int 3287ip_getdstifaddr_info(struct mbuf *m, uint32_t *dst_idx, uint32_t *iaf) 3288{ 3289 VERIFY(m->m_flags & M_PKTHDR); 3290 3291 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) 3292 return (-1); 3293 3294 if (dst_idx != NULL) 3295 *dst_idx = m->m_pkthdr.dst_ifindex; 3296 3297 if (iaf != NULL) 3298 *iaf = 0; 3299 3300 return (0); 3301} 3302 3303/* 3304 * Protocol input handler for IPPROTO_GRE. 3305 */ 3306void 3307gre_input(struct mbuf *m, int off) 3308{ 3309 gre_input_func_t fn = gre_input_func; 3310 3311 /* 3312 * If there is a registered GRE input handler, pass mbuf to it. 3313 */ 3314 if (fn != NULL) { 3315 lck_mtx_unlock(inet_domain_mutex); 3316 m = fn(m, off, (mtod(m, struct ip *))->ip_p); 3317 lck_mtx_lock(inet_domain_mutex); 3318 } 3319 3320 /* 3321 * If no matching tunnel that is up is found, we inject 3322 * the mbuf to raw ip socket to see if anyone picks it up. 3323 */ 3324 if (m != NULL) 3325 rip_input(m, off); 3326} 3327 3328/* 3329 * Private KPI for PPP/PPTP. 3330 */ 3331int 3332ip_gre_register_input(gre_input_func_t fn) 3333{ 3334 lck_mtx_lock(inet_domain_mutex); 3335 gre_input_func = fn; 3336 lck_mtx_unlock(inet_domain_mutex); 3337 3338 return (0); 3339} 3340