/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#define	_IP_VHL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/mcache.h>
#include <sys/socketvar.h>
#include <sys/kdebug.h>
#include <mach/mach_time.h>
#include <mach/sdt.h>

#include <machine/endian.h>
#include <dev/random/randomdev.h>

#include <kern/queue.h>
#include <kern/locks.h>
#include <libkern/OSAtomic.h>

#include <pexpert/pexpert.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/kpi_protocol.h>
#include <net/ntstat.h>
#include <net/dlil.h>
#include <net/classq/classq.h>
#if PF
#include <net/pfvar.h>
#endif /* PF */

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_arp.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_divert.h>
#include <netinet/kpi_ipfilter_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/bootp.h>
#include <netinet/lro_ext.h>

#if DUMMYNET
#include <netinet/ip_dummynet.h>
#endif /* DUMMYNET */

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#if IPSEC
#include <netinet6/ipsec.h>
#include <netkey/key.h>
#endif /* IPSEC */

#define	DBG_LAYER_BEG		NETDBG_CODE(DBG_NETIP, 0)
#define	DBG_LAYER_END		NETDBG_CODE(DBG_NETIP, 2)
#define	DBG_FNC_IP_INPUT	NETDBG_CODE(DBG_NETIP, (2 << 8))

#if IPSEC
extern int ipsec_bypass;
extern lck_mtx_t *sadb_mutex;

lck_grp_t *sadb_stat_mutex_grp;
lck_grp_attr_t *sadb_stat_mutex_grp_attr;
lck_attr_t *sadb_stat_mutex_attr;
decl_lck_mtx_data(, sadb_stat_mutex_data);
lck_mtx_t *sadb_stat_mutex = &sadb_stat_mutex_data;
#endif /* IPSEC */

MBUFQ_HEAD(fq_head);

static int frag_timeout_run;		/* frag timer is scheduled to run */
static void frag_timeout(void *);
static void frag_sched_timeout(void);

static struct ipq *ipq_alloc(int);
static void ipq_free(struct ipq *);
static void ipq_updateparams(void);

decl_lck_mtx_data(static, ipqlock);
static lck_attr_t *ipqlock_attr;
static lck_grp_t *ipqlock_grp;
static lck_grp_attr_t *ipqlock_grp_attr;

/* Packet reassembly stuff */
#define	IPREASS_NHASH_LOG2	6
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define	IPREASS_HMASK		(IPREASS_NHASH - 1)
#define	IPREASS_HASH(x, y) \
	(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
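
/*
 * Example: with IPREASS_NHASH_LOG2 == 6 there are 64 reassembly buckets.
 * The hash mixes the low nibbles of two octets of the (network-order)
 * source address into an 8-bit value, XORs in the 16-bit IP ID, and masks
 * with IPREASS_HMASK to select a bucket; a collision merely lengthens the
 * TAILQ walk performed in ip_reass().
 */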
/* IP fragment reassembly queues (protected by ipqlock) */
static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; /* ip reassembly queues */
static int maxnipq;		/* max packets in reass queues */
static u_int32_t maxfragsperpacket; /* max frags/packet in reass queues */
static u_int32_t nipq;		/* # of packets in reass queues */
static u_int32_t ipq_limit;	/* ipq allocation limit */
static u_int32_t ipq_count;	/* current # of allocated ipq's */

static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS;
static int sysctl_maxnipq SYSCTL_HANDLER_ARGS;
static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS;

int ipforwarding = 0;
SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0,
	sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces");

static int ipsendredirects = 1; /* XXX */
SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0,
	"Enable sending IP redirects");

int ip_defttl = IPDEFTTL;
SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED,
	&ip_defttl, 0, "Maximum TTL on IP packets");

static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0,
	"Enable forwarding source routed IP packets");

static int ip_acceptsourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0,
	"Enable accepting source routed IP packets");

static int ip_sendsourcequench = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ip_sendsourcequench, 0,
	"Enable the transmission of source quench packets");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, sysctl_maxnipq,
	"I", "Maximum number of IPv4 fragment reassembly queue entries");

SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD | CTLFLAG_LOCKED,
	&nipq, 0, "Current number of IPv4 fragment reassembly queue entries");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0,
	sysctl_maxfragsperpacket, "I",
	"Maximum number of IPv4 fragments allowed per packet");

int ip_doscopedroute = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED,
	&ip_doscopedroute, 0, "Enable IPv4 scoped routing");

static uint32_t ip_adj_clear_hwcksum = 0;
SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0,
	"Invalidate hwcksum info when adjusting length");

/*
 * XXX - Setting ip_checkinterface mostly implements the receive side of
 * the Strong ES model described in RFC 1122, but since the routing table
 * and transmit implementation do not implement the Strong ES model,
 * setting this to 1 results in an odd hybrid.
 *
 * XXX - ip_checkinterface currently must be disabled if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be disabled if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 */
static int ip_checkinterface = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED,
	&ip_checkinterface, 0, "Verify packet arrives on correct interface");

#if DIAGNOSTIC
static int ipprintfs = 0;
#endif

struct protosw *ip_protox[IPPROTO_MAX];

static lck_grp_attr_t	*in_ifaddr_rwlock_grp_attr;
static lck_grp_t	*in_ifaddr_rwlock_grp;
static lck_attr_t	*in_ifaddr_rwlock_attr;
decl_lck_rw_data(, in_ifaddr_rwlock_data);
lck_rw_t		*in_ifaddr_rwlock = &in_ifaddr_rwlock_data;

/* Protected by in_ifaddr_rwlock */
struct in_ifaddrhead in_ifaddrhead;		/* first inet address */
struct in_ifaddrhashhead *in_ifaddrhashtbl;	/* inet addr hash table */

#define	INADDR_NHASH	61
static u_int32_t inaddr_nhash;			/* hash table size */
static u_int32_t inaddr_hashp;			/* next largest prime */

static int ip_getstat SYSCTL_HANDLER_ARGS;
struct ipstat ipstat;
SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats,
	CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
	0, 0, ip_getstat, "S,ipstat",
	"IP statistics (struct ipstat, netinet/ip_var.h)");

#if IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED,
	&ip_mtu, 0, "Default MTU");
#endif /* IPCTL_DEFMTU */

#if IPSTEALTH
static int ipstealth = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED,
	&ipstealth, 0, "");
#endif /* IPSTEALTH */

/* Firewall hooks */
#if IPFIREWALL
ip_fw_chk_t *ip_fw_chk_ptr;
int fw_enable = 1;
int fw_bypass = 1;
int fw_one_pass = 0;
#endif /* IPFIREWALL */

#if DUMMYNET
ip_dn_io_t *ip_dn_io_ptr;
#endif /* DUMMYNET */

SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal,
	CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local");

struct ip_linklocal_stat ip_linklocal_stat;
SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat,
	CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat,
	"Number of link local packets with TTL less than 255");

SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in,
	CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input");

int ip_linklocal_in_allowbadttl = 1;
SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0,
	"Allow incoming link local packets with TTL less than 255");


/*
 * We need to save the IP options in case a protocol wants to respond
 * to an incoming packet over the same route if the packet got here
 * using IP source routing.  This allows connection establishment and
 * maintenance when the remote end is on a network that is not known
 * to us.
 */
static int	ip_nhops = 0;
static struct ip_srcrt {
	struct in_addr dst;			/* final destination */
	char nop;				/* one NOP to align */
	char srcopt[IPOPT_OFFSET + 1];		/* OPTVAL, OLEN and OFFSET */
	struct in_addr route[MAX_IPOPTLEN / sizeof (struct in_addr)];
} ip_srcrt;

static void in_ifaddrhashtbl_init(void);
static void save_rte(u_char *, struct in_addr);
static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *);
static void ip_forward(struct mbuf *, int, struct sockaddr_in *);
static void frag_freef(struct ipqhead *, struct ipq *);
#if IPDIVERT
#ifdef IPDIVERT_44
static struct mbuf *ip_reass(struct mbuf *, u_int32_t *, u_int16_t *);
#else /* !IPDIVERT_44 */
static struct mbuf *ip_reass(struct mbuf *, u_int16_t *, u_int16_t *);
#endif /* !IPDIVERT_44 */
#else /* !IPDIVERT */
static struct mbuf *ip_reass(struct mbuf *);
#endif /* !IPDIVERT */
static void ip_fwd_route_copyout(struct ifnet *, struct route *);
static void ip_fwd_route_copyin(struct ifnet *, struct route *);
static inline u_short ip_cksum(struct mbuf *, int);

int ip_use_randomid = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED,
	&ip_use_randomid, 0, "Randomize IP packet IDs");

/*
 * On platforms which require strict alignment (currently for anything but
 * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not,
 * copy the contents of the mbuf chain into a new chain, and free the original
 * one.  Create some head room in the first mbuf of the new chain, in case
 * it's needed later on.
 */
#if defined(__i386__) || defined(__x86_64__)
#define	IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0)
#else /* !__i386__ && !__x86_64__ */
#define	IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do {			\
	if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) {			\
		struct mbuf *_n;					\
		struct ifnet *__ifp = (_ifp);				\
		atomic_add_64(&(__ifp)->if_alignerrs, 1);		\
		if (((_m)->m_flags & M_PKTHDR) &&			\
		    (_m)->m_pkthdr.pkt_hdr != NULL)			\
			(_m)->m_pkthdr.pkt_hdr = NULL;			\
		_n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT);	\
		if (_n == NULL) {					\
			atomic_add_32(&ipstat.ips_toosmall, 1);		\
			m_freem(_m);					\
			(_m) = NULL;					\
			_action;					\
		} else {						\
			VERIFY(_n != (_m));				\
			(_m) = _n;					\
		}							\
	}								\
} while (0)
#endif /* !__i386__ && !__x86_64__ */

/*
 * GRE input handler function, settable via ip_gre_register_input() for PPTP.
 */
static gre_input_func_t gre_input_func;

/*
 * IP initialization: fill in IP protocol switch table.
 * All protocols not implemented in kernel go to raw IP protocol handler.
 */
void
ip_init(struct protosw *pp, struct domain *dp)
{
	static int ip_initialized = 0;
	struct protosw *pr;
	struct timeval tv;
	int i;

	domain_proto_mtx_lock_assert_held();
	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* ipq_alloc() uses mbufs for IP fragment queue structures */
	_CASSERT(sizeof (struct ipq) <= _MLEN);

	/*
	 * Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is
	 * interchangeable with in_aliasreq; they must have the same size.
	 */
	_CASSERT(sizeof (struct ifaliasreq) == sizeof (struct in_aliasreq));

	if (ip_initialized)
		return;
	ip_initialized = 1;

	PE_parse_boot_argn("net.inet.ip.scopedroute",
	    &ip_doscopedroute, sizeof (ip_doscopedroute));

	in_ifaddr_init();

	in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init();
	in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock",
	    in_ifaddr_rwlock_grp_attr);
	in_ifaddr_rwlock_attr = lck_attr_alloc_init();
	lck_rw_init(in_ifaddr_rwlock, in_ifaddr_rwlock_grp,
	    in_ifaddr_rwlock_attr);

	TAILQ_INIT(&in_ifaddrhead);
	in_ifaddrhashtbl_init();

	ip_moptions_init();

	pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW);
	if (pr == NULL) {
		panic("%s: Unable to find [PF_INET,IPPROTO_RAW,SOCK_RAW]\n",
		    __func__);
		/* NOTREACHED */
	}

	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
	for (i = 0; i < IPPROTO_MAX; i++)
		ip_protox[i] = pr;
	/*
	 * Cycle through IP protocols and put them into the appropriate place
	 * in ip_protox[], skipping protocols IPPROTO_{IP,RAW}.
	 */
	VERIFY(dp == inetdomain && dp->dom_family == PF_INET);
	TAILQ_FOREACH(pr, &dp->dom_protosw, pr_entry) {
		VERIFY(pr->pr_domain == dp);
		if (pr->pr_protocol != 0 && pr->pr_protocol != IPPROTO_RAW) {
			/* Be careful to only index valid IP protocols. */
			if (pr->pr_protocol < IPPROTO_MAX)
				ip_protox[pr->pr_protocol] = pr;
		}
	}

	/* IP fragment reassembly queue lock */
	ipqlock_grp_attr = lck_grp_attr_alloc_init();
	ipqlock_grp = lck_grp_alloc_init("ipqlock", ipqlock_grp_attr);
	ipqlock_attr = lck_attr_alloc_init();
	lck_mtx_init(&ipqlock, ipqlock_grp, ipqlock_attr);

	lck_mtx_lock(&ipqlock);
	/* Initialize IP reassembly queue. */
	for (i = 0; i < IPREASS_NHASH; i++)
		TAILQ_INIT(&ipq[i]);

	maxnipq = nmbclusters / 32;
	maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */
	ipq_updateparams();
	lck_mtx_unlock(&ipqlock);

	getmicrotime(&tv);
	ip_id = RandomULong() ^ tv.tv_usec;
	ip_initid();

	ipf_init();

#if IPSEC
	sadb_stat_mutex_grp_attr = lck_grp_attr_alloc_init();
	sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat",
	    sadb_stat_mutex_grp_attr);
	sadb_stat_mutex_attr = lck_attr_alloc_init();
	lck_mtx_init(sadb_stat_mutex, sadb_stat_mutex_grp,
	    sadb_stat_mutex_attr);

#endif
	arp_init();
}

/*
 * Initialize IPv4 source address hash table.
 */
static void
in_ifaddrhashtbl_init(void)
{
	int i, k, p;

	if (in_ifaddrhashtbl != NULL)
		return;

	PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash,
	    sizeof (inaddr_nhash));
	if (inaddr_nhash == 0)
		inaddr_nhash = INADDR_NHASH;

	MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *,
	    inaddr_nhash * sizeof (*in_ifaddrhashtbl),
	    M_IFADDR, M_WAITOK | M_ZERO);
	if (in_ifaddrhashtbl == NULL)
		panic("in_ifaddrhashtbl_init allocation failed");
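
	/*
	 * inaddr_nhash itself need not be prime; the uniform spread comes
	 * from multiplying the key by a prime in inaddr_hashval() below.
	 * A naive trial-division search (odd candidates only) suffices to
	 * find that prime, since this runs once and inaddr_nhash is small.
	 */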
	/*
	 * Generate the smallest prime greater than inaddr_nhash.
	 */
	k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2;
	for (;;) {
		p = 1;
		for (i = 3; i * i <= k; i += 2) {
			if (k % i == 0)
				p = 0;
		}
		if (p == 1)
			break;
		k += 2;
	}
	inaddr_hashp = k;
}

u_int32_t
inaddr_hashval(u_int32_t key)
{
	/*
	 * The hash index is the computed prime times the key modulo
	 * the hash size, as documented in "Introduction to Algorithms"
	 * (Cormen, Leiserson, Rivest).
	 */
	if (inaddr_nhash > 1)
		return ((key * inaddr_hashp) % inaddr_nhash);
	else
		return (0);
}

void
ip_proto_dispatch_in_wrapper(struct mbuf *m, int hlen, u_int8_t proto)
{
	ip_proto_dispatch_in(m, hlen, proto, 0);
}

__private_extern__ void
ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto,
    ipfilter_t inject_ipfref)
{
	struct ipfilter *filter;
	int seen = (inject_ipfref == NULL);
	int changed_header = 0;
	struct ip *ip;
	void (*pr_input)(struct mbuf *, int len);

	if (!TAILQ_EMPTY(&ipv4_filters)) {
		ipf_ref();
		TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
			if (seen == 0) {
				if ((struct ipfilter *)inject_ipfref == filter)
					seen = 1;
			} else if (filter->ipf_filter.ipf_input) {
				errno_t result;

				if (changed_header == 0) {
					/*
					 * Perform IP header alignment fixup,
					 * if needed, before passing packet
					 * into filter(s).
					 */
					IP_HDR_ALIGNMENT_FIXUP(m,
					    m->m_pkthdr.rcvif, ipf_unref());

					/* ipf_unref() already called */
					if (m == NULL)
						return;

					changed_header = 1;
					ip = mtod(m, struct ip *);
					ip->ip_len = htons(ip->ip_len + hlen);
					ip->ip_off = htons(ip->ip_off);
					ip->ip_sum = 0;
					ip->ip_sum = ip_cksum_hdr_in(m, hlen);
				}
				result = filter->ipf_filter.ipf_input(
				    filter->ipf_filter.cookie, (mbuf_t *)&m,
				    hlen, proto);
				if (result == EJUSTRETURN) {
					ipf_unref();
					return;
				}
				if (result != 0) {
					ipf_unref();
					m_freem(m);
					return;
				}
			}
		}
		ipf_unref();
	}

	/* Perform IP header alignment fixup (post-filters), if needed */
	IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return);

	/*
	 * If there isn't a specific lock for the protocol
	 * we're about to call, use the generic lock for AF_INET;
	 * otherwise let the protocol deal with its own locking.
	 */
	ip = mtod(m, struct ip *);

	if (changed_header) {
		ip->ip_len = ntohs(ip->ip_len) - hlen;
		ip->ip_off = ntohs(ip->ip_off);
	}

	if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) {
		m_freem(m);
	} else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) {
		lck_mtx_lock(inet_domain_mutex);
		pr_input(m, hlen);
		lck_mtx_unlock(inet_domain_mutex);
	} else {
		pr_input(m, hlen);
	}
}
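
/*
 * Note on ip_proto_dispatch_in() above: ipv4_filters see the datagram in
 * wire format, i.e. ip_len restored to include the header, ip_len/ip_off
 * in network byte order, and a freshly computed header checksum; the
 * fields are converted back to host representation before the packet is
 * handed to the protocol's pr_input routine.
 */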
/*
 * IP input routine.  Checksum and byte swap header.  If fragmented,
 * try to reassemble.  Process options.  Pass to next level.
 */
void
ip_input(struct mbuf *m)
{
	struct ip *ip;
	struct in_ifaddr *ia = NULL;
	unsigned int hlen, checkif;
	u_short sum = 0;
	struct in_addr pkt_dst;
#if IPFIREWALL
	int i;
	u_int32_t div_info = 0;		/* packet divert/tee info */
#endif
#if IPFIREWALL || DUMMYNET
	struct ip_fw_args args;
	struct m_tag *tag;
#endif
	ipfilter_t inject_filter_ref = NULL;
	struct ifnet *inifp;

	/* Check if the mbuf is still valid after interface filter processing */
	MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
	inifp = m->m_pkthdr.rcvif;
	VERIFY(inifp != NULL);

	/* Perform IP header alignment fixup, if needed */
	IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);

	m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;

#if IPFIREWALL || DUMMYNET
	bzero(&args, sizeof (struct ip_fw_args));

	/*
	 * Don't bother searching for tag(s) if there's none.
	 */
	if (SLIST_EMPTY(&m->m_pkthdr.tags))
		goto ipfw_tags_done;

	/* Grab info from mtags prepended to the chain */
#if DUMMYNET
	if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
		struct dn_pkt_tag *dn_tag;

		dn_tag = (struct dn_pkt_tag *)(tag+1);
		args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule;
		args.fwa_pf_rule = dn_tag->dn_pf_rule;

		m_tag_delete(m, tag);
	}
#endif /* DUMMYNET */

#if IPDIVERT
	if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
		struct divert_tag *div_tag;

		div_tag = (struct divert_tag *)(tag+1);
		args.fwa_divert_rule = div_tag->cookie;

		m_tag_delete(m, tag);
	}
#endif

	if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
		struct ip_fwd_tag *ipfwd_tag;

		ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
		args.fwa_next_hop = ipfwd_tag->next_hop;

		m_tag_delete(m, tag);
	}

#if DIAGNOSTIC
	if (m == NULL || !(m->m_flags & M_PKTHDR))
		panic("ip_input no HDR");
#endif

#if DUMMYNET
	if (args.fwa_ipfw_rule || args.fwa_pf_rule) {
		/* dummynet already filtered us */
		ip = mtod(m, struct ip *);
		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
		inject_filter_ref = ipf_get_inject_filter(m);
#if IPFIREWALL
		if (args.fwa_ipfw_rule)
			goto iphack;
#endif /* IPFIREWALL */
		if (args.fwa_pf_rule)
			goto check_with_pf;
	}
#endif /* DUMMYNET */
ipfw_tags_done:
#endif /* IPFIREWALL || DUMMYNET */
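
	/*
	 * A packet re-injected by a filter carries a reference to that
	 * filter; ip_proto_dispatch_in() resumes at the filter following
	 * it (see the 'seen' logic there), so a re-injected packet is not
	 * run through filters that have already seen it.
	 */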
	/*
	 * No need to process packet twice if we've already seen it.
	 */
	if (!SLIST_EMPTY(&m->m_pkthdr.tags))
		inject_filter_ref = ipf_get_inject_filter(m);
	if (inject_filter_ref != NULL) {
		ip = mtod(m, struct ip *);
		hlen = IP_VHL_HL(ip->ip_vhl) << 2;

		DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, inifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);

		ip->ip_len = ntohs(ip->ip_len) - hlen;
		ip->ip_off = ntohs(ip->ip_off);
		ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
		return;
	}

	OSAddAtomic(1, &ipstat.ips_total);
	if (m->m_pkthdr.len < sizeof (struct ip))
		goto tooshort;

	if (m->m_len < sizeof (struct ip) &&
	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
		OSAddAtomic(1, &ipstat.ips_toosmall);
		return;
	}
	ip = mtod(m, struct ip *);

	KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
	    ip->ip_p, ip->ip_off, ip->ip_len);

	if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
		OSAddAtomic(1, &ipstat.ips_badvers);
		goto bad;
	}

	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
	if (hlen < sizeof (struct ip)) {	/* minimum header length */
		OSAddAtomic(1, &ipstat.ips_badhlen);
		goto bad;
	}
	if (hlen > m->m_len) {
		if ((m = m_pullup(m, hlen)) == NULL) {
			OSAddAtomic(1, &ipstat.ips_badhlen);
			return;
		}
		ip = mtod(m, struct ip *);
	}

	/* 127/8 must not appear on wire - RFC1122 */
	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
		/*
		 * Allow for the following exceptions:
		 *
		 * 1. If the packet was sent to loopback (i.e. rcvif
		 *    would have been set earlier at output time.)
		 *
		 * 2. If the packet was sent out on loopback from a local
		 *    source address which belongs to a non-loopback
		 *    interface (i.e. rcvif may not necessarily be a
		 *    loopback interface, hence the test for PKTF_LOOP.)
		 *    Unlike IPv6, there is no interface scope ID, and
		 *    therefore we don't care so much about PKTF_IFINFO.
		 */
		if (!(inifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			OSAddAtomic(1, &ipstat.ips_badaddr);
			goto bad;
		}
	}

	/* IPv4 Link-Local Addresses as defined in RFC3927 */
	if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
	    IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
		ip_linklocal_stat.iplls_in_total++;
		if (ip->ip_ttl != MAXTTL) {
			OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
			/* Silently drop link local traffic with bad TTL */
			if (!ip_linklocal_in_allowbadttl)
				goto bad;
		}
	}

	sum = ip_cksum(m, hlen);
	if (sum) {
		goto bad;
	}

	DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
	    struct ip *, ip, struct ifnet *, inifp,
	    struct ip *, ip, struct ip6_hdr *, NULL);

	/*
	 * Naively assume we can attribute inbound data to the route we would
	 * use to send to this destination.  Asymmetric routing breaks this
	 * assumption, but it still allows us to account for traffic from
	 * a remote node in the routing table.
	 * This has a very significant performance impact, so we bypass
	 * it if nstat_collect is disabled.  We may also bypass it if the
	 * protocol is TCP in the future, because TCP will have a route that
	 * we can use to attribute the data to.  That does mean we would not
	 * account for forwarded TCP traffic.
	 */
	if (nstat_collect) {
		struct rtentry *rt =
		    ifnet_cached_rtlookup_inet(inifp, ip->ip_src);
		if (rt != NULL) {
			nstat_route_rx(rt, 1, m->m_pkthdr.len, 0);
			rtfree(rt);
		}
	}

	/*
	 * Convert fields to host representation.
	 */
#if BYTE_ORDER != BIG_ENDIAN
	NTOHS(ip->ip_len);
#endif

	if (ip->ip_len < hlen) {
		OSAddAtomic(1, &ipstat.ips_badlen);
		goto bad;
	}

#if BYTE_ORDER != BIG_ENDIAN
	NTOHS(ip->ip_off);
#endif
	/*
	 * Check that the amount of data in the buffers
	 * is at least as much as the IP header would have us expect.
	 * Trim mbufs if longer than we expect.
	 * Drop packet if shorter than we expect.
	 */
	if (m->m_pkthdr.len < ip->ip_len) {
tooshort:
		OSAddAtomic(1, &ipstat.ips_tooshort);
		goto bad;
	}
	if (m->m_pkthdr.len > ip->ip_len) {
		/*
		 * Invalidate hardware checksum info if ip_adj_clear_hwcksum
		 * is set; useful to handle buggy drivers.  Note that this
		 * should not be enabled by default, as we may get here due
		 * to link-layer padding.
		 */
		if (ip_adj_clear_hwcksum &&
		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
		    !(inifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
			m->m_pkthdr.csum_data = 0;
			ipstat.ips_adj_hwcsum_clr++;
		}

		ipstat.ips_adj++;
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = ip->ip_len;
			m->m_pkthdr.len = ip->ip_len;
		} else
			m_adj(m, ip->ip_len - m->m_pkthdr.len);
	}

	/* for consistency */
	m->m_pkthdr.pkt_proto = ip->ip_p;

#if DUMMYNET
check_with_pf:
#endif
#if PF
	/* Invoke inbound packet filter */
	if (PF_IS_ENABLED) {
		int error;
#if DUMMYNET
		error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args);
#else
		error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
#endif /* DUMMYNET */
		if (error != 0 || m == NULL) {
			if (m != NULL) {
				panic("%s: unexpected packet %p\n",
				    __func__, m);
				/* NOTREACHED */
			}
			/* Already freed by callee */
			return;
		}
		ip = mtod(m, struct ip *);
		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
	}
#endif /* PF */

#if IPSEC
	if (ipsec_bypass == 0 && ipsec_gethist(m, NULL))
		goto pass;
#endif
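
	/*
	 * A packet carrying IPsec history (i.e. one that was just
	 * decapsulated by the IPsec machinery) skips the firewall pass
	 * below and goes straight to option processing.
	 */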
#if IPFIREWALL
#if DUMMYNET
iphack:
#endif /* DUMMYNET */
	/*
	 * Check if we want to allow this packet to be processed.
	 * Consider it to be bad if not.
	 */
	if (fw_enable && IPFW_LOADED) {
#if IPFIREWALL_FORWARD
		/*
		 * If we've been forwarded from the output side, then
		 * skip the firewall a second time
		 */
		if (args.fwa_next_hop)
			goto ours;
#endif /* IPFIREWALL_FORWARD */

		args.fwa_m = m;

		i = ip_fw_chk_ptr(&args);
		m = args.fwa_m;

		if ((i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */
			if (m)
				m_freem(m);
			return;
		}
		ip = mtod(m, struct ip *);	/* just in case m changed */

		if (i == 0 && args.fwa_next_hop == NULL) {	/* common case */
			goto pass;
		}
#if DUMMYNET
		if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) {
			/* Send packet to the appropriate pipe */
			ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args,
			    DN_CLIENT_IPFW);
			return;
		}
#endif /* DUMMYNET */
#if IPDIVERT
		if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) {
			/* Divert or tee packet */
			div_info = i;
			goto ours;
		}
#endif
#if IPFIREWALL_FORWARD
		if (i == 0 && args.fwa_next_hop != NULL) {
			goto pass;
		}
#endif
		/*
		 * if we get here, the packet must be dropped
		 */
		m_freem(m);
		return;
	}
#endif /* IPFIREWALL */
#if IPSEC | IPFIREWALL
pass:
#endif
	/*
	 * Process options and, if not destined for us,
	 * ship it on.  ip_dooptions returns 1 when an
	 * error was detected (causing an icmp message
	 * to be sent and the original packet to be freed).
	 */
	ip_nhops = 0;		/* for source routed packets */
#if IPFIREWALL
	if (hlen > sizeof (struct ip) &&
	    ip_dooptions(m, 0, args.fwa_next_hop)) {
#else /* !IPFIREWALL */
	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL)) {
#endif /* !IPFIREWALL */
		return;
	}

	/*
	 * Check our list of addresses, to see if the packet is for us.
	 * If we don't have any addresses, assume any unicast packet
	 * we receive might be for us (and let the upper layers deal
	 * with it).
	 */
	if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST|M_BCAST))) {
		ip_setdstifaddr_info(m, inifp->if_index, NULL);
		goto ours;
	}

	/*
	 * Cache the destination address of the packet; this may be
	 * changed by use of 'ipfw fwd'.
	 */
#if IPFIREWALL
	pkt_dst = args.fwa_next_hop == NULL ?
	    ip->ip_dst : args.fwa_next_hop->sin_addr;
#else /* !IPFIREWALL */
	pkt_dst = ip->ip_dst;
#endif /* !IPFIREWALL */

	/*
	 * Enable a consistency check between the destination address
	 * and the arrival interface for a unicast packet (the RFC 1122
	 * strong ES model) if IP forwarding is disabled and the packet
	 * is not locally generated and the packet is not subject to
	 * 'ipfw fwd'.
	 *
	 * XXX - Checking also should be disabled if the destination
	 * address is ipnat'ed to a different interface.
	 *
	 * XXX - Checking is incompatible with IP aliases added
	 * to the loopback interface instead of the interface where
	 * the packets are received.
	 */
	checkif = ip_checkinterface && (ipforwarding == 0) &&
	    !(inifp->if_flags & IFF_LOOPBACK) &&
	    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)
#if IPFIREWALL
	    && (args.fwa_next_hop == NULL);
#else /* !IPFIREWALL */
	    ;
#endif /* !IPFIREWALL */
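
	/*
	 * With checkif enabled, a unicast packet addressed to one
	 * interface's address but arriving on another fails the match
	 * below; since checking requires forwarding to be off, such a
	 * packet ends up counted in ips_cantforward and dropped.
	 */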
	/*
	 * Check for exact addresses in the hash bucket.
	 */
	lck_rw_lock_shared(in_ifaddr_rwlock);
	TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
		/*
		 * If the address matches, verify that the packet
		 * arrived via the correct interface if checking is
		 * enabled.
		 */
		if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
		    (!checkif || ia->ia_ifp == inifp)) {
			ip_setdstifaddr_info(m, 0, ia);
			lck_rw_done(in_ifaddr_rwlock);
			goto ours;
		}
	}
	lck_rw_done(in_ifaddr_rwlock);

	/*
	 * Check for broadcast addresses.
	 *
	 * Only accept broadcast packets that arrive via the matching
	 * interface.  Reception of forwarded directed broadcasts would be
	 * handled via ip_forward() and ether_frameout() with the loopback
	 * into the stack for SIMPLEX interfaces handled by ether_frameout().
	 */
	if (inifp->if_flags & IFF_BROADCAST) {
		struct ifaddr *ifa;

		ifnet_lock_shared(inifp);
		TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != AF_INET) {
				continue;
			}
			ia = ifatoia(ifa);
			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
			    pkt_dst.s_addr || ia->ia_netbroadcast.s_addr ==
			    pkt_dst.s_addr) {
				ip_setdstifaddr_info(m, 0, ia);
				ifnet_lock_done(inifp);
				goto ours;
			}
		}
		ifnet_lock_done(inifp);
	}

	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
		struct in_multi *inm;
		/*
		 * See if we belong to the destination multicast group on the
		 * arrival interface.
		 */
		in_multihead_lock_shared();
		IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
		in_multihead_lock_done();
		if (inm == NULL) {
			OSAddAtomic(1, &ipstat.ips_notmember);
			m_freem(m);
			return;
		}
		ip_setdstifaddr_info(m, inifp->if_index, NULL);
		INM_REMREF(inm);
		goto ours;
	}
	if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST ||
	    ip->ip_dst.s_addr == INADDR_ANY) {
		ip_setdstifaddr_info(m, inifp->if_index, NULL);
		goto ours;
	}

	/* Allow DHCP/BootP responses through */
	if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
	    hlen == sizeof (struct ip) && ip->ip_p == IPPROTO_UDP) {
		struct udpiphdr *ui;

		if (m->m_len < sizeof (struct udpiphdr) &&
		    (m = m_pullup(m, sizeof (struct udpiphdr))) == NULL) {
			OSAddAtomic(1, &udpstat.udps_hdrops);
			return;
		}
		ui = mtod(m, struct udpiphdr *);
		if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
			ip_setdstifaddr_info(m, inifp->if_index, NULL);
			goto ours;
		}
		ip = mtod(m, struct ip *);	/* in case it changed */
	}

	/*
	 * Not for us; forward if possible and desirable.
	 */
	if (ipforwarding == 0) {
		OSAddAtomic(1, &ipstat.ips_cantforward);
		m_freem(m);
	} else {
#if IPFIREWALL
		ip_forward(m, 0, args.fwa_next_hop);
#else
		ip_forward(m, 0, NULL);
#endif
	}
	return;

ours:
	/*
	 * If offset or IP_MF are set, must reassemble.
	 */
	if (ip->ip_off & ~(IP_DF | IP_RF)) {
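		/*
		 * ip_off is in host byte order by this point, so the
		 * fragment offset and IP_MF bit can be tested directly
		 * once the IP_DF and IP_RF flag bits are masked off.
		 */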
		/*
		 * ip_reass() will return a different mbuf, and update
		 * the divert info in div_info and args.fwa_divert_rule.
		 */
#if IPDIVERT
		m = ip_reass(m, (u_int16_t *)&div_info, &args.fwa_divert_rule);
#else
		m = ip_reass(m);
#endif
		if (m == NULL)
			return;
		ip = mtod(m, struct ip *);
		/* Get the header length of the reassembled packet */
		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
#if IPDIVERT
		/* Restore original checksum before diverting packet */
		if (div_info != 0) {
#if BYTE_ORDER != BIG_ENDIAN
			HTONS(ip->ip_len);
			HTONS(ip->ip_off);
#endif
			ip->ip_sum = 0;
			ip->ip_sum = ip_cksum_hdr_in(m, hlen);
#if BYTE_ORDER != BIG_ENDIAN
			NTOHS(ip->ip_off);
			NTOHS(ip->ip_len);
#endif
		}
#endif
	}

	/*
	 * Further protocols expect the packet length to be w/o the
	 * IP header.
	 */
	ip->ip_len -= hlen;

#if IPDIVERT
	/*
	 * Divert or tee packet to the divert protocol if required.
	 *
	 * If div_info is zero then cookie should be too, so we shouldn't
	 * need to clear them here.  Assume divert_packet() does so also.
	 */
	if (div_info != 0) {
		struct mbuf *clone = NULL;

		/* Clone packet if we're doing a 'tee' */
		if (div_info & IP_FW_PORT_TEE_FLAG)
			clone = m_dup(m, M_DONTWAIT);

		/* Restore packet header fields to original values */
		ip->ip_len += hlen;

#if BYTE_ORDER != BIG_ENDIAN
		HTONS(ip->ip_len);
		HTONS(ip->ip_off);
#endif
		/* Deliver packet to divert input routine */
		OSAddAtomic(1, &ipstat.ips_delivered);
		divert_packet(m, 1, div_info & 0xffff, args.fwa_divert_rule);

		/* If 'tee', continue with original packet */
		if (clone == NULL) {
			return;
		}
		m = clone;
		ip = mtod(m, struct ip *);
	}
#endif

#if IPSEC
	/*
	 * Enforce IPsec policy checking if we are seeing last header.
	 * Note that we do not visit this with protocols with pcb layer
	 * code - like udp/tcp/raw ip.
	 */
	if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
		if (ipsec4_in_reject(m, NULL)) {
			IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
			goto bad;
		}
	}
#endif /* IPSEC */
	/*
	 * Switch out to protocol's input routine.
	 */
	OSAddAtomic(1, &ipstat.ips_delivered);

#if IPFIREWALL
	if (args.fwa_next_hop && ip->ip_p == IPPROTO_TCP) {
		/* TCP needs IPFORWARD info if available */
		struct m_tag *fwd_tag;
		struct ip_fwd_tag *ipfwd_tag;

		fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
		    KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag),
		    M_NOWAIT, m);
		if (fwd_tag == NULL)
			goto bad;

		ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
		ipfwd_tag->next_hop = args.fwa_next_hop;

		m_tag_prepend(m, fwd_tag);

		KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
		    ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);

		/* TCP deals with its own locking */
		ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
	} else {
		KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
		    ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);

		if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) {
			m = tcp_lro(m, hlen);
			if (m == NULL)
				return;
		}

		ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
	}
#else /* !IPFIREWALL */
	if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) {
		m = tcp_lro(m, hlen);
		if (m == NULL)
			return;
	}
	ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
#endif /* !IPFIREWALL */
	return;

bad:
	KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
	m_freem(m);
}

static void
ipq_updateparams(void)
{
	lck_mtx_assert(&ipqlock, LCK_MTX_ASSERT_OWNED);
	/*
	 * -1 for unlimited allocation.
	 */
	if (maxnipq < 0)
		ipq_limit = 0;
	/*
	 * Positive number for specific bound.
	 */
	if (maxnipq > 0)
		ipq_limit = maxnipq;
	/*
	 * Zero specifies no further fragment queue allocation -- set the
	 * bound very low, but rely on implementation elsewhere to actually
	 * prevent allocation and reclaim current queues.
	 */
	if (maxnipq == 0)
		ipq_limit = 1;
	/*
	 * Arm the purge timer if not already and if there's work to do
	 */
	frag_sched_timeout();
}

static int
sysctl_maxnipq SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, i;

	lck_mtx_lock(&ipqlock);
	i = maxnipq;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error || req->newptr == USER_ADDR_NULL)
		goto done;
	/* impose bounds */
	if (i < -1 || i > (nmbclusters / 4)) {
		error = EINVAL;
		goto done;
	}
	maxnipq = i;
	ipq_updateparams();
done:
	lck_mtx_unlock(&ipqlock);
	return (error);
}

static int
sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, i;

	lck_mtx_lock(&ipqlock);
	i = maxfragsperpacket;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error || req->newptr == USER_ADDR_NULL)
		goto done;
	maxfragsperpacket = i;
	ipq_updateparams();	/* see if we need to arm timer */
done:
	lck_mtx_unlock(&ipqlock);
	return (error);
}
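
/*
 * The reassembly limits above are runtime tunables, e.g. (from user space):
 *
 *	sysctl -w net.inet.ip.maxfragpackets=1024
 *	sysctl -w net.inet.ip.maxfragsperpacket=128
 *
 * For maxfragpackets, 0 disallows new reassembly queues and -1 removes
 * the bound entirely; see ipq_updateparams() above.
 */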
/*
 * Take incoming datagram fragment and try to reassemble it into
 * whole datagram.  If a chain for reassembly of this datagram already
 * exists, then it is given as fp; otherwise have to make a chain.
 *
 * When IPDIVERT enabled, keep additional state with each packet that
 * tells us if we need to divert or tee the packet we're building.
 *
 * The IP header is *NOT* adjusted out of iplen.
 */
static struct mbuf *
#if IPDIVERT
ip_reass(struct mbuf *m,
#ifdef IPDIVERT_44
    u_int32_t *divinfo,
#else /* IPDIVERT_44 */
    u_int16_t *divinfo,
#endif /* IPDIVERT_44 */
    u_int16_t *divcookie)
#else /* IPDIVERT */
ip_reass(struct mbuf *m)
#endif /* IPDIVERT */
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp = NULL;
	struct ipqhead *head;
	int i, hlen, next;
	u_int8_t ecn, ecn0;
	uint32_t csum, csum_flags;
	uint16_t hash;
	struct fq_head dfq;

	MBUFQ_INIT(&dfq);	/* for deferred frees */

	/* If maxnipq or maxfragsperpacket is 0, never accept fragments. */
	if (maxnipq == 0 || maxfragsperpacket == 0) {
		ipstat.ips_fragments++;
		ipstat.ips_fragdropped++;
		m_freem(m);
		if (nipq > 0) {
			lck_mtx_lock(&ipqlock);
			frag_sched_timeout();	/* purge stale fragments */
			lck_mtx_unlock(&ipqlock);
		}
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = IP_VHL_HL(ip->ip_vhl) << 2;

	lck_mtx_lock(&ipqlock);

	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
	head = &ipq[hash];

	/*
	 * Look for queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list) {
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#if CONFIG_MACF_NET
		    mac_ipq_label_compare(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			goto found;
	}

	fp = NULL;

	/*
	 * Attempt to trim the number of allocated fragment queues if it
	 * exceeds the administrative limit.
	 */
	if ((nipq > (unsigned)maxnipq) && (maxnipq > 0)) {
		/*
		 * drop something from the tail of the current queue
		 * before proceeding further
		 */
		struct ipq *fq = TAILQ_LAST(head, ipqhead);
		if (fq == NULL) {	/* gak */
			for (i = 0; i < IPREASS_NHASH; i++) {
				struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
				if (r) {
					ipstat.ips_fragtimeout += r->ipq_nfrags;
					frag_freef(&ipq[i], r);
					break;
				}
			}
		} else {
			ipstat.ips_fragtimeout += fq->ipq_nfrags;
			frag_freef(head, fq);
		}
	}

found:
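	/*
	 * In the block below, csum_rx_start == 0 means the hardware began
	 * summing at the IP header; since a valid header sums to 0xffff,
	 * that span folds in as a no-op and needs no adjustment.  Only
	 * sums starting elsewhere (start != 0 && start != hlen) are
	 * rewritten by m_adj_sum16() to cover the span beginning at hlen.
	 */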
	/*
	 * Leverage partial checksum offload for IP fragments.  Narrow down
	 * the scope to cover only UDP without IP options, as that is the
	 * most common case.
	 *
	 * Perform 1's complement adjustment of octets that got included/
	 * excluded in the hardware-calculated checksum value.  Ignore cases
	 * where the value includes or excludes the IP header span, as the
	 * sum for those octets would already be 0xffff and thus no-op.
	 */
	if (ip->ip_p == IPPROTO_UDP && hlen == sizeof (struct ip) &&
	    (m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t start;

		start = m->m_pkthdr.csum_rx_start;
		csum = m->m_pkthdr.csum_rx_val;

		if (start != 0 && start != hlen) {
#if BYTE_ORDER != BIG_ENDIAN
			if (start < hlen) {
				HTONS(ip->ip_len);
				HTONS(ip->ip_off);
			}
#endif
			/* callee folds in sum */
			csum = m_adj_sum16(m, start, hlen, csum);
#if BYTE_ORDER != BIG_ENDIAN
			if (start < hlen) {
				NTOHS(ip->ip_off);
				NTOHS(ip->ip_len);
			}
#endif
		}
		csum_flags = m->m_pkthdr.csum_flags;
	} else {
		csum = 0;
		csum_flags = 0;
	}

	/* Invalidate checksum */
	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;

	ipstat.ips_fragments++;

	/*
	 * Adjust ip_len to not reflect header,
	 * convert offset of this to bytes.
	 */
	ip->ip_len -= hlen;
	if (ip->ip_off & IP_MF) {
		/*
		 * Make sure that fragments have a data length
		 * that's a non-zero multiple of 8 bytes.
		 */
		if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
			OSAddAtomic(1, &ipstat.ips_toosmall);
			/*
			 * Reassembly queue may have been found if previous
			 * fragments were valid; given that this one is bad,
			 * we need to drop it.  Make sure to set fp to NULL
			 * if not already, since we don't want to decrement
			 * ipq_nfrags as it doesn't include this packet.
			 */
			fp = NULL;
			goto dropfrag;
		}
		m->m_flags |= M_FRAG;
	} else {
		/* Clear the flag in case packet comes from loopback */
		m->m_flags &= ~M_FRAG;
	}
	ip->ip_off <<= 3;

	m->m_pkthdr.pkt_hdr = ip;

	/* Previous ip_reass() started here. */
	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		fp = ipq_alloc(M_DONTWAIT);
		if (fp == NULL)
			goto dropfrag;
#if CONFIG_MACF_NET
		if (mac_ipq_label_init(fp, M_NOWAIT) != 0) {
			ipq_free(fp);
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_label_associate(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		nipq++;
		fp->ipq_nfrags = 1;
		fp->ipq_ttl = IPFRAGTTL;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
		m->m_nextpkt = NULL;
		/*
		 * If the first fragment has valid checksum offload
		 * info, the rest of fragments are eligible as well.
		 */
		if (csum_flags != 0) {
			fp->ipq_csum = csum;
			fp->ipq_csum_flags = csum_flags;
		}
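		/*
		 * Because the Internet checksum is a 1's complement sum,
		 * the per-fragment partial sums accumulated in ipq_csum
		 * can simply be added together and folded once with
		 * ADDCARRY() when the datagram is complete.
		 */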
#if IPDIVERT
		/*
		 * Transfer firewall instructions to the fragment structure.
		 * Only trust info in the fragment at offset 0.
		 */
		if (ip->ip_off == 0) {
#ifdef IPDIVERT_44
			fp->ipq_div_info = *divinfo;
#else
			fp->ipq_divert = *divinfo;
#endif
			fp->ipq_div_cookie = *divcookie;
		}
		*divinfo = 0;
		*divcookie = 0;
#endif /* IPDIVERT */
		m = NULL;	/* nothing to return */
		goto done;
	} else {
		fp->ipq_nfrags++;
#if CONFIG_MACF_NET
		mac_ipq_label_update(m, fp);
#endif
	}

#define	GETIP(m)	((struct ip *)((m)->m_pkthdr.pkt_hdr))

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * Drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (GETIP(q)->ip_off > ip->ip_off)
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick new segment in the proper place.
	 *
	 * If some of the data is dropped from the preceding
	 * segment, then its checksum is invalidated.
	 */
	if (p) {
		i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
		if (i > 0) {
			if (i >= ip->ip_len)
				goto dropfrag;
			m_adj(m, i);
			fp->ipq_csum_flags = 0;
			ip->ip_off += i;
			ip->ip_len -= i;
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
	    q = nq) {
		i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
		if (i < GETIP(q)->ip_len) {
			GETIP(q)->ip_len -= i;
			GETIP(q)->ip_off += i;
			m_adj(q, i);
			fp->ipq_csum_flags = 0;
			break;
		}
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		ipstat.ips_fragdropped++;
		fp->ipq_nfrags--;
		/* defer freeing until after lock is dropped */
		MBUFQ_ENQUEUE(&dfq, q);
	}

	/*
	 * If this fragment contains similar checksum offload info
	 * as that of the existing ones, accumulate checksum.  Otherwise,
	 * invalidate checksum offload info for the entire datagram.
	 */
	if (csum_flags != 0 && csum_flags == fp->ipq_csum_flags)
		fp->ipq_csum += csum;
	else if (fp->ipq_csum_flags != 0)
		fp->ipq_csum_flags = 0;

#if IPDIVERT
	/*
	 * Transfer firewall instructions to the fragment structure.
	 * Only trust info in the fragment at offset 0.
	 */
	if (ip->ip_off == 0) {
#ifdef IPDIVERT_44
		fp->ipq_div_info = *divinfo;
#else
		fp->ipq_divert = *divinfo;
#endif
		fp->ipq_div_cookie = *divcookie;
	}
	*divinfo = 0;
	*divcookie = 0;
#endif /* IPDIVERT */
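
	/*
	 * Note that any overlap trimming above (m_adj() on the incoming
	 * or succeeding fragments) clears ipq_csum_flags, since octets
	 * already folded into a stored partial sum can no longer be
	 * accounted for; the transport layer then verifies the checksum
	 * in software.
	 */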
	/*
	 * Check for complete reassembly and perform frag per packet
	 * limiting.
	 *
	 * Frag limiting is performed here so that the nth frag has
	 * a chance to complete the packet before we drop the packet.
	 * As a result, n+1 frags are actually allowed per packet, but
	 * only n will ever be stored. (n = maxfragsperpacket.)
	 *
	 */
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (GETIP(q)->ip_off != next) {
			if (fp->ipq_nfrags > maxfragsperpacket) {
				ipstat.ips_fragdropped += fp->ipq_nfrags;
				frag_freef(head, fp);
			}
			m = NULL;	/* nothing to return */
			goto done;
		}
		next += GETIP(q)->ip_len;
	}
	/* Make sure the last packet didn't have the IP_MF flag */
	if (p->m_flags & M_FRAG) {
		if (fp->ipq_nfrags > maxfragsperpacket) {
			ipstat.ips_fragdropped += fp->ipq_nfrags;
			frag_freef(head, fp);
		}
		m = NULL;	/* nothing to return */
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
		ipstat.ips_toolong++;
		ipstat.ips_fragdropped += fp->ipq_nfrags;
		frag_freef(head, fp);
		m = NULL;	/* nothing to return */
		goto done;
	}

	/*
	 * Concatenate fragments.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m_cat(m, q);
	}

	/*
	 * Store partial hardware checksum info from the fragment queue;
	 * the receive start offset is set to 20 bytes (see code at the
	 * top of this routine.)
	 */
	if (fp->ipq_csum_flags != 0) {
		csum = fp->ipq_csum;

		ADDCARRY(csum);

		m->m_pkthdr.csum_rx_val = csum;
		m->m_pkthdr.csum_rx_start = sizeof (struct ip);
		m->m_pkthdr.csum_flags = fp->ipq_csum_flags;
	} else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
	    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
		/* loopback checksums are always OK */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
		m->m_pkthdr.csum_flags =
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID;
	}

#if IPDIVERT
	/*
	 * Extract firewall instructions from the fragment structure.
	 */
#ifdef IPDIVERT_44
	*divinfo = fp->ipq_div_info;
#else
	*divinfo = fp->ipq_divert;
#endif
	*divcookie = fp->ipq_div_cookie;
#endif /* IPDIVERT */

#if CONFIG_MACF_NET
	mac_mbuf_label_associate_ipq(fp, m);
	mac_ipq_label_destroy(fp);
#endif
	/*
	 * Create header for new ip packet by modifying header of first
	 * packet; dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = (IP_VHL_HL(ip->ip_vhl) << 2) + next;
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;

	fp->ipq_frags = NULL;	/* return to caller as 'm' */
	frag_freef(head, fp);
	fp = NULL;

	m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
	m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR)	/* XXX this should be done elsewhere */
		m_fixhdr(m);
	ipstat.ips_reassembled++;

	/* arm the purge timer if not already and if there's work to do */
	frag_sched_timeout();
	lck_mtx_unlock(&ipqlock);
	/* perform deferred free (if needed) now that lock is dropped */
	if (!MBUFQ_EMPTY(&dfq))
		MBUFQ_DRAIN(&dfq);
	VERIFY(MBUFQ_EMPTY(&dfq));
	return (m);

done:
	VERIFY(m == NULL);
	/* arm the purge timer if not already and if there's work to do */
	frag_sched_timeout();
	lck_mtx_unlock(&ipqlock);
	/* perform deferred free (if needed) */
	if (!MBUFQ_EMPTY(&dfq))
		MBUFQ_DRAIN(&dfq);
	VERIFY(MBUFQ_EMPTY(&dfq));
	return (NULL);

dropfrag:
#if IPDIVERT
	*divinfo = 0;
	*divcookie = 0;
#endif /* IPDIVERT */
	ipstat.ips_fragdropped++;
	if (fp != NULL)
		fp->ipq_nfrags--;
	/* arm the purge timer if not already and if there's work to do */
	frag_sched_timeout();
	lck_mtx_unlock(&ipqlock);
	m_freem(m);
	/* perform deferred free (if needed) */
	if (!MBUFQ_EMPTY(&dfq))
		MBUFQ_DRAIN(&dfq);
	VERIFY(MBUFQ_EMPTY(&dfq));
	return (NULL);
#undef GETIP
}

/*
 * Free a fragment reassembly header and all
 * associated datagrams.
 */
static void
frag_freef(struct ipqhead *fhp, struct ipq *fp)
{
	lck_mtx_assert(&ipqlock, LCK_MTX_ASSERT_OWNED);

	fp->ipq_nfrags = 0;
	if (fp->ipq_frags != NULL) {
		m_freem_list(fp->ipq_frags);
		fp->ipq_frags = NULL;
	}
	TAILQ_REMOVE(fhp, fp, ipq_list);
	nipq--;
	ipq_free(fp);
}

/*
 * IP reassembly timer processing
 */
static void
frag_timeout(void *arg)
{
#pragma unused(arg)
	struct ipq *fp;
	int i;

	/*
	 * Update coarse-grained networking timestamp (in sec.); the idea
	 * is to piggy-back on the timeout callout to update the counter
	 * returnable via net_uptime().
	 */
	net_update_uptime();

	lck_mtx_lock(&ipqlock);
	for (i = 0; i < IPREASS_NHASH; i++) {
		for (fp = TAILQ_FIRST(&ipq[i]); fp; ) {
			struct ipq *fpp;

			fpp = fp;
			fp = TAILQ_NEXT(fp, ipq_list);
			if (--fpp->ipq_ttl == 0) {
				ipstat.ips_fragtimeout += fpp->ipq_nfrags;
				frag_freef(&ipq[i], fpp);
			}
		}
	}
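	/*
	 * Each pass of this timer runs roughly once per second (the
	 * callout is armed with a delay of hz ticks), so ipq_ttl is
	 * effectively the remaining reassembly lifetime in seconds.
	 */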
1932 */ 1933 if (maxnipq >= 0 && nipq > (unsigned)maxnipq) { 1934 for (i = 0; i < IPREASS_NHASH; i++) { 1935 while (nipq > (unsigned)maxnipq && 1936 !TAILQ_EMPTY(&ipq[i])) { 1937 ipstat.ips_fragdropped += 1938 TAILQ_FIRST(&ipq[i])->ipq_nfrags; 1939 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); 1940 } 1941 } 1942 } 1943 /* re-arm the purge timer if there's work to do */ 1944 frag_timeout_run = 0; 1945 frag_sched_timeout(); 1946 lck_mtx_unlock(&ipqlock); 1947} 1948 1949static void 1950frag_sched_timeout(void) 1951{ 1952 lck_mtx_assert(&ipqlock, LCK_MTX_ASSERT_OWNED); 1953 1954 if (!frag_timeout_run && nipq > 0) { 1955 frag_timeout_run = 1; 1956 timeout(frag_timeout, NULL, hz); 1957 } 1958} 1959 1960/* 1961 * Drain off all datagram fragments. 1962 */ 1963static void 1964frag_drain(void) 1965{ 1966 int i; 1967 1968 lck_mtx_lock(&ipqlock); 1969 for (i = 0; i < IPREASS_NHASH; i++) { 1970 while (!TAILQ_EMPTY(&ipq[i])) { 1971 ipstat.ips_fragdropped += 1972 TAILQ_FIRST(&ipq[i])->ipq_nfrags; 1973 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); 1974 } 1975 } 1976 lck_mtx_unlock(&ipqlock); 1977} 1978 1979static struct ipq * 1980ipq_alloc(int how) 1981{ 1982 struct mbuf *t; 1983 struct ipq *fp; 1984 1985 /* 1986 * See comments in ipq_updateparams(). Keep the count separate 1987 * from nipq since the latter represents the elements already 1988 * in the reassembly queues. 1989 */ 1990 if (ipq_limit > 0 && ipq_count > ipq_limit) 1991 return (NULL); 1992 1993 t = m_get(how, MT_FTABLE); 1994 if (t != NULL) { 1995 atomic_add_32(&ipq_count, 1); 1996 fp = mtod(t, struct ipq *); 1997 bzero(fp, sizeof (*fp)); 1998 } else { 1999 fp = NULL; 2000 } 2001 return (fp); 2002} 2003 2004static void 2005ipq_free(struct ipq *fp) 2006{ 2007 (void) m_free(dtom(fp)); 2008 atomic_add_32(&ipq_count, -1); 2009} 2010 2011/* 2012 * Drain callback 2013 */ 2014void 2015ip_drain(void) 2016{ 2017 frag_drain(); /* fragments */ 2018 in_rtqdrain(); /* protocol cloned routes */ 2019 in_arpdrain(NULL); /* cloned routes: ARP */ 2020} 2021 2022/* 2023 * Do option processing on a datagram, 2024 * possibly discarding it if bad options are encountered, 2025 * or forwarding it if source-routed. 2026 * The pass argument is used when operating in the IPSTEALTH 2027 * mode to tell what options to process: 2028 * [LS]SRR (pass 0) or the others (pass 1). 2029 * The reason for as many as two passes is that when doing IPSTEALTH, 2030 * non-routing options should be processed only if the packet is for us. 2031 * Returns 1 if packet has been forwarded/freed, 2032 * 0 if the packet should be processed further. 
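 *
 * Illustrative caller pattern (a sketch; the surrounding names are
 * assumed, not taken from this routine):
 *
 *	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL))
 *		return;		(the mbuf was forwarded or freed)
 *	(otherwise continue with local input processing)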
2033 */ 2034static int 2035ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop) 2036{ 2037#pragma unused(pass) 2038 struct ip *ip = mtod(m, struct ip *); 2039 u_char *cp; 2040 struct ip_timestamp *ipt; 2041 struct in_ifaddr *ia; 2042 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; 2043 struct in_addr *sin, dst; 2044 n_time ntime; 2045 struct sockaddr_in ipaddr = { 2046 sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } }; 2047 2048 /* Expect 32-bit aligned data pointer on strict-align platforms */ 2049 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); 2050 2051 dst = ip->ip_dst; 2052 cp = (u_char *)(ip + 1); 2053 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); 2054 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2055 opt = cp[IPOPT_OPTVAL]; 2056 if (opt == IPOPT_EOL) 2057 break; 2058 if (opt == IPOPT_NOP) 2059 optlen = 1; 2060 else { 2061 if (cnt < IPOPT_OLEN + sizeof (*cp)) { 2062 code = &cp[IPOPT_OLEN] - (u_char *)ip; 2063 goto bad; 2064 } 2065 optlen = cp[IPOPT_OLEN]; 2066 if (optlen < IPOPT_OLEN + sizeof (*cp) || 2067 optlen > cnt) { 2068 code = &cp[IPOPT_OLEN] - (u_char *)ip; 2069 goto bad; 2070 } 2071 } 2072 switch (opt) { 2073 2074 default: 2075 break; 2076 2077 /* 2078 * Source routing with record. 2079 * Find interface with current destination address. 2080 * If none on this machine then drop if strictly routed, 2081 * or do nothing if loosely routed. 2082 * Record interface address and bring up next address 2083 * component. If strictly routed make sure next 2084 * address is on directly accessible net. 2085 */ 2086 case IPOPT_LSRR: 2087 case IPOPT_SSRR: 2088 if (optlen < IPOPT_OFFSET + sizeof (*cp)) { 2089 code = &cp[IPOPT_OLEN] - (u_char *)ip; 2090 goto bad; 2091 } 2092 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 2093 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 2094 goto bad; 2095 } 2096 ipaddr.sin_addr = ip->ip_dst; 2097 ia = (struct in_ifaddr *)ifa_ifwithaddr(SA(&ipaddr)); 2098 if (ia == NULL) { 2099 if (opt == IPOPT_SSRR) { 2100 type = ICMP_UNREACH; 2101 code = ICMP_UNREACH_SRCFAIL; 2102 goto bad; 2103 } 2104 if (!ip_dosourceroute) 2105 goto nosourcerouting; 2106 /* 2107 * Loose routing, and not at next destination 2108 * yet; nothing to do except forward. 2109 */ 2110 break; 2111 } else { 2112 IFA_REMREF(&ia->ia_ifa); 2113 ia = NULL; 2114 } 2115 off--; /* 0 origin */ 2116 if (off > optlen - (int)sizeof (struct in_addr)) { 2117 /* 2118 * End of source route. Should be for us. 2119 */ 2120 if (!ip_acceptsourceroute) 2121 goto nosourcerouting; 2122 save_rte(cp, ip->ip_src); 2123 break; 2124 } 2125 2126 if (!ip_dosourceroute) { 2127 if (ipforwarding) { 2128 char buf[MAX_IPv4_STR_LEN]; 2129 char buf2[MAX_IPv4_STR_LEN]; 2130 /* 2131 * Acting as a router, so generate ICMP 2132 */ 2133nosourcerouting: 2134 log(LOG_WARNING, 2135 "attempted source route from %s " 2136 "to %s\n", 2137 inet_ntop(AF_INET, &ip->ip_src, 2138 buf, sizeof (buf)), 2139 inet_ntop(AF_INET, &ip->ip_dst, 2140 buf2, sizeof (buf2))); 2141 type = ICMP_UNREACH; 2142 code = ICMP_UNREACH_SRCFAIL; 2143 goto bad; 2144 } else { 2145 /* 2146 * Not acting as a router, 2147 * so silently drop. 
2148 */ 2149 OSAddAtomic(1, &ipstat.ips_cantforward); 2150 m_freem(m); 2151 return (1); 2152 } 2153 } 2154 2155 /* 2156 * locate outgoing interface 2157 */ 2158 (void) memcpy(&ipaddr.sin_addr, cp + off, 2159 sizeof (ipaddr.sin_addr)); 2160 2161 if (opt == IPOPT_SSRR) { 2162#define INA struct in_ifaddr * 2163 if ((ia = (INA)ifa_ifwithdstaddr( 2164 SA(&ipaddr))) == NULL) { 2165 ia = (INA)ifa_ifwithnet(SA(&ipaddr)); 2166 } 2167 } else { 2168 ia = ip_rtaddr(ipaddr.sin_addr); 2169 } 2170 if (ia == NULL) { 2171 type = ICMP_UNREACH; 2172 code = ICMP_UNREACH_SRCFAIL; 2173 goto bad; 2174 } 2175 ip->ip_dst = ipaddr.sin_addr; 2176 IFA_LOCK(&ia->ia_ifa); 2177 (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr), 2178 sizeof (struct in_addr)); 2179 IFA_UNLOCK(&ia->ia_ifa); 2180 IFA_REMREF(&ia->ia_ifa); 2181 ia = NULL; 2182 cp[IPOPT_OFFSET] += sizeof (struct in_addr); 2183 /* 2184 * Let ip_intr's mcast routing check handle mcast pkts 2185 */ 2186 forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); 2187 break; 2188 2189 case IPOPT_RR: 2190 if (optlen < IPOPT_OFFSET + sizeof (*cp)) { 2191 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 2192 goto bad; 2193 } 2194 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 2195 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 2196 goto bad; 2197 } 2198 /* 2199 * If no space remains, ignore. 2200 */ 2201 off--; /* 0 origin */ 2202 if (off > optlen - (int)sizeof (struct in_addr)) 2203 break; 2204 (void) memcpy(&ipaddr.sin_addr, &ip->ip_dst, 2205 sizeof (ipaddr.sin_addr)); 2206 /* 2207 * locate outgoing interface; if we're the destination, 2208 * use the incoming interface (should be same). 2209 */ 2210 if ((ia = (INA)ifa_ifwithaddr(SA(&ipaddr))) == NULL) { 2211 if ((ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) { 2212 type = ICMP_UNREACH; 2213 code = ICMP_UNREACH_HOST; 2214 goto bad; 2215 } 2216 } 2217 IFA_LOCK(&ia->ia_ifa); 2218 (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr), 2219 sizeof (struct in_addr)); 2220 IFA_UNLOCK(&ia->ia_ifa); 2221 IFA_REMREF(&ia->ia_ifa); 2222 ia = NULL; 2223 cp[IPOPT_OFFSET] += sizeof (struct in_addr); 2224 break; 2225 2226 case IPOPT_TS: 2227 code = cp - (u_char *)ip; 2228 ipt = (struct ip_timestamp *)(void *)cp; 2229 if (ipt->ipt_len < 4 || ipt->ipt_len > 40) { 2230 code = (u_char *)&ipt->ipt_len - (u_char *)ip; 2231 goto bad; 2232 } 2233 if (ipt->ipt_ptr < 5) { 2234 code = (u_char *)&ipt->ipt_ptr - (u_char *)ip; 2235 goto bad; 2236 } 2237 if (ipt->ipt_ptr > 2238 ipt->ipt_len - (int)sizeof (int32_t)) { 2239 if (++ipt->ipt_oflw == 0) { 2240 code = (u_char *)&ipt->ipt_ptr - 2241 (u_char *)ip; 2242 goto bad; 2243 } 2244 break; 2245 } 2246 sin = (struct in_addr *)(void *)(cp + ipt->ipt_ptr - 1); 2247 switch (ipt->ipt_flg) { 2248 2249 case IPOPT_TS_TSONLY: 2250 break; 2251 2252 case IPOPT_TS_TSANDADDR: 2253 if (ipt->ipt_ptr - 1 + sizeof (n_time) + 2254 sizeof (struct in_addr) > ipt->ipt_len) { 2255 code = (u_char *)&ipt->ipt_ptr - 2256 (u_char *)ip; 2257 goto bad; 2258 } 2259 ipaddr.sin_addr = dst; 2260 ia = (INA)ifaof_ifpforaddr(SA(&ipaddr), 2261 m->m_pkthdr.rcvif); 2262 if (ia == NULL) 2263 continue; 2264 IFA_LOCK(&ia->ia_ifa); 2265 (void) memcpy(sin, &IA_SIN(ia)->sin_addr, 2266 sizeof (struct in_addr)); 2267 IFA_UNLOCK(&ia->ia_ifa); 2268 ipt->ipt_ptr += sizeof (struct in_addr); 2269 IFA_REMREF(&ia->ia_ifa); 2270 ia = NULL; 2271 break; 2272 2273 case IPOPT_TS_PRESPEC: 2274 if (ipt->ipt_ptr - 1 + sizeof (n_time) + 2275 sizeof (struct in_addr) > ipt->ipt_len) { 2276 code = (u_char *)&ipt->ipt_ptr - 2277 (u_char *)ip; 2278 goto bad; 2279 } 2280 (void) 
memcpy(&ipaddr.sin_addr, sin,
				    sizeof (struct in_addr));
				if ((ia = (struct in_ifaddr *)ifa_ifwithaddr(
				    SA(&ipaddr))) == NULL)
					continue;
				IFA_REMREF(&ia->ia_ifa);
				ia = NULL;
				ipt->ipt_ptr += sizeof (struct in_addr);
				break;

			default:
				/* XXX can't take &ipt->ipt_flg */
				code = (u_char *)&ipt->ipt_ptr -
				    (u_char *)ip + 1;
				goto bad;
			}
			ntime = iptime();
			(void) memcpy(cp + ipt->ipt_ptr - 1, &ntime,
			    sizeof (n_time));
			ipt->ipt_ptr += sizeof (n_time);
		}
	}
	if (forward && ipforwarding) {
		ip_forward(m, 1, next_hop);
		return (1);
	}
	return (0);
bad:
	/* XXX icmp_error adds in hdr length */
	ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2;
	icmp_error(m, type, code, 0, 0);
	OSAddAtomic(1, &ipstat.ips_badoptions);
	return (1);
}

/*
 * Check for the presence of the IP Router Alert option [RFC2113]
 * in the header of an IPv4 datagram.
 *
 * This call is not intended for use from the forwarding path; it is here
 * so that protocol domains may check for the presence of the option.
 * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
 * option does not have much relevance to the implementation, though this
 * may change in future.
 * Router alert options SHOULD be passed if running in IPSTEALTH mode and
 * we are not the endpoint.
 * Length checks on individual options should already have been performed
 * by ip_dooptions(); therefore they are folded under DIAGNOSTIC here.
 *
 * Return zero if not present or options are invalid, non-zero if present.
 */
int
ip_checkrouteralert(struct mbuf *m)
{
	struct ip *ip = mtod(m, struct ip *);
	u_char *cp;
	int opt, optlen, cnt, found_ra;

	found_ra = 0;
	cp = (u_char *)(ip + 1);
	cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL)
			break;
		if (opt == IPOPT_NOP)
			optlen = 1;
		else {
#ifdef DIAGNOSTIC
			if (cnt < IPOPT_OLEN + sizeof (*cp))
				break;
#endif
			optlen = cp[IPOPT_OLEN];
#ifdef DIAGNOSTIC
			if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt)
				break;
#endif
		}
		switch (opt) {
		case IPOPT_RA:
#ifdef DIAGNOSTIC
			if (optlen != IPOPT_OFFSET + sizeof (uint16_t) ||
			    (*((uint16_t *)(void *)&cp[IPOPT_OFFSET]) != 0))
				break;
			else
#endif
			found_ra = 1;
			break;
		default:
			break;
		}
	}

	return (found_ra);
}

/*
 * Given address of next destination (final or next hop),
 * return internet address info of interface to be used to get there.
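 *
 * The ifaddr returned on success carries a reference (IFA_ADDREF
 * below); callers must drop it with IFA_REMREF when done, e.g.
 * (a sketch mirroring the callers in this file):
 *
 *	struct in_ifaddr *ia = ip_rtaddr(ipaddr.sin_addr);
 *	if (ia != NULL) {
 *		(use ia)
 *		IFA_REMREF(&ia->ia_ifa);
 *	}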
 */
struct in_ifaddr *
ip_rtaddr(struct in_addr dst)
{
	struct sockaddr_in *sin;
	struct ifaddr *rt_ifa;
	struct route ro;

	bzero(&ro, sizeof (ro));
	sin = SIN(&ro.ro_dst);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof (*sin);
	sin->sin_addr = dst;

	rtalloc_ign(&ro, RTF_PRCLONING);
	if (ro.ro_rt == NULL) {
		ROUTE_RELEASE(&ro);
		return (NULL);
	}

	RT_LOCK(ro.ro_rt);
	if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL)
		IFA_ADDREF(rt_ifa);
	RT_UNLOCK(ro.ro_rt);
	ROUTE_RELEASE(&ro);

	return ((struct in_ifaddr *)rt_ifa);
}

/*
 * Save incoming source route for use in replies,
 * to be picked up later by ip_srcroute if the receiver is interested.
 */
void
save_rte(u_char *option, struct in_addr dst)
{
	unsigned olen;

	olen = option[IPOPT_OLEN];
#if DIAGNOSTIC
	if (ipprintfs)
		printf("save_rte: olen %d\n", olen);
#endif
	if (olen > sizeof (ip_srcrt) - (1 + sizeof (dst)))
		return;
	bcopy(option, ip_srcrt.srcopt, olen);
	ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof (struct in_addr);
	ip_srcrt.dst = dst;
}

/*
 * Retrieve incoming source route for use in replies,
 * in the same form used by setsockopt.
 * The first hop is placed before the options and will be removed later.
 */
struct mbuf *
ip_srcroute(void)
{
	struct in_addr *p, *q;
	struct mbuf *m;

	if (ip_nhops == 0)
		return (NULL);

	m = m_get(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (NULL);

#define	OPTSIZ	(sizeof (ip_srcrt.nop) + sizeof (ip_srcrt.srcopt))

	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
	m->m_len = ip_nhops * sizeof (struct in_addr) +
	    sizeof (struct in_addr) + OPTSIZ;
#if DIAGNOSTIC
	if (ipprintfs)
		printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
#endif

	/*
	 * First save first hop for return route
	 */
	p = &ip_srcrt.route[ip_nhops - 1];
	*(mtod(m, struct in_addr *)) = *p--;
#if DIAGNOSTIC
	if (ipprintfs)
		printf(" hops %lx",
		    (u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr));
#endif

	/*
	 * Copy option fields and padding (nop) to mbuf.
	 */
	ip_srcrt.nop = IPOPT_NOP;
	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
	(void) memcpy(mtod(m, caddr_t) + sizeof (struct in_addr),
	    &ip_srcrt.nop, OPTSIZ);
	q = (struct in_addr *)(void *)(mtod(m, caddr_t) +
	    sizeof (struct in_addr) + OPTSIZ);
#undef OPTSIZ
	/*
	 * Record return path as an IP source route,
	 * reversing the path (pointers are now aligned).
	 */
	while (p >= ip_srcrt.route) {
#if DIAGNOSTIC
		if (ipprintfs)
			printf(" %lx", (u_int32_t)ntohl(q->s_addr));
#endif
		*q++ = *p--;
	}
	/*
	 * Last hop goes to final destination.
	 */
	*q = ip_srcrt.dst;
#if DIAGNOSTIC
	if (ipprintfs)
		printf(" %lx\n", (u_int32_t)ntohl(q->s_addr));
#endif
	return (m);
}

/*
 * Strip out IP options, called at a higher protocol level in the kernel.
 * The options are removed from the packet in place; the second argument
 * is historical, nothing is copied to it, and nothing is returned.
 * XXX should be deleted; last arg currently ignored.
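 *
 * Illustrative arithmetic: a 60-byte header (IP_VHL_HL(ip->ip_vhl)
 * == 15) gives olen = 40; the payload is slid down over those 40
 * option bytes and the header is rewritten as a plain 20-byte header
 * (version 4, header length 5 words).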
2506 */ 2507void 2508ip_stripoptions(struct mbuf *m, struct mbuf *mopt) 2509{ 2510#pragma unused(mopt) 2511 int i; 2512 struct ip *ip = mtod(m, struct ip *); 2513 caddr_t opts; 2514 int olen; 2515 2516 /* Expect 32-bit aligned data pointer on strict-align platforms */ 2517 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); 2518 2519 olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); 2520 opts = (caddr_t)(ip + 1); 2521 i = m->m_len - (sizeof (struct ip) + olen); 2522 bcopy(opts + olen, opts, (unsigned)i); 2523 m->m_len -= olen; 2524 if (m->m_flags & M_PKTHDR) 2525 m->m_pkthdr.len -= olen; 2526 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof (struct ip) >> 2); 2527} 2528 2529u_char inetctlerrmap[PRC_NCMDS] = { 2530 0, 0, 0, 0, 2531 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 2532 ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 2533 EMSGSIZE, EHOSTUNREACH, 0, 0, 2534 0, 0, 0, 0, 2535 ENOPROTOOPT, ECONNREFUSED 2536}; 2537 2538static int 2539sysctl_ipforwarding SYSCTL_HANDLER_ARGS 2540{ 2541#pragma unused(arg1, arg2) 2542 int i, was_ipforwarding = ipforwarding; 2543 2544 i = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 2545 if (i != 0 || req->newptr == USER_ADDR_NULL) 2546 return (i); 2547 2548 if (was_ipforwarding && !ipforwarding) { 2549 /* clean up IPv4 forwarding cached routes */ 2550 ifnet_head_lock_shared(); 2551 for (i = 0; i <= if_index; i++) { 2552 struct ifnet *ifp = ifindex2ifnet[i]; 2553 if (ifp != NULL) { 2554 lck_mtx_lock(&ifp->if_cached_route_lock); 2555 ROUTE_RELEASE(&ifp->if_fwd_route); 2556 bzero(&ifp->if_fwd_route, 2557 sizeof (ifp->if_fwd_route)); 2558 lck_mtx_unlock(&ifp->if_cached_route_lock); 2559 } 2560 } 2561 ifnet_head_done(); 2562 } 2563 2564 return (0); 2565} 2566 2567/* 2568 * Similar to inp_route_{copyout,copyin} routines except that these copy 2569 * out the cached IPv4 forwarding route from struct ifnet instead of the 2570 * inpcb. See comments for those routines for explanations. 2571 */ 2572static void 2573ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst) 2574{ 2575 struct route *src = &ifp->if_fwd_route; 2576 2577 lck_mtx_lock_spin(&ifp->if_cached_route_lock); 2578 lck_mtx_convert_spin(&ifp->if_cached_route_lock); 2579 2580 /* Minor sanity check */ 2581 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) 2582 panic("%s: wrong or corrupted route: %p", __func__, src); 2583 2584 route_copyout(dst, src, sizeof (*dst)); 2585 2586 lck_mtx_unlock(&ifp->if_cached_route_lock); 2587} 2588 2589static void 2590ip_fwd_route_copyin(struct ifnet *ifp, struct route *src) 2591{ 2592 struct route *dst = &ifp->if_fwd_route; 2593 2594 lck_mtx_lock_spin(&ifp->if_cached_route_lock); 2595 lck_mtx_convert_spin(&ifp->if_cached_route_lock); 2596 2597 /* Minor sanity check */ 2598 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) 2599 panic("%s: wrong or corrupted route: %p", __func__, src); 2600 2601 if (ifp->if_fwd_cacheok) 2602 route_copyin(src, dst, sizeof (*src)); 2603 2604 lck_mtx_unlock(&ifp->if_cached_route_lock); 2605} 2606 2607/* 2608 * Forward a packet. If some error occurs return the sender 2609 * an icmp packet. Note we can't always generate a meaningful 2610 * icmp message because icmp doesn't have a large enough repertoire 2611 * of codes and types. 2612 * 2613 * If not forwarding, just drop the packet. This could be confusing 2614 * if ipforwarding was zero but some routing protocol was advancing 2615 * us as a gateway to somewhere. However, we must let the routing 2616 * protocol deal with that. 
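 *
 * TTL handling, summarizing the code below (IPSTEALTH builds may skip
 * it): a packet arriving with ip_ttl <= IPTTLDEC is answered with an
 * ICMP time-exceeded error instead of being forwarded; otherwise the
 * TTL is decremented by IPTTLDEC before the packet is handed to
 * ip_output().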
2617 * 2618 * The srcrt parameter indicates whether the packet is being forwarded 2619 * via a source route. 2620 */ 2621static void 2622ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) 2623{ 2624#if !IPFIREWALL 2625#pragma unused(next_hop) 2626#endif 2627 struct ip *ip = mtod(m, struct ip *); 2628 struct sockaddr_in *sin; 2629 struct rtentry *rt; 2630 struct route fwd_rt; 2631 int error, type = 0, code = 0; 2632 struct mbuf *mcopy; 2633 n_long dest; 2634 struct in_addr pkt_dst; 2635 u_int32_t nextmtu = 0, len; 2636 struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0 }; 2637 struct ifnet *rcvifp = m->m_pkthdr.rcvif; 2638#if IPSEC 2639 struct secpolicy *sp = NULL; 2640 int ipsecerror; 2641#endif /* IPSEC */ 2642#if PF 2643 struct pf_mtag *pf_mtag; 2644#endif /* PF */ 2645 2646 dest = 0; 2647#if IPFIREWALL 2648 /* 2649 * Cache the destination address of the packet; this may be 2650 * changed by use of 'ipfw fwd'. 2651 */ 2652 pkt_dst = ((next_hop != NULL) ? next_hop->sin_addr : ip->ip_dst); 2653#else /* !IPFIREWALL */ 2654 pkt_dst = ip->ip_dst; 2655#endif /* !IPFIREWALL */ 2656 2657#if DIAGNOSTIC 2658 if (ipprintfs) 2659 printf("forward: src %lx dst %lx ttl %x\n", 2660 (u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr, 2661 ip->ip_ttl); 2662#endif 2663 2664 if (m->m_flags & (M_BCAST|M_MCAST) || !in_canforward(pkt_dst)) { 2665 OSAddAtomic(1, &ipstat.ips_cantforward); 2666 m_freem(m); 2667 return; 2668 } 2669#if IPSTEALTH 2670 if (!ipstealth) { 2671#endif /* IPSTEALTH */ 2672 if (ip->ip_ttl <= IPTTLDEC) { 2673 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 2674 dest, 0); 2675 return; 2676 } 2677#if IPSTEALTH 2678 } 2679#endif /* IPSTEALTH */ 2680 2681#if PF 2682 pf_mtag = pf_find_mtag(m); 2683 if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) { 2684 ipoa.ipoa_boundif = pf_mtag->pftag_rtableid; 2685 ipoa.ipoa_flags |= IPOAF_BOUND_IF; 2686 } 2687#endif /* PF */ 2688 2689 ip_fwd_route_copyout(rcvifp, &fwd_rt); 2690 2691 sin = SIN(&fwd_rt.ro_dst); 2692 if (ROUTE_UNUSABLE(&fwd_rt) || pkt_dst.s_addr != sin->sin_addr.s_addr) { 2693 ROUTE_RELEASE(&fwd_rt); 2694 2695 sin->sin_family = AF_INET; 2696 sin->sin_len = sizeof (*sin); 2697 sin->sin_addr = pkt_dst; 2698 2699 rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif); 2700 if (fwd_rt.ro_rt == NULL) { 2701 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); 2702 goto done; 2703 } 2704 } 2705 rt = fwd_rt.ro_rt; 2706 2707 /* 2708 * Save the IP header and at most 8 bytes of the payload, 2709 * in case we need to generate an ICMP message to the src. 2710 * 2711 * We don't use m_copy() because it might return a reference 2712 * to a shared cluster. Both this function and ip_output() 2713 * assume exclusive access to the IP header in `m', so any 2714 * data in a cluster may change before we reach icmp_error(). 2715 */ 2716 MGET(mcopy, M_DONTWAIT, m->m_type); 2717 if (mcopy != NULL) { 2718 M_COPY_PKTHDR(mcopy, m); 2719 mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8, 2720 (int)ip->ip_len); 2721 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); 2722 } 2723 2724#if IPSTEALTH 2725 if (!ipstealth) { 2726#endif /* IPSTEALTH */ 2727 ip->ip_ttl -= IPTTLDEC; 2728#if IPSTEALTH 2729 } 2730#endif /* IPSTEALTH */ 2731 2732 /* 2733 * If forwarding packet using same interface that it came in on, 2734 * perhaps should send a redirect to sender to shortcut a hop. 2735 * Only send redirect if source is sending directly to us, 2736 * and if packet was not source routed (or has any options). 
	 * Also, don't send redirect if forwarding using a default route
	 * or a route modified by a redirect.
	 */
	RT_LOCK_SPIN(rt);
	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
	    !(rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) &&
	    satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
	    ipsendredirects && !srcrt && rt->rt_ifa != NULL) {
		struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa;
		u_int32_t src = ntohl(ip->ip_src.s_addr);

		/* Become a regular mutex */
		RT_CONVERT_LOCK(rt);
		IFA_LOCK_SPIN(&ia->ia_ifa);
		if ((src & ia->ia_subnetmask) == ia->ia_subnet) {
			if (rt->rt_flags & RTF_GATEWAY)
				dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
			else
				dest = pkt_dst.s_addr;
			/*
			 * Router requirements say to only send
			 * host redirects.
			 */
			type = ICMP_REDIRECT;
			code = ICMP_REDIRECT_HOST;
#if DIAGNOSTIC
			if (ipprintfs)
				printf("redirect (%d) to %lx\n", code,
				    (u_int32_t)dest);
#endif
		}
		IFA_UNLOCK(&ia->ia_ifa);
	}
	RT_UNLOCK(rt);

#if IPFIREWALL
	if (next_hop != NULL) {
		/* Pass IPFORWARD info if available */
		struct m_tag *tag;
		struct ip_fwd_tag *ipfwd_tag;

		tag = m_tag_create(KERNEL_MODULE_TAG_ID,
		    KERNEL_TAG_TYPE_IPFORWARD,
		    sizeof (*ipfwd_tag), M_NOWAIT, m);
		if (tag == NULL) {
			error = ENOBUFS;
			m_freem(m);
			goto done;
		}

		ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
		ipfwd_tag->next_hop = next_hop;

		m_tag_prepend(m, tag);
	}
#endif /* IPFIREWALL */

	/* Mark this packet as being forwarded from another interface */
	m->m_pkthdr.pkt_flags |= PKTF_FORWARDED;
	len = m_pktlen(m);

	error = ip_output(m, NULL, &fwd_rt, IP_FORWARDING | IP_OUTARGS,
	    NULL, &ipoa);

	/* Refresh rt since the route could have changed while in IP */
	rt = fwd_rt.ro_rt;

	if (error != 0) {
		OSAddAtomic(1, &ipstat.ips_cantforward);
	} else {
		/*
		 * Increment stats on the source interface; the ones
		 * for the destination interface have been taken care
		 * of during output above by virtue of PKTF_FORWARDED.
		 */
		rcvifp->if_fpackets++;
		rcvifp->if_fbytes += len;

		OSAddAtomic(1, &ipstat.ips_forward);
		if (type != 0) {
			OSAddAtomic(1, &ipstat.ips_redirectsent);
		} else {
			if (mcopy != NULL) {
				/*
				 * If we didn't have to go through ipflow
				 * and the packet was successfully consumed
				 * by ip_output, the mcopy is rather a
				 * waste; this could be further optimized.
				 */
				m_freem(mcopy);
			}
			goto done;
		}
	}
	if (mcopy == NULL)
		goto done;

	switch (error) {
	case 0:				/* forwarded, but need redirect */
		/* type, code set above */
		break;

	case ENETUNREACH:		/* shouldn't happen, checked above */
	case EHOSTUNREACH:
	case ENETDOWN:
	case EHOSTDOWN:
	default:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_HOST;
		break;

	case EMSGSIZE:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_NEEDFRAG;

		if (rt == NULL) {
			break;
		} else {
			RT_LOCK_SPIN(rt);
			if (rt->rt_ifp != NULL)
				nextmtu = rt->rt_ifp->if_mtu;
			RT_UNLOCK(rt);
		}
#if IPSEC
		if (ipsec_bypass)
			break;

		/*
		 * If the packet is routed over IPsec tunnel, tell the
		 * originator the tunnel MTU.
		 *	tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
		 * XXX quickhack!!!
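		 *
		 * Worked figures (illustrative only): a 1500-byte
		 * interface MTU, a 20-byte outer IP header and roughly
		 * 36 bytes of ESP overhead would yield a next-hop MTU
		 * of about 1444 in the ICMP_UNREACH_NEEDFRAG reply.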
		 */
		sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND,
		    IP_FORWARDING, &ipsecerror);

		if (sp == NULL)
			break;

		/*
		 * find the correct route for outer IPv4
		 * header, compute tunnel MTU.
		 */
		nextmtu = 0;

		if (sp->req != NULL &&
		    sp->req->saidx.mode == IPSEC_MODE_TUNNEL) {
			struct secasindex saidx;
			struct secasvar *sav;
			struct route *ro;
			struct ip *ipm;
			int ipsechdr;

			/* count IPsec header size */
			ipsechdr = ipsec_hdrsiz(sp);

			ipm = mtod(mcopy, struct ip *);
			bcopy(&sp->req->saidx, &saidx, sizeof (saidx));
			saidx.mode = sp->req->saidx.mode;
			saidx.reqid = sp->req->saidx.reqid;
			sin = SIN(&saidx.src);
			if (sin->sin_len == 0) {
				sin->sin_len = sizeof (*sin);
				sin->sin_family = AF_INET;
				sin->sin_port = IPSEC_PORT_ANY;
				bcopy(&ipm->ip_src, &sin->sin_addr,
				    sizeof (sin->sin_addr));
			}
			sin = SIN(&saidx.dst);
			if (sin->sin_len == 0) {
				sin->sin_len = sizeof (*sin);
				sin->sin_family = AF_INET;
				sin->sin_port = IPSEC_PORT_ANY;
				bcopy(&ipm->ip_dst, &sin->sin_addr,
				    sizeof (sin->sin_addr));
			}
			sav = key_allocsa_policy(&saidx);
			if (sav != NULL) {
				lck_mtx_lock(sadb_mutex);
				if (sav->sah != NULL) {
					ro = &sav->sah->sa_route;
					if (ro->ro_rt != NULL) {
						RT_LOCK(ro->ro_rt);
						if (ro->ro_rt->rt_ifp != NULL) {
							nextmtu = ro->ro_rt->
							    rt_ifp->if_mtu;
							nextmtu -= ipsechdr;
						}
						RT_UNLOCK(ro->ro_rt);
					}
				}
				key_freesav(sav, KEY_SADB_LOCKED);
				lck_mtx_unlock(sadb_mutex);
			}
		}
		key_freesp(sp, KEY_SADB_UNLOCKED);
#endif /* IPSEC */
		break;

	case ENOBUFS:
		/*
		 * RFC 1812 (Requirements for IP Version 4 Routers) says
		 * that a router should not generate ICMP_SOURCEQUENCH.
		 * Source quench could be a big problem under DoS attacks,
		 * or if the underlying interface is rate-limited.
		 * Those who need source quench packets may re-enable them
		 * via the net.inet.ip.sendsourcequench sysctl.
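		 *
		 * e.g., from user space (illustrative):
		 *
		 *	sysctl -w net.inet.ip.sendsourcequench=1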
		 */
		if (ip_sendsourcequench == 0) {
			m_freem(mcopy);
			goto done;
		} else {
			type = ICMP_SOURCEQUENCH;
			code = 0;
		}
		break;

	case EACCES:			/* ipfw denied packet */
		m_freem(mcopy);
		goto done;
	}

	if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG)
		OSAddAtomic(1, &ipstat.ips_cantfrag);

	icmp_error(mcopy, type, code, dest, nextmtu);
done:
	ip_fwd_route_copyin(rcvifp, &fwd_rt);
}

int
ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
    struct mbuf *m)
{
	*mp = NULL;
	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
		struct timeval tv;

		getmicrotime(&tv);
		mp = sbcreatecontrol_mbuf((caddr_t)&tv, sizeof (tv),
		    SCM_TIMESTAMP, SOL_SOCKET, mp);
		if (*mp == NULL) {
			goto no_mbufs;
		}
	}
	if (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) {
		uint64_t time;

		time = mach_absolute_time();
		mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof (time),
		    SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp);
		if (*mp == NULL) {
			goto no_mbufs;
		}
	}
	if (inp->inp_flags & INP_RECVDSTADDR) {
		mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst,
		    sizeof (struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp);
		if (*mp == NULL) {
			goto no_mbufs;
		}
	}
#ifdef notyet
	/*
	 * XXX
	 * Moving these out of udp_input() made them even more broken
	 * than they already were.
	 */
	/* options were tossed already */
	if (inp->inp_flags & INP_RECVOPTS) {
		mp = sbcreatecontrol_mbuf((caddr_t)opts_deleted_above,
		    sizeof (struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp);
		if (*mp == NULL) {
			goto no_mbufs;
		}
	}
	/* ip_srcroute doesn't do what we want here, need to fix */
	if (inp->inp_flags & INP_RECVRETOPTS) {
		mp = sbcreatecontrol_mbuf((caddr_t)ip_srcroute(),
		    sizeof (struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp);
		if (*mp == NULL) {
			goto no_mbufs;
		}
	}
#endif /* notyet */
	if (inp->inp_flags & INP_RECVIF) {
		struct ifnet *ifp;
		uint8_t sdlbuf[SOCK_MAXADDRLEN + 1];
		struct sockaddr_dl *sdl2 = SDL(&sdlbuf);

		/*
		 * Make sure to accommodate the largest possible
		 * size of SA(if_lladdr)->sa_len.
		 */
		_CASSERT(sizeof (sdlbuf) == (SOCK_MAXADDRLEN + 1));

		ifnet_head_lock_shared();
		if ((ifp = m->m_pkthdr.rcvif) != NULL &&
		    ifp->if_index && (ifp->if_index <= if_index)) {
			struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1];
			struct sockaddr_dl *sdp;

			if (!ifa || !ifa->ifa_addr)
				goto makedummy;

			IFA_LOCK_SPIN(ifa);
			sdp = SDL(ifa->ifa_addr);
			/*
			 * Change our mind and don't try to copy.
3046 */ 3047 if (sdp->sdl_family != AF_LINK) { 3048 IFA_UNLOCK(ifa); 3049 goto makedummy; 3050 } 3051 /* the above _CASSERT ensures sdl_len fits in sdlbuf */ 3052 bcopy(sdp, sdl2, sdp->sdl_len); 3053 IFA_UNLOCK(ifa); 3054 } else { 3055makedummy: 3056 sdl2->sdl_len = 3057 offsetof(struct sockaddr_dl, sdl_data[0]); 3058 sdl2->sdl_family = AF_LINK; 3059 sdl2->sdl_index = 0; 3060 sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; 3061 } 3062 ifnet_head_done(); 3063 mp = sbcreatecontrol_mbuf((caddr_t)sdl2, sdl2->sdl_len, 3064 IP_RECVIF, IPPROTO_IP, mp); 3065 if (*mp == NULL) { 3066 goto no_mbufs; 3067 } 3068 } 3069 if (inp->inp_flags & INP_RECVTTL) { 3070 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl, 3071 sizeof (ip->ip_ttl), IP_RECVTTL, IPPROTO_IP, mp); 3072 if (*mp == NULL) { 3073 goto no_mbufs; 3074 } 3075 } 3076 if (inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) { 3077 int tc = m_get_traffic_class(m); 3078 3079 mp = sbcreatecontrol_mbuf((caddr_t)&tc, sizeof (tc), 3080 SO_TRAFFIC_CLASS, SOL_SOCKET, mp); 3081 if (*mp == NULL) { 3082 goto no_mbufs; 3083 } 3084 } 3085 if (inp->inp_flags & INP_PKTINFO) { 3086 struct in_pktinfo pi; 3087 3088 bzero(&pi, sizeof (struct in_pktinfo)); 3089 bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof (struct in_addr)); 3090 pi.ipi_ifindex = (m != NULL && m->m_pkthdr.rcvif != NULL) ? 3091 m->m_pkthdr.rcvif->if_index : 0; 3092 3093 mp = sbcreatecontrol_mbuf((caddr_t)&pi, 3094 sizeof (struct in_pktinfo), IP_RECVPKTINFO, IPPROTO_IP, mp); 3095 if (*mp == NULL) { 3096 goto no_mbufs; 3097 } 3098 } 3099 return (0); 3100 3101no_mbufs: 3102 ipstat.ips_pktdropcntrl++; 3103 return (ENOBUFS); 3104} 3105 3106static inline u_short 3107ip_cksum(struct mbuf *m, int hlen) 3108{ 3109 u_short sum; 3110 3111 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 3112 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); 3113 } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) && 3114 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) { 3115 /* 3116 * The packet arrived on an interface which isn't capable 3117 * of performing IP header checksum; compute it now. 3118 */ 3119 sum = ip_cksum_hdr_in(m, hlen); 3120 } else { 3121 sum = 0; 3122 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | 3123 CSUM_IP_CHECKED | CSUM_IP_VALID); 3124 m->m_pkthdr.csum_data = 0xffff; 3125 } 3126 3127 if (sum != 0) 3128 OSAddAtomic(1, &ipstat.ips_badsum); 3129 3130 return (sum); 3131} 3132 3133static int 3134ip_getstat SYSCTL_HANDLER_ARGS 3135{ 3136#pragma unused(oidp, arg1, arg2) 3137 if (req->oldptr == USER_ADDR_NULL) 3138 req->oldlen = (size_t)sizeof (struct ipstat); 3139 3140 return (SYSCTL_OUT(req, &ipstat, MIN(sizeof (ipstat), req->oldlen))); 3141} 3142 3143void 3144ip_setsrcifaddr_info(struct mbuf *m, uint32_t src_idx, struct in_ifaddr *ia) 3145{ 3146 VERIFY(m->m_flags & M_PKTHDR); 3147 3148 /* 3149 * If the source ifaddr is specified, pick up the information 3150 * from there; otherwise just grab the passed-in ifindex as the 3151 * caller may not have the ifaddr available. 
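	 *
	 * Illustrative calls (a sketch; the caller and its values are
	 * assumed):
	 *
	 *	ip_setsrcifaddr_info(m, ifp->if_index, NULL);
	 *	ip_setsrcifaddr_info(m, 0, ia);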
	 */
	if (ia != NULL) {
		m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
		m->m_pkthdr.src_ifindex = ia->ia_ifp->if_index;
	} else {
		m->m_pkthdr.src_ifindex = src_idx;
		if (src_idx != 0)
			m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
	}
}

void
ip_setdstifaddr_info(struct mbuf *m, uint32_t dst_idx, struct in_ifaddr *ia)
{
	VERIFY(m->m_flags & M_PKTHDR);

	/*
	 * If the destination ifaddr is specified, pick up the information
	 * from there; otherwise just grab the passed-in ifindex as the
	 * caller may not have the ifaddr available.
	 */
	if (ia != NULL) {
		m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
		m->m_pkthdr.dst_ifindex = ia->ia_ifp->if_index;
	} else {
		m->m_pkthdr.dst_ifindex = dst_idx;
		if (dst_idx != 0)
			m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
	}
}

int
ip_getsrcifaddr_info(struct mbuf *m, uint32_t *src_idx, uint32_t *iaf)
{
	VERIFY(m->m_flags & M_PKTHDR);

	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
		return (-1);

	if (src_idx != NULL)
		*src_idx = m->m_pkthdr.src_ifindex;

	if (iaf != NULL)
		*iaf = 0;

	return (0);
}

int
ip_getdstifaddr_info(struct mbuf *m, uint32_t *dst_idx, uint32_t *iaf)
{
	VERIFY(m->m_flags & M_PKTHDR);

	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
		return (-1);

	if (dst_idx != NULL)
		*dst_idx = m->m_pkthdr.dst_ifindex;

	if (iaf != NULL)
		*iaf = 0;

	return (0);
}

/*
 * Protocol input handler for IPPROTO_GRE.
 */
void
gre_input(struct mbuf *m, int off)
{
	gre_input_func_t fn = gre_input_func;

	/*
	 * If there is a registered GRE input handler, pass mbuf to it.
	 */
	if (fn != NULL) {
		lck_mtx_unlock(inet_domain_mutex);
		m = fn(m, off, (mtod(m, struct ip *))->ip_p);
		lck_mtx_lock(inet_domain_mutex);
	}

	/*
	 * If no matching tunnel that is up is found, inject the mbuf
	 * into the raw IP socket to see if anyone picks it up.
	 */
	if (m != NULL)
		rip_input(m, off);
}

/*
 * Private KPI for PPP/PPTP.
 */
int
ip_gre_register_input(gre_input_func_t fn)
{
	lck_mtx_lock(inet_domain_mutex);
	gre_input_func = fn;
	lck_mtx_unlock(inet_domain_mutex);

	return (0);
}
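/*
 * Example use of the GRE KPI above (an illustrative sketch; the
 * handler name and body are assumed and not part of this file).
 * The registered function receives the mbuf, the IP header offset
 * and the protocol, and returns NULL once it has consumed the mbuf,
 * or returns the mbuf to let it fall through to rip_input():
 *
 *	static struct mbuf *
 *	my_gre_input(struct mbuf *m, int off, int proto)
 *	{
 *		(decapsulate here; return NULL if consumed)
 *		return (m);
 *	}
 *
 *	(then, at driver attach time)
 *	(void) ip_gre_register_input(my_gre_input);
 */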