1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. 
Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 61 */ 62/* 63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 64 * support for mandatory and extensible security protections. This notice 65 * is included in support of clause 2.2 (b) of the Apple Public License, 66 * Version 2.0. 
67 */ 68 69#define _IP_VHL 70 71#include <sys/param.h> 72#include <sys/systm.h> 73#include <sys/kernel.h> 74#include <sys/malloc.h> 75#include <sys/mbuf.h> 76#include <sys/protosw.h> 77#include <sys/socket.h> 78#include <sys/socketvar.h> 79#include <kern/locks.h> 80#include <sys/sysctl.h> 81#include <sys/mcache.h> 82#include <sys/kdebug.h> 83 84#include <machine/endian.h> 85#include <pexpert/pexpert.h> 86#include <mach/sdt.h> 87 88#include <libkern/OSAtomic.h> 89#include <libkern/OSByteOrder.h> 90 91#include <net/if.h> 92#include <net/if_dl.h> 93#include <net/if_types.h> 94#include <net/route.h> 95#include <net/ntstat.h> 96#include <net/net_osdep.h> 97#include <net/dlil.h> 98 99#include <netinet/in.h> 100#include <netinet/in_systm.h> 101#include <netinet/ip.h> 102#include <netinet/in_pcb.h> 103#include <netinet/in_var.h> 104#include <netinet/ip_var.h> 105#include <netinet/kpi_ipfilter_var.h> 106 107#if CONFIG_MACF_NET 108#include <security/mac_framework.h> 109#endif /* CONFIG_MACF_NET */ 110 111#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) 112#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) 113#define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1) 114#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1) 115 116#if IPSEC 117#include <netinet6/ipsec.h> 118#include <netkey/key.h> 119#if IPSEC_DEBUG 120#include <netkey/key_debug.h> 121#else 122#define KEYDEBUG(lev, arg) 123#endif 124#endif /* IPSEC */ 125 126#if NECP 127#include <net/necp.h> 128#endif /* NECP */ 129 130#if IPFIREWALL 131#include <netinet/ip_fw.h> 132#if IPDIVERT 133#include <netinet/ip_divert.h> 134#endif /* IPDIVERT */ 135#endif /* IPFIREWALL */ 136 137#if DUMMYNET 138#include <netinet/ip_dummynet.h> 139#endif 140 141#if PF 142#include <net/pfvar.h> 143#endif /* PF */ 144 145#if IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG 146#define print_ip(a) \ 147 printf("%ld.%ld.%ld.%ld", (ntohl(a.s_addr) >> 24) & 0xFF, \ 148 (ntohl(a.s_addr) >> 16) & 0xFF, \ 149 (ntohl(a.s_addr) >> 8) & 
0xFF, \ 150 (ntohl(a.s_addr)) & 0xFF); 151#endif /* IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG */ 152 153u_short ip_id; 154 155static void ip_out_cksum_stats(int, u_int32_t); 156static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); 157static int ip_optcopy(struct ip *, struct ip *); 158static int ip_pcbopts(int, struct mbuf **, struct mbuf *); 159static void imo_trace(struct ip_moptions *, int); 160static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *, 161 struct sockaddr_in *, int); 162static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); 163 164extern struct ip_linklocal_stat ip_linklocal_stat; 165 166/* temporary: for testing */ 167#if IPSEC 168extern int ipsec_bypass; 169#endif 170 171static int ip_maxchainsent = 0; 172SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, 173 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, 174 "use dlil_output_list"); 175#if DEBUG 176static int forge_ce = 0; 177SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, 178 CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0, 179 "Forge ECN CE"); 180#endif /* DEBUG */ 181 182static int ip_select_srcif_debug = 0; 183SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, 184 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0, 185 "log source interface selection debug info"); 186 187#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ 188 189/* For gdb */ 190__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE; 191 192struct ip_moptions_dbg { 193 struct ip_moptions imo; /* ip_moptions */ 194 u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */ 195 u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */ 196 /* 197 * Alloc and free callers. 198 */ 199 ctrace_t imo_alloc; 200 ctrace_t imo_free; 201 /* 202 * Circular lists of IMO_ADDREF and IMO_REMREF callers. 
203 */ 204 ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE]; 205 ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE]; 206}; 207 208#if DEBUG 209static unsigned int imo_debug = 1; /* debugging (enabled) */ 210#else 211static unsigned int imo_debug; /* debugging (disabled) */ 212#endif /* !DEBUG */ 213static unsigned int imo_size; /* size of zone element */ 214static struct zone *imo_zone; /* zone for ip_moptions */ 215 216#define IMO_ZONE_MAX 64 /* maximum elements in zone */ 217#define IMO_ZONE_NAME "ip_moptions" /* zone name */ 218 219/* 220 * IP output. The packet in mbuf chain m0 contains a skeletal IP 221 * header (with len, off, ttl, proto, tos, src, dst). 222 * The mbuf chain containing the packet will be freed. 223 * The mbuf opt, if present, will not be freed. * * Single-packet entry point: every argument is forwarded unchanged to * ip_output_list() with its packetchain argument set to 0, and the * value returned by ip_output_list() is returned to the caller as is. 224 */ 225int 226ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, 227 struct ip_moptions *imo, struct ip_out_args *ipoa) 228{ 229 return (ip_output_list(m0, 0, opt, ro, flags, imo, ipoa)); 230} 231 232/* 233 * IP output. The packet in mbuf chain m contains a skeletal IP 234 * header (with len, off, ttl, proto, tos, src, dst). 235 * The mbuf chain containing the packet will be freed. 236 * The mbuf opt, if present, will not be freed. 237 * 238 * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be 239 * skipped and ro->ro_rt would be used. Otherwise the result of route 240 * lookup is stored in ro->ro_rt. 241 * 242 * In the IP forwarding case, the packet will arrive with options already 243 * inserted, so must have a NULL opt pointer.
244 */ 245int 246ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, 247 struct route *ro, int flags, struct ip_moptions *imo, 248 struct ip_out_args *ipoa) 249{ 250 struct ip *ip; 251 struct ifnet *ifp = NULL; /* not refcnt'd */ 252 struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt; 253 int hlen = sizeof (struct ip); 254 int len = 0, error = 0; 255 struct sockaddr_in *dst = NULL; 256 struct in_ifaddr *ia = NULL, *src_ia = NULL; 257 struct in_addr pkt_dst; 258 struct ipf_pktopts *ippo = NULL; 259 ipfilter_t inject_filter_ref = NULL; 260 struct mbuf *packetlist; 261 uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0; 262 unsigned int ifscope = IFSCOPE_NONE; 263 struct flowadv *adv = NULL; 264#if IPSEC 265 struct socket *so = NULL; 266 struct secpolicy *sp = NULL; 267#endif /* IPSEC */ 268#if NECP 269 necp_kernel_policy_result necp_result = 0; 270 necp_kernel_policy_result_parameter necp_result_parameter; 271 necp_kernel_policy_id necp_matched_policy_id = 0; 272#endif /* NECP */ 273#if IPFIREWALL 274 int ipfwoff; 275 struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; 276#endif /* IPFIREWALL */ 277#if IPFIREWALL || DUMMYNET 278 struct m_tag *tag; 279#endif /* IPFIREWALL || DUMMYNET */ 280#if DUMMYNET 281 struct ip_out_args saved_ipoa; 282 struct sockaddr_in dst_buf; 283#endif /* DUMMYNET */ 284 struct { 285#if IPSEC 286 struct ipsec_output_state ipsec_state; 287#endif /* IPSEC */ 288#if NECP 289 struct route necp_route; 290#endif /* NECP */ 291#if IPFIREWALL || DUMMYNET 292 struct ip_fw_args args; 293#endif /* IPFIREWALL || DUMMYNET */ 294#if IPFIREWALL_FORWARD 295 struct route sro_fwd; 296#endif /* IPFIREWALL_FORWARD */ 297#if DUMMYNET 298 struct route saved_route; 299#endif /* DUMMYNET */ 300 struct ipf_pktopts ipf_pktopts; 301 } ipobz; 302#define ipsec_state ipobz.ipsec_state 303#define necp_route ipobz.necp_route 304#define args ipobz.args 305#define sro_fwd ipobz.sro_fwd 306#define saved_route ipobz.saved_route 307#define ipf_pktopts 
ipobz.ipf_pktopts 308 union { 309 struct { 310 boolean_t select_srcif : 1; /* set once */ 311 boolean_t srcbound : 1; /* set once */ 312 boolean_t nocell : 1; /* set once */ 313 boolean_t isbroadcast : 1; 314 boolean_t didfilter : 1; 315 boolean_t noexpensive : 1; /* set once */ 316 boolean_t awdl_unrestricted : 1; /* set once */ 317#if IPFIREWALL_FORWARD 318 boolean_t fwd_rewrite_src : 1; 319#endif /* IPFIREWALL_FORWARD */ 320 }; 321 uint32_t raw; 322 } ipobf = { .raw = 0 }; 323 324#define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \ 325 (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \ 326 ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \ 327 (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp))) 328 329 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); 330 331 VERIFY(m0->m_flags & M_PKTHDR); 332 packetlist = m0; 333 334 /* zero out {ipsec_state, args, sro_fwd, saved_route, ipf_pktops} */ 335 bzero(&ipobz, sizeof (ipobz)); 336 ippo = &ipf_pktopts; 337 338#if IPFIREWALL || DUMMYNET 339 if (SLIST_EMPTY(&m0->m_pkthdr.tags)) 340 goto ipfw_tags_done; 341 342 /* Grab info from mtags prepended to the chain */ 343#if DUMMYNET 344 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, 345 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { 346 struct dn_pkt_tag *dn_tag; 347 348 dn_tag = (struct dn_pkt_tag *)(tag+1); 349 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; 350 args.fwa_pf_rule = dn_tag->dn_pf_rule; 351 opt = NULL; 352 saved_route = dn_tag->dn_ro; 353 ro = &saved_route; 354 355 imo = NULL; 356 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof (dst_buf)); 357 dst = &dst_buf; 358 ifp = dn_tag->dn_ifp; 359 flags = dn_tag->dn_flags; 360 if ((dn_tag->dn_flags & IP_OUTARGS)) { 361 saved_ipoa = dn_tag->dn_ipoa; 362 ipoa = &saved_ipoa; 363 } 364 365 m_tag_delete(m0, tag); 366 } 367#endif /* DUMMYNET */ 368 369#if IPDIVERT 370 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, 371 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { 372 struct divert_tag *div_tag; 373 374 div_tag = 
(struct divert_tag *)(tag+1); 375 args.fwa_divert_rule = div_tag->cookie; 376 377 m_tag_delete(m0, tag); 378 } 379#endif /* IPDIVERT */ 380 381#if IPFIREWALL 382 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, 383 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { 384 struct ip_fwd_tag *ipfwd_tag; 385 386 ipfwd_tag = (struct ip_fwd_tag *)(tag+1); 387 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; 388 389 m_tag_delete(m0, tag); 390 } 391#endif /* IPFIREWALL */ 392 393ipfw_tags_done: 394#endif /* IPFIREWALL || DUMMYNET */ 395 396 m = m0; 397 m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP|PKTF_IFAINFO); 398 399#if IPSEC 400 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { 401 /* If packet is bound to an interface, check bound policies */ 402 if ((flags & IP_OUTARGS) && (ipoa != NULL) && 403 (ipoa->ipoa_flags & IPOAF_BOUND_IF) && 404 ipoa->ipoa_boundif != IFSCOPE_NONE) { 405 if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND, 406 &flags, ipoa, &sp) != 0) 407 goto bad; 408 } 409 } 410#endif /* IPSEC */ 411 412 VERIFY(ro != NULL); 413 414 if (ip_doscopedroute && (flags & IP_OUTARGS)) { 415 /* 416 * In the forwarding case, only the ifscope value is used, 417 * as source interface selection doesn't take place. 
418 */ 419 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) && 420 (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) { 421 ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF; 422 } 423 424 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) && 425 ipoa->ipoa_boundif != IFSCOPE_NONE) { 426 ifscope = ipoa->ipoa_boundif; 427 ipf_pktopts.ippo_flags |= 428 (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE)); 429 } 430 431 /* double negation needed for bool bit field */ 432 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR); 433 if (ipobf.srcbound) 434 ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR; 435 } else { 436 ipobf.select_srcif = FALSE; 437 ipobf.srcbound = FALSE; 438 ifscope = IFSCOPE_NONE; 439 if (flags & IP_OUTARGS) { 440 ipoa->ipoa_boundif = IFSCOPE_NONE; 441 ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF | 442 IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR); 443 } 444 } 445 446 if (flags & IP_OUTARGS) { 447 if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) { 448 ipobf.nocell = TRUE; 449 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; 450 } 451 if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) { 452 ipobf.noexpensive = TRUE; 453 ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE; 454 } 455 if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) 456 ipobf.awdl_unrestricted = TRUE; 457 adv = &ipoa->ipoa_flowadv; 458 adv->code = FADV_SUCCESS; 459 ipoa->ipoa_retflags = 0; 460 } 461 462#if IPSEC 463 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { 464 so = ipsec_getsocket(m); 465 if (so != NULL) { 466 (void) ipsec_setsocket(m, NULL); 467 } 468 } 469#endif /* IPSEC */ 470 471#if DUMMYNET 472 if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) { 473 /* dummynet already saw us */ 474 ip = mtod(m, struct ip *); 475 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 476 pkt_dst = ip->ip_dst; 477 if (ro->ro_rt != NULL) { 478 RT_LOCK_SPIN(ro->ro_rt); 479 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; 480 if (ia) { 481 /* Become a regular mutex */ 482 RT_CONVERT_LOCK(ro->ro_rt); 483 IFA_ADDREF(&ia->ia_ifa); 484 } 485 
RT_UNLOCK(ro->ro_rt); 486 } 487 488#if IPFIREWALL 489 if (args.fwa_ipfw_rule != NULL) 490 goto skip_ipsec; 491#endif /* IPFIREWALL */ 492 if (args.fwa_pf_rule != NULL) 493 goto sendit; 494 } 495#endif /* DUMMYNET */ 496 497loopit: 498 ipobf.isbroadcast = FALSE; 499 ipobf.didfilter = FALSE; 500#if IPFIREWALL_FORWARD 501 ipobf.fwd_rewrite_src = FALSE; 502#endif /* IPFIREWALL_FORWARD */ 503 504 VERIFY(m->m_flags & M_PKTHDR); 505 /* 506 * No need to proccess packet twice if we've already seen it. 507 */ 508 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) 509 inject_filter_ref = ipf_get_inject_filter(m); 510 else 511 inject_filter_ref = NULL; 512 513 if (opt) { 514 m = ip_insertoptions(m, opt, &len); 515 hlen = len; 516 /* Update the chain */ 517 if (m != m0) { 518 if (m0 == packetlist) 519 packetlist = m; 520 m0 = m; 521 } 522 } 523 ip = mtod(m, struct ip *); 524 525#if IPFIREWALL 526 /* 527 * rdar://8542331 528 * 529 * When dealing with a packet chain, we need to reset "next_hop" 530 * because "dst" may have been changed to the gateway address below 531 * for the previous packet of the chain. This could cause the route 532 * to be inavertandly changed to the route to the gateway address 533 * (instead of the route to the destination). 534 */ 535 args.fwa_next_hop = next_hop_from_ipfwd_tag; 536 pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst; 537#else /* !IPFIREWALL */ 538 pkt_dst = ip->ip_dst; 539#endif /* !IPFIREWALL */ 540 541 /* 542 * We must not send if the packet is destined to network zero. 543 * RFC1122 3.2.1.3 (a) and (b). 544 */ 545 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) { 546 error = EHOSTUNREACH; 547 goto bad; 548 } 549 550 /* 551 * Fill in IP header. 
552 */ 553 if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) { 554 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); 555 ip->ip_off &= IP_DF; 556 ip->ip_id = ip_randomid(); 557 OSAddAtomic(1, &ipstat.ips_localout); 558 } else { 559 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 560 } 561 562#if DEBUG 563 /* For debugging, we let the stack forge congestion */ 564 if (forge_ce != 0 && 565 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 || 566 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) { 567 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE; 568 forge_ce--; 569 } 570#endif /* DEBUG */ 571 572 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, 573 ip->ip_p, ip->ip_off, ip->ip_len); 574 575 dst = SIN(&ro->ro_dst); 576 577 /* 578 * If there is a cached route, 579 * check that it is to the same destination 580 * and is still up. If not, free it and try again. 581 * The address family should also be checked in case of sharing the 582 * cache with IPv6. 583 */ 584 585 if (ro->ro_rt != NULL) { 586 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY && 587 !(flags & (IP_ROUTETOIF | IP_FORWARDING))) { 588 src_ia = ifa_foraddr(ip->ip_src.s_addr); 589 if (src_ia == NULL) { 590 error = EADDRNOTAVAIL; 591 goto bad; 592 } 593 IFA_REMREF(&src_ia->ia_ifa); 594 src_ia = NULL; 595 } 596 /* 597 * Test rt_flags without holding rt_lock for performance 598 * reasons; if the route is down it will hopefully be 599 * caught by the layer below (since it uses this route 600 * as a hint) or during the next transmit. 601 */ 602 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET || 603 dst->sin_addr.s_addr != pkt_dst.s_addr) 604 ROUTE_RELEASE(ro); 605 606 /* 607 * If we're doing source interface selection, we may not 608 * want to use this route; only synch up the generation 609 * count otherwise. 
610 */ 611 if (!ipobf.select_srcif && ro->ro_rt != NULL && 612 RT_GENID_OUTOFSYNC(ro->ro_rt)) 613 RT_GENID_SYNC(ro->ro_rt); 614 } 615 if (ro->ro_rt == NULL) { 616 bzero(dst, sizeof (*dst)); 617 dst->sin_family = AF_INET; 618 dst->sin_len = sizeof (*dst); 619 dst->sin_addr = pkt_dst; 620 } 621 /* 622 * If routing to interface only, 623 * short circuit routing lookup. 624 */ 625 if (flags & IP_ROUTETOIF) { 626 if (ia != NULL) 627 IFA_REMREF(&ia->ia_ifa); 628 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { 629 ia = ifatoia(ifa_ifwithnet(sintosa(dst))); 630 if (ia == NULL) { 631 OSAddAtomic(1, &ipstat.ips_noroute); 632 error = ENETUNREACH; 633 goto bad; 634 } 635 } 636 ifp = ia->ia_ifp; 637 ip->ip_ttl = 1; 638 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp); 639 /* 640 * For consistency with other cases below. Loopback 641 * multicast case is handled separately by ip_mloopback(). 642 */ 643 if ((ifp->if_flags & IFF_LOOPBACK) && 644 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) { 645 m->m_pkthdr.rcvif = ifp; 646 ip_setsrcifaddr_info(m, ifp->if_index, NULL); 647 ip_setdstifaddr_info(m, ifp->if_index, NULL); 648 } 649 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && 650 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { 651 /* 652 * Bypass the normal routing lookup for multicast 653 * packets if the interface is specified. 654 */ 655 ipobf.isbroadcast = FALSE; 656 if (ia != NULL) 657 IFA_REMREF(&ia->ia_ifa); 658 659 /* Macro takes reference on ia */ 660 IFP_TO_IA(ifp, ia); 661 } else { 662 struct ifaddr *ia0 = NULL; 663 boolean_t cloneok = FALSE; 664 /* 665 * Perform source interface selection; the source IP address 666 * must belong to one of the addresses of the interface used 667 * by the route. For performance reasons, do this only if 668 * there is no route, or if the routing table has changed, 669 * or if we haven't done source interface selection on this 670 * route (for this PCB instance) before. 
671 */ 672 if (ipobf.select_srcif && 673 ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) || 674 !(ro->ro_flags & ROF_SRCIF_SELECTED))) { 675 /* Find the source interface */ 676 ia0 = in_selectsrcif(ip, ro, ifscope); 677 678 /* 679 * If the source address belongs to a restricted 680 * interface and the caller forbids our using 681 * interfaces of such type, pretend that there is no 682 * route. 683 */ 684 if (ia0 != NULL && 685 IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) { 686 IFA_REMREF(ia0); 687 ia0 = NULL; 688 error = EHOSTUNREACH; 689 if (flags & IP_OUTARGS) 690 ipoa->ipoa_retflags |= IPOARF_IFDENIED; 691 goto bad; 692 } 693 694 /* 695 * If the source address is spoofed (in the case of 696 * IP_RAWOUTPUT on an unbounded socket), or if this 697 * is destined for local/loopback, just let it go out 698 * using the interface of the route. Otherwise, 699 * there's no interface having such an address, 700 * so bail out. 701 */ 702 if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) || 703 ipobf.srcbound) && ifscope != lo_ifp->if_index) { 704 error = EADDRNOTAVAIL; 705 goto bad; 706 } 707 708 /* 709 * If the caller didn't explicitly specify the scope, 710 * pick it up from the source interface. If the cached 711 * route was wrong and was blown away as part of source 712 * interface selection, don't mask out RTF_PRCLONING 713 * since that route may have been allocated by the ULP, 714 * unless the IP header was created by the caller or 715 * the destination is IPv4 LLA. The check for the 716 * latter is needed because IPv4 LLAs are never scoped 717 * in the current implementation, and we don't want to 718 * replace the resolved IPv4 LLA route with one whose 719 * gateway points to that of the default gateway on 720 * the primary interface of the system. 
721 */ 722 if (ia0 != NULL) { 723 if (ifscope == IFSCOPE_NONE) 724 ifscope = ia0->ifa_ifp->if_index; 725 cloneok = (!(flags & IP_RAWOUTPUT) && 726 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); 727 } 728 } 729 730 /* 731 * If this is the case, we probably don't want to allocate 732 * a protocol-cloned route since we didn't get one from the 733 * ULP. This lets TCP do its thing, while not burdening 734 * forwarding or ICMP with the overhead of cloning a route. 735 * Of course, we still want to do any cloning requested by 736 * the link layer, as this is probably required in all cases 737 * for correct operation (as it is for ARP). 738 */ 739 if (ro->ro_rt == NULL) { 740 unsigned long ign = RTF_PRCLONING; 741 /* 742 * We make an exception here: if the destination 743 * address is INADDR_BROADCAST, allocate a protocol- 744 * cloned host route so that we end up with a route 745 * marked with the RTF_BROADCAST flag. Otherwise, 746 * we would end up referring to the default route, 747 * instead of creating a cloned host route entry. 748 * That would introduce inconsistencies between ULPs 749 * that allocate a route and those that don't. The 750 * RTF_BROADCAST route is important since we'd want 751 * to send out undirected IP broadcast packets using 752 * link-level broadcast address. Another exception 753 * is for ULP-created routes that got blown away by 754 * source interface selection (see above). 755 * 756 * These exceptions will no longer be necessary when 757 * the RTF_PRCLONING scheme is no longer present. 758 */ 759 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) 760 ign &= ~RTF_PRCLONING; 761 762 /* 763 * Loosen the route lookup criteria if the ifscope 764 * corresponds to the loopback interface; this is 765 * needed to support Application Layer Gateways 766 * listening on loopback, in conjunction with packet 767 * filter redirection rules. 
The final source IP 768 * address will be rewritten by the packet filter 769 * prior to the RFC1122 loopback check below. 770 */ 771 if (ifscope == lo_ifp->if_index) 772 rtalloc_ign(ro, ign); 773 else 774 rtalloc_scoped_ign(ro, ign, ifscope); 775 776 /* 777 * If the route points to a cellular/expensive interface 778 * and the caller forbids our using interfaces of such type, 779 * pretend that there is no route. 780 */ 781 if (ro->ro_rt != NULL) { 782 RT_LOCK_SPIN(ro->ro_rt); 783 if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp, 784 ipobf)) { 785 RT_UNLOCK(ro->ro_rt); 786 ROUTE_RELEASE(ro); 787 if (flags & IP_OUTARGS) { 788 ipoa->ipoa_retflags |= 789 IPOARF_IFDENIED; 790 } 791 } else { 792 RT_UNLOCK(ro->ro_rt); 793 } 794 } 795 } 796 797 if (ro->ro_rt == NULL) { 798 OSAddAtomic(1, &ipstat.ips_noroute); 799 error = EHOSTUNREACH; 800 if (ia0 != NULL) { 801 IFA_REMREF(ia0); 802 ia0 = NULL; 803 } 804 goto bad; 805 } 806 807 if (ia != NULL) 808 IFA_REMREF(&ia->ia_ifa); 809 RT_LOCK_SPIN(ro->ro_rt); 810 ia = ifatoia(ro->ro_rt->rt_ifa); 811 if (ia != NULL) { 812 /* Become a regular mutex */ 813 RT_CONVERT_LOCK(ro->ro_rt); 814 IFA_ADDREF(&ia->ia_ifa); 815 } 816 /* 817 * Note: ia_ifp may not be the same as rt_ifp; the latter 818 * is what we use for determining outbound i/f, mtu, etc. 819 */ 820 ifp = ro->ro_rt->rt_ifp; 821 ro->ro_rt->rt_use++; 822 if (ro->ro_rt->rt_flags & RTF_GATEWAY) { 823 dst = SIN(ro->ro_rt->rt_gateway); 824 } 825 if (ro->ro_rt->rt_flags & RTF_HOST) { 826 /* double negation needed for bool bit field */ 827 ipobf.isbroadcast = 828 !!(ro->ro_rt->rt_flags & RTF_BROADCAST); 829 } else { 830 /* Become a regular mutex */ 831 RT_CONVERT_LOCK(ro->ro_rt); 832 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp); 833 } 834 /* 835 * For consistency with IPv6, as well as to ensure that 836 * IP_RECVIF is set correctly for packets that are sent 837 * to one of the local addresses. ia (rt_ifa) would have 838 * been fixed up by rt_setif for local routes. 
This 839 * would make it appear as if the packet arrives on the 840 * interface which owns the local address. Loopback 841 * multicast case is handled separately by ip_mloopback(). 842 */ 843 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) && 844 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) { 845 uint32_t srcidx; 846 847 m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp; 848 849 if (ia0 != NULL) 850 srcidx = ia0->ifa_ifp->if_index; 851 else if ((ro->ro_flags & ROF_SRCIF_SELECTED) && 852 ro->ro_srcia != NULL) 853 srcidx = ro->ro_srcia->ifa_ifp->if_index; 854 else 855 srcidx = 0; 856 857 ip_setsrcifaddr_info(m, srcidx, NULL); 858 ip_setdstifaddr_info(m, 0, ia); 859 } 860 RT_UNLOCK(ro->ro_rt); 861 if (ia0 != NULL) { 862 IFA_REMREF(ia0); 863 ia0 = NULL; 864 } 865 } 866 867 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { 868 struct ifnet *srcifp = NULL; 869 struct in_multi *inm; 870 u_int32_t vif; 871 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; 872 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; 873 874 m->m_flags |= M_MCAST; 875 /* 876 * IP destination address is multicast. Make sure "dst" 877 * still points to the address in "ro". (It may have been 878 * changed to point to a gateway address, above.) 879 */ 880 dst = SIN(&ro->ro_dst); 881 /* 882 * See if the caller provided any multicast options 883 */ 884 if (imo != NULL) { 885 IMO_LOCK(imo); 886 vif = imo->imo_multicast_vif; 887 ttl = imo->imo_multicast_ttl; 888 loop = imo->imo_multicast_loop; 889 if (!(flags & IP_RAWOUTPUT)) 890 ip->ip_ttl = ttl; 891 if (imo->imo_multicast_ifp != NULL) 892 ifp = imo->imo_multicast_ifp; 893 IMO_UNLOCK(imo); 894 } else if (!(flags & IP_RAWOUTPUT)) { 895 vif = -1; 896 ip->ip_ttl = ttl; 897 } 898 /* 899 * Confirm that the outgoing interface supports multicast. 
900 */ 901 if (imo == NULL || vif == -1) { 902 if (!(ifp->if_flags & IFF_MULTICAST)) { 903 OSAddAtomic(1, &ipstat.ips_noroute); 904 error = ENETUNREACH; 905 goto bad; 906 } 907 } 908 /* 909 * If source address not specified yet, use address 910 * of outgoing interface. 911 */ 912 if (ip->ip_src.s_addr == INADDR_ANY) { 913 struct in_ifaddr *ia1; 914 lck_rw_lock_shared(in_ifaddr_rwlock); 915 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) { 916 IFA_LOCK_SPIN(&ia1->ia_ifa); 917 if (ia1->ia_ifp == ifp) { 918 ip->ip_src = IA_SIN(ia1)->sin_addr; 919 srcifp = ifp; 920 IFA_UNLOCK(&ia1->ia_ifa); 921 break; 922 } 923 IFA_UNLOCK(&ia1->ia_ifa); 924 } 925 lck_rw_done(in_ifaddr_rwlock); 926 if (ip->ip_src.s_addr == INADDR_ANY) { 927 error = ENETUNREACH; 928 goto bad; 929 } 930 } 931 932 in_multihead_lock_shared(); 933 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm); 934 in_multihead_lock_done(); 935 if (inm != NULL && (imo == NULL || loop)) { 936 /* 937 * If we belong to the destination multicast group 938 * on the outgoing interface, and the caller did not 939 * forbid loopback, loop back a copy. 940 */ 941 if (!TAILQ_EMPTY(&ipv4_filters)) { 942 struct ipfilter *filter; 943 int seen = (inject_filter_ref == NULL); 944 945 if (imo != NULL) { 946 ipf_pktopts.ippo_flags |= 947 IPPOF_MCAST_OPTS; 948 ipf_pktopts.ippo_mcast_ifnet = ifp; 949 ipf_pktopts.ippo_mcast_ttl = ttl; 950 ipf_pktopts.ippo_mcast_loop = loop; 951 } 952 953 ipf_ref(); 954 955 /* 956 * 4135317 - always pass network byte 957 * order to filter 958 */ 959#if BYTE_ORDER != BIG_ENDIAN 960 HTONS(ip->ip_len); 961 HTONS(ip->ip_off); 962#endif 963 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { 964 if (seen == 0) { 965 if ((struct ipfilter *) 966 inject_filter_ref == filter) 967 seen = 1; 968 } else if (filter->ipf_filter. 969 ipf_output != NULL) { 970 errno_t result; 971 result = filter->ipf_filter. 
972 ipf_output(filter-> 973 ipf_filter.cookie, 974 (mbuf_t *)&m, ippo); 975 if (result == EJUSTRETURN) { 976 ipf_unref(); 977 INM_REMREF(inm); 978 goto done; 979 } 980 if (result != 0) { 981 ipf_unref(); 982 INM_REMREF(inm); 983 goto bad; 984 } 985 } 986 } 987 988 /* set back to host byte order */ 989 ip = mtod(m, struct ip *); 990#if BYTE_ORDER != BIG_ENDIAN 991 NTOHS(ip->ip_len); 992 NTOHS(ip->ip_off); 993#endif 994 ipf_unref(); 995 ipobf.didfilter = TRUE; 996 } 997 ip_mloopback(srcifp, ifp, m, dst, hlen); 998 } 999 if (inm != NULL) 1000 INM_REMREF(inm); 1001 /* 1002 * Multicasts with a time-to-live of zero may be looped- 1003 * back, above, but must not be transmitted on a network. 1004 * Also, multicasts addressed to the loopback interface 1005 * are not sent -- the above call to ip_mloopback() will 1006 * loop back a copy if this host actually belongs to the 1007 * destination group on the loopback interface. 1008 */ 1009 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 1010 m_freem(m); 1011 goto done; 1012 } 1013 1014 goto sendit; 1015 } 1016 /* 1017 * If source address not specified yet, use address 1018 * of outgoing interface. 1019 */ 1020 if (ip->ip_src.s_addr == INADDR_ANY) { 1021 IFA_LOCK_SPIN(&ia->ia_ifa); 1022 ip->ip_src = IA_SIN(ia)->sin_addr; 1023 IFA_UNLOCK(&ia->ia_ifa); 1024#if IPFIREWALL_FORWARD 1025 /* 1026 * Keep note that we did this - if the firewall changes 1027 * the next-hop, our interface may change, changing the 1028 * default source IP. It's a shame so much effort happens 1029 * twice. Oh well. 1030 */ 1031 ipobf.fwd_rewrite_src = TRUE; 1032#endif /* IPFIREWALL_FORWARD */ 1033 } 1034 1035 /* 1036 * Look for broadcast address and 1037 * and verify user is allowed to send 1038 * such a packet. 
1039 */ 1040 if (ipobf.isbroadcast) { 1041 if (!(ifp->if_flags & IFF_BROADCAST)) { 1042 error = EADDRNOTAVAIL; 1043 goto bad; 1044 } 1045 if (!(flags & IP_ALLOWBROADCAST)) { 1046 error = EACCES; 1047 goto bad; 1048 } 1049 /* don't allow broadcast messages to be fragmented */ 1050 if ((u_short)ip->ip_len > ifp->if_mtu) { 1051 error = EMSGSIZE; 1052 goto bad; 1053 } 1054 m->m_flags |= M_BCAST; 1055 } else { 1056 m->m_flags &= ~M_BCAST; 1057 } 1058 1059sendit: 1060#if PF 1061 /* Invoke outbound packet filter */ 1062 if (PF_IS_ENABLED) { 1063 int rc; 1064 1065 m0 = m; /* Save for later */ 1066#if DUMMYNET 1067 args.fwa_m = m; 1068 args.fwa_next_hop = dst; 1069 args.fwa_oif = ifp; 1070 args.fwa_ro = ro; 1071 args.fwa_dst = dst; 1072 args.fwa_oflags = flags; 1073 if (flags & IP_OUTARGS) 1074 args.fwa_ipoa = ipoa; 1075 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args); 1076#else /* DUMMYNET */ 1077 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL); 1078#endif /* DUMMYNET */ 1079 if (rc != 0 || m == NULL) { 1080 /* Move to the next packet */ 1081 m = *mppn; 1082 1083 /* Skip ahead if first packet in list got dropped */ 1084 if (packetlist == m0) 1085 packetlist = m; 1086 1087 if (m != NULL) { 1088 m0 = m; 1089 /* Next packet in the chain */ 1090 goto loopit; 1091 } else if (packetlist != NULL) { 1092 /* No more packet; send down the chain */ 1093 goto sendchain; 1094 } 1095 /* Nothing left; we're done */ 1096 goto done; 1097 } 1098 m0 = m; 1099 ip = mtod(m, struct ip *); 1100 pkt_dst = ip->ip_dst; 1101 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 1102 } 1103#endif /* PF */ 1104 /* 1105 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt 1106 */ 1107 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || 1108 IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { 1109 ip_linklocal_stat.iplls_out_total++; 1110 if (ip->ip_ttl != MAXTTL) { 1111 ip_linklocal_stat.iplls_out_badttl++; 1112 ip->ip_ttl = MAXTTL; 1113 } 1114 } 1115 1116 if (!ipobf.didfilter && !TAILQ_EMPTY(&ipv4_filters)) 
{ 1117 struct ipfilter *filter; 1118 int seen = (inject_filter_ref == NULL); 1119 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; 1120 1121 /* 1122 * Check that a TSO frame isn't passed to a filter. 1123 * This could happen if a filter is inserted while 1124 * TCP is sending the TSO packet. 1125 */ 1126 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { 1127 error = EMSGSIZE; 1128 goto bad; 1129 } 1130 1131 ipf_ref(); 1132 1133 /* 4135317 - always pass network byte order to filter */ 1134#if BYTE_ORDER != BIG_ENDIAN 1135 HTONS(ip->ip_len); 1136 HTONS(ip->ip_off); 1137#endif 1138 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { 1139 if (seen == 0) { 1140 if ((struct ipfilter *)inject_filter_ref == 1141 filter) 1142 seen = 1; 1143 } else if (filter->ipf_filter.ipf_output) { 1144 errno_t result; 1145 result = filter->ipf_filter. 1146 ipf_output(filter->ipf_filter.cookie, 1147 (mbuf_t *)&m, ippo); 1148 if (result == EJUSTRETURN) { 1149 ipf_unref(); 1150 goto done; 1151 } 1152 if (result != 0) { 1153 ipf_unref(); 1154 goto bad; 1155 } 1156 } 1157 } 1158 /* set back to host byte order */ 1159 ip = mtod(m, struct ip *); 1160#if BYTE_ORDER != BIG_ENDIAN 1161 NTOHS(ip->ip_len); 1162 NTOHS(ip->ip_off); 1163#endif 1164 ipf_unref(); 1165 } 1166 1167#if NECP 1168 /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */ 1169 necp_matched_policy_id = necp_ip_output_find_policy_match (m, 1170 flags, (flags & IP_OUTARGS) ? 
ipoa : NULL, &necp_result, &necp_result_parameter); 1171 if (necp_matched_policy_id) { 1172 necp_mark_packet_from_ip(m, necp_matched_policy_id); 1173 switch (necp_result) { 1174 case NECP_KERNEL_POLICY_RESULT_PASS: 1175 goto skip_ipsec; 1176 case NECP_KERNEL_POLICY_RESULT_DROP: 1177 case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT: 1178 /* Flow divert packets should be blocked at the IP layer */ 1179 error = EHOSTUNREACH; 1180 goto bad; 1181 case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: { 1182 /* Verify that the packet is being routed to the tunnel */ 1183 struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter); 1184 if (policy_ifp == ifp) { 1185 goto skip_ipsec; 1186 } else { 1187 if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) { 1188 /* Set ifp to the tunnel interface, since it is compatible with the packet */ 1189 ifp = policy_ifp; 1190 ro = &necp_route; 1191 goto skip_ipsec; 1192 } else { 1193 error = ENETUNREACH; 1194 goto bad; 1195 } 1196 } 1197 break; 1198 } 1199 default: 1200 break; 1201 } 1202 } 1203#endif /* NECP */ 1204 1205#if IPSEC 1206 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) 1207 goto skip_ipsec; 1208 1209 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); 1210 1211 if (sp == NULL) { 1212 /* get SP for this packet */ 1213 if (so != NULL) { 1214 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, 1215 so, &error); 1216 } else { 1217 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 1218 flags, &error); 1219 } 1220 if (sp == NULL) { 1221 IPSEC_STAT_INCREMENT(ipsecstat.out_inval); 1222 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1223 0, 0, 0, 0, 0); 1224 goto bad; 1225 } 1226 } 1227 1228 error = 0; 1229 1230 /* check policy */ 1231 switch (sp->policy) { 1232 case IPSEC_POLICY_DISCARD: 1233 case IPSEC_POLICY_GENERATE: 1234 /* 1235 * This packet is just discarded. 
1236 */ 1237 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio); 1238 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1239 1, 0, 0, 0, 0); 1240 goto bad; 1241 1242 case IPSEC_POLICY_BYPASS: 1243 case IPSEC_POLICY_NONE: 1244 /* no need to do IPsec. */ 1245 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1246 2, 0, 0, 0, 0); 1247 goto skip_ipsec; 1248 1249 case IPSEC_POLICY_IPSEC: 1250 if (sp->req == NULL) { 1251 /* acquire a policy */ 1252 error = key_spdacquire(sp); 1253 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1254 3, 0, 0, 0, 0); 1255 goto bad; 1256 } 1257 if (sp->ipsec_if) { 1258 /* Verify the redirect to ipsec interface */ 1259 if (sp->ipsec_if == ifp) { 1260 goto skip_ipsec; 1261 } 1262 goto bad; 1263 } 1264 break; 1265 1266 case IPSEC_POLICY_ENTRUST: 1267 default: 1268 printf("ip_output: Invalid policy found. %d\n", sp->policy); 1269 } 1270 { 1271 ipsec_state.m = m; 1272 if (flags & IP_ROUTETOIF) { 1273 bzero(&ipsec_state.ro, sizeof (ipsec_state.ro)); 1274 } else { 1275 route_copyout(&ipsec_state.ro, ro, sizeof (ipsec_state.ro)); 1276 } 1277 ipsec_state.dst = SA(dst); 1278 1279 ip->ip_sum = 0; 1280 1281 /* 1282 * XXX 1283 * delayed checksums are not currently compatible with IPsec 1284 */ 1285 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) 1286 in_delayed_cksum(m); 1287 1288#if BYTE_ORDER != BIG_ENDIAN 1289 HTONS(ip->ip_len); 1290 HTONS(ip->ip_off); 1291#endif 1292 1293 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, 1294 struct ip *, ip, struct ifnet *, ifp, 1295 struct ip *, ip, struct ip6_hdr *, NULL); 1296 1297 error = ipsec4_output(&ipsec_state, sp, flags); 1298 1299 m0 = m = ipsec_state.m; 1300 1301#if DUMMYNET 1302 /* 1303 * If we're about to use the route in ipsec_state 1304 * and this came from dummynet, cleaup now. 
1305 */ 1306 if (ro == &saved_route && 1307 (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) 1308 ROUTE_RELEASE(ro); 1309#endif /* DUMMYNET */ 1310 1311 if (flags & IP_ROUTETOIF) { 1312 /* 1313 * if we have tunnel mode SA, we may need to ignore 1314 * IP_ROUTETOIF. 1315 */ 1316 if (ipsec_state.tunneled) { 1317 flags &= ~IP_ROUTETOIF; 1318 ro = &ipsec_state.ro; 1319 } 1320 } else { 1321 ro = &ipsec_state.ro; 1322 } 1323 dst = SIN(ipsec_state.dst); 1324 if (error) { 1325 /* mbuf is already reclaimed in ipsec4_output. */ 1326 m0 = NULL; 1327 switch (error) { 1328 case EHOSTUNREACH: 1329 case ENETUNREACH: 1330 case EMSGSIZE: 1331 case ENOBUFS: 1332 case ENOMEM: 1333 break; 1334 default: 1335 printf("ip4_output (ipsec): error code %d\n", error); 1336 /* FALLTHRU */ 1337 case ENOENT: 1338 /* don't show these error codes to the user */ 1339 error = 0; 1340 break; 1341 } 1342 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1343 4, 0, 0, 0, 0); 1344 goto bad; 1345 } 1346 } 1347 1348 /* be sure to update variables that are affected by ipsec4_output() */ 1349 ip = mtod(m, struct ip *); 1350 1351#ifdef _IP_VHL 1352 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 1353#else /* !_IP_VHL */ 1354 hlen = ip->ip_hl << 2; 1355#endif /* !_IP_VHL */ 1356 /* Check that there wasn't a route change and src is still valid */ 1357 if (ROUTE_UNUSABLE(ro)) { 1358 ROUTE_RELEASE(ro); 1359 VERIFY(src_ia == NULL); 1360 if (ip->ip_src.s_addr != INADDR_ANY && 1361 !(flags & (IP_ROUTETOIF | IP_FORWARDING)) && 1362 (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) { 1363 error = EADDRNOTAVAIL; 1364 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1365 5, 0, 0, 0, 0); 1366 goto bad; 1367 } 1368 if (src_ia != NULL) { 1369 IFA_REMREF(&src_ia->ia_ifa); 1370 src_ia = NULL; 1371 } 1372 } 1373 1374 if (ro->ro_rt == NULL) { 1375 if (!(flags & IP_ROUTETOIF)) { 1376 printf("%s: can't update route after " 1377 "IPsec processing\n", __func__); 1378 error = EHOSTUNREACH; /* XXX */ 1379 
KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1380 6, 0, 0, 0, 0); 1381 goto bad; 1382 } 1383 } else { 1384 if (ia != NULL) 1385 IFA_REMREF(&ia->ia_ifa); 1386 RT_LOCK_SPIN(ro->ro_rt); 1387 ia = ifatoia(ro->ro_rt->rt_ifa); 1388 if (ia != NULL) { 1389 /* Become a regular mutex */ 1390 RT_CONVERT_LOCK(ro->ro_rt); 1391 IFA_ADDREF(&ia->ia_ifa); 1392 } 1393 ifp = ro->ro_rt->rt_ifp; 1394 RT_UNLOCK(ro->ro_rt); 1395 } 1396 1397 /* make it flipped, again. */ 1398#if BYTE_ORDER != BIG_ENDIAN 1399 NTOHS(ip->ip_len); 1400 NTOHS(ip->ip_off); 1401#endif 1402 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1403 7, 0xff, 0xff, 0xff, 0xff); 1404 1405 /* Pass to filters again */ 1406 if (!TAILQ_EMPTY(&ipv4_filters)) { 1407 struct ipfilter *filter; 1408 1409 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; 1410 1411 /* 1412 * Check that a TSO frame isn't passed to a filter. 1413 * This could happen if a filter is inserted while 1414 * TCP is sending the TSO packet. 1415 */ 1416 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { 1417 error = EMSGSIZE; 1418 goto bad; 1419 } 1420 1421 ipf_ref(); 1422 1423 /* 4135317 - always pass network byte order to filter */ 1424#if BYTE_ORDER != BIG_ENDIAN 1425 HTONS(ip->ip_len); 1426 HTONS(ip->ip_off); 1427#endif 1428 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { 1429 if (filter->ipf_filter.ipf_output) { 1430 errno_t result; 1431 result = filter->ipf_filter. 1432 ipf_output(filter->ipf_filter.cookie, 1433 (mbuf_t *)&m, ippo); 1434 if (result == EJUSTRETURN) { 1435 ipf_unref(); 1436 goto done; 1437 } 1438 if (result != 0) { 1439 ipf_unref(); 1440 goto bad; 1441 } 1442 } 1443 } 1444 /* set back to host byte order */ 1445 ip = mtod(m, struct ip *); 1446#if BYTE_ORDER != BIG_ENDIAN 1447 NTOHS(ip->ip_len); 1448 NTOHS(ip->ip_off); 1449#endif 1450 ipf_unref(); 1451 } 1452skip_ipsec: 1453#endif /* IPSEC */ 1454 1455#if IPFIREWALL 1456 /* 1457 * Check with the firewall... 1458 * but not if we are already being fwd'd from a firewall. 
1459 */ 1460 if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) { 1461 struct sockaddr_in *old = dst; 1462 1463 args.fwa_m = m; 1464 args.fwa_next_hop = dst; 1465 args.fwa_oif = ifp; 1466 ipfwoff = ip_fw_chk_ptr(&args); 1467 m = args.fwa_m; 1468 dst = args.fwa_next_hop; 1469 1470 /* 1471 * On return we must do the following: 1472 * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new) 1473 * 1<=off<= 0xffff -> DIVERT 1474 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe 1475 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet 1476 * dst != old -> IPFIREWALL_FORWARD 1477 * off==0, dst==old -> accept 1478 * If some of the above modules is not compiled in, then 1479 * we should't have to check the corresponding condition 1480 * (because the ipfw control socket should not accept 1481 * unsupported rules), but better play safe and drop 1482 * packets in case of doubt. 1483 */ 1484 m0 = m; 1485 if ((ipfwoff & IP_FW_PORT_DENY_FLAG) || m == NULL) { 1486 if (m) 1487 m_freem(m); 1488 error = EACCES; 1489 goto done; 1490 } 1491 ip = mtod(m, struct ip *); 1492 1493 if (ipfwoff == 0 && dst == old) { /* common case */ 1494 goto pass; 1495 } 1496#if DUMMYNET 1497 if (DUMMYNET_LOADED && (ipfwoff & IP_FW_PORT_DYNT_FLAG) != 0) { 1498 /* 1499 * pass the pkt to dummynet. Need to include 1500 * pipe number, m, ifp, ro, dst because these are 1501 * not recomputed in the next pass. 1502 * All other parameters have been already used and 1503 * so they are not needed anymore. 1504 * XXX note: if the ifp or ro entry are deleted 1505 * while a pkt is in dummynet, we are in trouble! 
1506 */ 1507 args.fwa_ro = ro; 1508 args.fwa_dst = dst; 1509 args.fwa_oflags = flags; 1510 if (flags & IP_OUTARGS) 1511 args.fwa_ipoa = ipoa; 1512 1513 error = ip_dn_io_ptr(m, ipfwoff & 0xffff, DN_TO_IP_OUT, 1514 &args, DN_CLIENT_IPFW); 1515 goto done; 1516 } 1517#endif /* DUMMYNET */ 1518#if IPDIVERT 1519 if (ipfwoff != 0 && (ipfwoff & IP_FW_PORT_DYNT_FLAG) == 0) { 1520 struct mbuf *clone = NULL; 1521 1522 /* Clone packet if we're doing a 'tee' */ 1523 if ((ipfwoff & IP_FW_PORT_TEE_FLAG) != 0) 1524 clone = m_dup(m, M_DONTWAIT); 1525 /* 1526 * XXX 1527 * delayed checksums are not currently compatible 1528 * with divert sockets. 1529 */ 1530 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) 1531 in_delayed_cksum(m); 1532 1533 /* Restore packet header fields to original values */ 1534 1535#if BYTE_ORDER != BIG_ENDIAN 1536 HTONS(ip->ip_len); 1537 HTONS(ip->ip_off); 1538#endif 1539 1540 /* Deliver packet to divert input routine */ 1541 divert_packet(m, 0, ipfwoff & 0xffff, 1542 args.fwa_divert_rule); 1543 1544 /* If 'tee', continue with original packet */ 1545 if (clone != NULL) { 1546 m0 = m = clone; 1547 ip = mtod(m, struct ip *); 1548 goto pass; 1549 } 1550 goto done; 1551 } 1552#endif /* IPDIVERT */ 1553#if IPFIREWALL_FORWARD 1554 /* 1555 * Here we check dst to make sure it's directly reachable on 1556 * the interface we previously thought it was. 1557 * If it isn't (which may be likely in some situations) we have 1558 * to re-route it (ie, find a route for the next-hop and the 1559 * associated interface) and set them here. This is nested 1560 * forwarding which in most cases is undesirable, except where 1561 * such control is nigh impossible. So we do it here. 1562 * And I'm babbling. 
1563 */ 1564 if (ipfwoff == 0 && old != dst) { 1565 struct in_ifaddr *ia_fw; 1566 struct route *ro_fwd = &sro_fwd; 1567 1568#if IPFIREWALL_FORWARD_DEBUG 1569 printf("IPFIREWALL_FORWARD: New dst ip: "); 1570 print_ip(dst->sin_addr); 1571 printf("\n"); 1572#endif /* IPFIREWALL_FORWARD_DEBUG */ 1573 /* 1574 * We need to figure out if we have been forwarded 1575 * to a local socket. If so then we should somehow 1576 * "loop back" to ip_input, and get directed to the 1577 * PCB as if we had received this packet. This is 1578 * because it may be dificult to identify the packets 1579 * you want to forward until they are being output 1580 * and have selected an interface. (e.g. locally 1581 * initiated packets) If we used the loopback inteface, 1582 * we would not be able to control what happens 1583 * as the packet runs through ip_input() as 1584 * it is done through a ISR. 1585 */ 1586 lck_rw_lock_shared(in_ifaddr_rwlock); 1587 TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) { 1588 /* 1589 * If the addr to forward to is one 1590 * of ours, we pretend to 1591 * be the destination for this packet. 
1592 */ 1593 IFA_LOCK_SPIN(&ia_fw->ia_ifa); 1594 if (IA_SIN(ia_fw)->sin_addr.s_addr == 1595 dst->sin_addr.s_addr) { 1596 IFA_UNLOCK(&ia_fw->ia_ifa); 1597 break; 1598 } 1599 IFA_UNLOCK(&ia_fw->ia_ifa); 1600 } 1601 lck_rw_done(in_ifaddr_rwlock); 1602 if (ia_fw) { 1603 /* tell ip_input "dont filter" */ 1604 struct m_tag *fwd_tag; 1605 struct ip_fwd_tag *ipfwd_tag; 1606 1607 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, 1608 KERNEL_TAG_TYPE_IPFORWARD, 1609 sizeof (*ipfwd_tag), M_NOWAIT, m); 1610 if (fwd_tag == NULL) { 1611 error = ENOBUFS; 1612 goto bad; 1613 } 1614 1615 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); 1616 ipfwd_tag->next_hop = args.fwa_next_hop; 1617 1618 m_tag_prepend(m, fwd_tag); 1619 1620 if (m->m_pkthdr.rcvif == NULL) 1621 m->m_pkthdr.rcvif = lo_ifp; 1622 1623#if BYTE_ORDER != BIG_ENDIAN 1624 HTONS(ip->ip_len); 1625 HTONS(ip->ip_off); 1626#endif 1627 mbuf_outbound_finalize(m, PF_INET, 0); 1628 1629 /* 1630 * we need to call dlil_output to run filters 1631 * and resync to avoid recursion loops. 1632 */ 1633 if (lo_ifp) { 1634 dlil_output(lo_ifp, PF_INET, m, NULL, 1635 SA(dst), 0, adv); 1636 } else { 1637 printf("%s: no loopback ifp for " 1638 "forwarding!!!\n", __func__); 1639 } 1640 goto done; 1641 } 1642 /* 1643 * Some of the logic for this was nicked from above. 1644 * 1645 * This rewrites the cached route in a local PCB. 1646 * Is this what we want to do? 
1647 */ 1648 ROUTE_RELEASE(ro_fwd); 1649 bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst)); 1650 1651 rtalloc_ign(ro_fwd, RTF_PRCLONING); 1652 1653 if (ro_fwd->ro_rt == NULL) { 1654 OSAddAtomic(1, &ipstat.ips_noroute); 1655 error = EHOSTUNREACH; 1656 goto bad; 1657 } 1658 1659 RT_LOCK_SPIN(ro_fwd->ro_rt); 1660 ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); 1661 if (ia_fw != NULL) { 1662 /* Become a regular mutex */ 1663 RT_CONVERT_LOCK(ro_fwd->ro_rt); 1664 IFA_ADDREF(&ia_fw->ia_ifa); 1665 } 1666 ifp = ro_fwd->ro_rt->rt_ifp; 1667 ro_fwd->ro_rt->rt_use++; 1668 if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) 1669 dst = SIN(ro_fwd->ro_rt->rt_gateway); 1670 if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { 1671 /* double negation needed for bool bit field */ 1672 ipobf.isbroadcast = 1673 !!(ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); 1674 } else { 1675 /* Become a regular mutex */ 1676 RT_CONVERT_LOCK(ro_fwd->ro_rt); 1677 ipobf.isbroadcast = 1678 in_broadcast(dst->sin_addr, ifp); 1679 } 1680 RT_UNLOCK(ro_fwd->ro_rt); 1681 ROUTE_RELEASE(ro); 1682 ro->ro_rt = ro_fwd->ro_rt; 1683 ro_fwd->ro_rt = NULL; 1684 dst = SIN(&ro_fwd->ro_dst); 1685 1686 /* 1687 * If we added a default src ip earlier, 1688 * which would have been gotten from the-then 1689 * interface, do it again, from the new one. 
1690 */ 1691 if (ia_fw != NULL) { 1692 if (ipobf.fwd_rewrite_src) { 1693 IFA_LOCK_SPIN(&ia_fw->ia_ifa); 1694 ip->ip_src = IA_SIN(ia_fw)->sin_addr; 1695 IFA_UNLOCK(&ia_fw->ia_ifa); 1696 } 1697 IFA_REMREF(&ia_fw->ia_ifa); 1698 } 1699 goto pass; 1700 } 1701#endif /* IPFIREWALL_FORWARD */ 1702 /* 1703 * if we get here, none of the above matches, and 1704 * we have to drop the pkt 1705 */ 1706 m_freem(m); 1707 error = EACCES; /* not sure this is the right error msg */ 1708 goto done; 1709 } 1710 1711pass: 1712#endif /* IPFIREWALL */ 1713 1714 /* 127/8 must not appear on wire - RFC1122 */ 1715 if (!(ifp->if_flags & IFF_LOOPBACK) && 1716 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 1717 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { 1718 OSAddAtomic(1, &ipstat.ips_badaddr); 1719 error = EADDRNOTAVAIL; 1720 goto bad; 1721 } 1722 1723 ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2), 1724 ip->ip_len, &sw_csum); 1725 1726 /* 1727 * If small enough for interface, or the interface will take 1728 * care of the fragmentation for us, can just send directly. 
1729 */ 1730 if ((u_short)ip->ip_len <= ifp->if_mtu || TSO_IPV4_OK(ifp, m) || 1731 (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) { 1732#if BYTE_ORDER != BIG_ENDIAN 1733 HTONS(ip->ip_len); 1734 HTONS(ip->ip_off); 1735#endif 1736 1737 ip->ip_sum = 0; 1738 if (sw_csum & CSUM_DELAY_IP) { 1739 ip->ip_sum = ip_cksum_hdr_out(m, hlen); 1740 sw_csum &= ~CSUM_DELAY_IP; 1741 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 1742 } 1743 1744#if IPSEC 1745 /* clean ipsec history once it goes out of the node */ 1746 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) 1747 ipsec_delaux(m); 1748#endif /* IPSEC */ 1749 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) && 1750 (m->m_pkthdr.tso_segsz > 0)) 1751 scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 1752 else 1753 scnt++; 1754 1755 if (packetchain == 0) { 1756 if (ro->ro_rt != NULL && nstat_collect) 1757 nstat_route_tx(ro->ro_rt, scnt, 1758 m->m_pkthdr.len, 0); 1759 1760 error = dlil_output(ifp, PF_INET, m, ro->ro_rt, 1761 SA(dst), 0, adv); 1762 if (dlil_verbose && error) { 1763 printf("dlil_output error on interface %s: %d\n", 1764 ifp->if_xname, error); 1765 } 1766 scnt = 0; 1767 goto done; 1768 } else { 1769 /* 1770 * packet chaining allows us to reuse the 1771 * route for all packets 1772 */ 1773 bytecnt += m->m_pkthdr.len; 1774 mppn = &m->m_nextpkt; 1775 m = m->m_nextpkt; 1776 if (m == NULL) { 1777#if PF 1778sendchain: 1779#endif /* PF */ 1780 if (pktcnt > ip_maxchainsent) 1781 ip_maxchainsent = pktcnt; 1782 if (ro->ro_rt != NULL && nstat_collect) 1783 nstat_route_tx(ro->ro_rt, scnt, 1784 bytecnt, 0); 1785 1786 error = dlil_output(ifp, PF_INET, packetlist, 1787 ro->ro_rt, SA(dst), 0, adv); 1788 if (dlil_verbose && error) { 1789 printf("dlil_output error on interface %s: %d\n", 1790 ifp->if_xname, error); 1791 } 1792 pktcnt = 0; 1793 scnt = 0; 1794 bytecnt = 0; 1795 goto done; 1796 1797 } 1798 m0 = m; 1799 pktcnt++; 1800 goto loopit; 1801 } 1802 } 1803 /* 1804 * Too large for interface; fragment if possible. 
1805 * Must be able to put at least 8 bytes per fragment. 1806 * Balk when DF bit is set or the interface didn't support TSO. 1807 */ 1808 if ((ip->ip_off & IP_DF) || pktcnt > 0 || 1809 (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) { 1810 error = EMSGSIZE; 1811 /* 1812 * This case can happen if the user changed the MTU 1813 * of an interface after enabling IP on it. Because 1814 * most netifs don't keep track of routes pointing to 1815 * them, there is no way for one to update all its 1816 * routes when the MTU is changed. 1817 */ 1818 if (ro->ro_rt) { 1819 RT_LOCK_SPIN(ro->ro_rt); 1820 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && 1821 !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && 1822 (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { 1823 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 1824 } 1825 RT_UNLOCK(ro->ro_rt); 1826 } 1827 if (pktcnt > 0) { 1828 m0 = packetlist; 1829 } 1830 OSAddAtomic(1, &ipstat.ips_cantfrag); 1831 goto bad; 1832 } 1833 1834 error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum); 1835 if (error != 0) { 1836 m0 = m = NULL; 1837 goto bad; 1838 } 1839 1840 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, 1841 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); 1842 1843 for (m = m0; m; m = m0) { 1844 m0 = m->m_nextpkt; 1845 m->m_nextpkt = 0; 1846#if IPSEC 1847 /* clean ipsec history once it goes out of the node */ 1848 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) 1849 ipsec_delaux(m); 1850#endif /* IPSEC */ 1851 if (error == 0) { 1852 if ((packetchain != 0) && (pktcnt > 0)) { 1853 panic("%s: mix of packet in packetlist is " 1854 "wrong=%p", __func__, packetlist); 1855 /* NOTREACHED */ 1856 } 1857 if (ro->ro_rt != NULL && nstat_collect) { 1858 nstat_route_tx(ro->ro_rt, 1, 1859 m->m_pkthdr.len, 0); 1860 } 1861 error = dlil_output(ifp, PF_INET, m, ro->ro_rt, 1862 SA(dst), 0, adv); 1863 if (dlil_verbose && error) { 1864 printf("dlil_output error on interface %s: %d\n", 1865 ifp->if_xname, error); 1866 } 1867 } else { 1868 m_freem(m); 1869 } 1870 } 1871 1872 
if (error == 0) 1873 OSAddAtomic(1, &ipstat.ips_fragmented); 1874 1875done: 1876 if (ia != NULL) { 1877 IFA_REMREF(&ia->ia_ifa); 1878 ia = NULL; 1879 } 1880#if IPSEC 1881 ROUTE_RELEASE(&ipsec_state.ro); 1882 if (sp != NULL) { 1883 KEYDEBUG(KEYDEBUG_IPSEC_STAMP, 1884 printf("DP ip_output call free SP:%x\n", sp)); 1885 key_freesp(sp, KEY_SADB_UNLOCKED); 1886 } 1887#endif /* IPSEC */ 1888#if NECP 1889 ROUTE_RELEASE(&necp_route); 1890#endif /* NECP */ 1891#if DUMMYNET 1892 ROUTE_RELEASE(&saved_route); 1893#endif /* DUMMYNET */ 1894#if IPFIREWALL_FORWARD 1895 ROUTE_RELEASE(&sro_fwd); 1896#endif /* IPFIREWALL_FORWARD */ 1897 1898 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0); 1899 return (error); 1900bad: 1901 if (pktcnt > 0) 1902 m0 = packetlist; 1903 m_freem_list(m0); 1904 goto done; 1905 1906#undef ipsec_state 1907#undef args 1908#undef sro_fwd 1909#undef saved_route 1910#undef ipf_pktopts 1911#undef IP_CHECK_RESTRICTIONS 1912} 1913 1914int 1915ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) 1916{ 1917 struct ip *ip, *mhip; 1918 int len, hlen, mhlen, firstlen, off, error = 0; 1919 struct mbuf **mnext = &m->m_nextpkt, *m0; 1920 int nfrags = 1; 1921 1922 ip = mtod(m, struct ip *); 1923#ifdef _IP_VHL 1924 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 1925#else /* !_IP_VHL */ 1926 hlen = ip->ip_hl << 2; 1927#endif /* !_IP_VHL */ 1928 1929 firstlen = len = (mtu - hlen) &~ 7; 1930 if (len < 8) { 1931 m_freem(m); 1932 return (EMSGSIZE); 1933 } 1934 1935 /* 1936 * if the interface will not calculate checksums on 1937 * fragmented packets, then do it here. 1938 */ 1939 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) && 1940 !(ifp->if_hwassist & CSUM_IP_FRAGS)) 1941 in_delayed_cksum(m); 1942 1943 /* 1944 * Loop through length of segment after first fragment, 1945 * make new header and copy data of each part and link onto chain. 
1946 */ 1947 m0 = m; 1948 mhlen = sizeof (struct ip); 1949 for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { 1950 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ 1951 if (m == NULL) { 1952 error = ENOBUFS; 1953 OSAddAtomic(1, &ipstat.ips_odropped); 1954 goto sendorfree; 1955 } 1956 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 1957 m->m_data += max_linkhdr; 1958 mhip = mtod(m, struct ip *); 1959 *mhip = *ip; 1960 if (hlen > sizeof (struct ip)) { 1961 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 1962 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); 1963 } 1964 m->m_len = mhlen; 1965 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF); 1966 if (ip->ip_off & IP_MF) 1967 mhip->ip_off |= IP_MF; 1968 if (off + len >= (u_short)ip->ip_len) 1969 len = (u_short)ip->ip_len - off; 1970 else 1971 mhip->ip_off |= IP_MF; 1972 mhip->ip_len = htons((u_short)(len + mhlen)); 1973 m->m_next = m_copy(m0, off, len); 1974 if (m->m_next == NULL) { 1975 (void) m_free(m); 1976 error = ENOBUFS; /* ??? 
*/ 1977 OSAddAtomic(1, &ipstat.ips_odropped); 1978 goto sendorfree; 1979 } 1980 m->m_pkthdr.len = mhlen + len; 1981 m->m_pkthdr.rcvif = NULL; 1982 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 1983 1984 M_COPY_CLASSIFIER(m, m0); 1985 M_COPY_PFTAG(m, m0); 1986 1987#if CONFIG_MACF_NET 1988 mac_netinet_fragment(m0, m); 1989#endif /* CONFIG_MACF_NET */ 1990 1991#if BYTE_ORDER != BIG_ENDIAN 1992 HTONS(mhip->ip_off); 1993#endif 1994 1995 mhip->ip_sum = 0; 1996 if (sw_csum & CSUM_DELAY_IP) { 1997 mhip->ip_sum = ip_cksum_hdr_out(m, mhlen); 1998 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 1999 } 2000 *mnext = m; 2001 mnext = &m->m_nextpkt; 2002 nfrags++; 2003 } 2004 OSAddAtomic(nfrags, &ipstat.ips_ofragments); 2005 2006 /* set first/last markers for fragment chain */ 2007 m->m_flags |= M_LASTFRAG; 2008 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 2009 m0->m_pkthdr.csum_data = nfrags; 2010 2011 /* 2012 * Update first fragment by trimming what's been copied out 2013 * and updating header, then send each fragment (in order). 2014 */ 2015 m = m0; 2016 m_adj(m, hlen + firstlen - (u_short)ip->ip_len); 2017 m->m_pkthdr.len = hlen + firstlen; 2018 ip->ip_len = htons((u_short)m->m_pkthdr.len); 2019 ip->ip_off |= IP_MF; 2020 2021#if BYTE_ORDER != BIG_ENDIAN 2022 HTONS(ip->ip_off); 2023#endif 2024 2025 ip->ip_sum = 0; 2026 if (sw_csum & CSUM_DELAY_IP) { 2027 ip->ip_sum = ip_cksum_hdr_out(m, hlen); 2028 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 2029 } 2030sendorfree: 2031 if (error) 2032 m_freem_list(m0); 2033 2034 return (error); 2035} 2036 2037static void 2038ip_out_cksum_stats(int proto, u_int32_t len) 2039{ 2040 switch (proto) { 2041 case IPPROTO_TCP: 2042 tcp_out_cksum_stats(len); 2043 break; 2044 case IPPROTO_UDP: 2045 udp_out_cksum_stats(len); 2046 break; 2047 default: 2048 /* keep only TCP or UDP stats for now */ 2049 break; 2050 } 2051} 2052 2053/* 2054 * Process a delayed payload checksum calculation (outbound path.) 
2055 * 2056 * hoff is the number of bytes beyond the mbuf data pointer which 2057 * points to the IP header. 2058 * 2059 * Returns a bitmask representing all the work done in software. 2060 */ 2061uint32_t 2062in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags) 2063{ 2064 unsigned char buf[15 << 2] __attribute__((aligned(8))); 2065 struct ip *ip; 2066 uint32_t offset, _hlen, mlen, hlen, len, sw_csum; 2067 uint16_t csum, ip_len; 2068 2069 _CASSERT(sizeof (csum) == sizeof (uint16_t)); 2070 VERIFY(m->m_flags & M_PKTHDR); 2071 2072 sw_csum = (csum_flags & m->m_pkthdr.csum_flags); 2073 2074 if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) 2075 goto done; 2076 2077 mlen = m->m_pkthdr.len; /* total mbuf len */ 2078 2079 /* sanity check (need at least simple IP header) */ 2080 if (mlen < (hoff + sizeof (*ip))) { 2081 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr " 2082 "(%u+%u)\n", __func__, m, mlen, hoff, 2083 (uint32_t)sizeof (*ip)); 2084 /* NOTREACHED */ 2085 } 2086 2087 /* 2088 * In case the IP header is not contiguous, or not 32-bit aligned, 2089 * or if we're computing the IP header checksum, copy it to a local 2090 * buffer. Copy only the simple IP header here (IP options case 2091 * is handled below.) 
2092 */ 2093 if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof (*ip)) > m->m_len || 2094 !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) { 2095 m_copydata(m, hoff, sizeof (*ip), (caddr_t)buf); 2096 ip = (struct ip *)(void *)buf; 2097 _hlen = sizeof (*ip); 2098 } else { 2099 ip = (struct ip *)(void *)(m->m_data + hoff); 2100 _hlen = 0; 2101 } 2102 2103 hlen = IP_VHL_HL(ip->ip_vhl) << 2; /* IP header len */ 2104 2105 /* sanity check */ 2106 if (mlen < (hoff + hlen)) { 2107 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), " 2108 "hoff %u", __func__, m, mlen, hlen, hoff); 2109 /* NOTREACHED */ 2110 } 2111 2112 /* 2113 * We could be in the context of an IP or interface filter; in the 2114 * former case, ip_len would be in host (correct) order while for 2115 * the latter it would be in network order. Because of this, we 2116 * attempt to interpret the length field by comparing it against 2117 * the actual packet length. If the comparison fails, byte swap 2118 * the length and check again. If it still fails, use the actual 2119 * packet length. This also covers the trailing bytes case. 2120 */ 2121 ip_len = ip->ip_len; 2122 if (ip_len != (mlen - hoff)) { 2123 ip_len = OSSwapInt16(ip_len); 2124 if (ip_len != (mlen - hoff)) { 2125 printf("%s: mbuf 0x%llx proto %d IP len %d (%x) " 2126 "[swapped %d (%x)] doesn't match actual packet " 2127 "length; %d is used instead\n", __func__, 2128 (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p, 2129 ip->ip_len, ip->ip_len, ip_len, ip_len, 2130 (mlen - hoff)); 2131 ip_len = mlen - hoff; 2132 } 2133 } 2134 2135 len = ip_len - hlen; /* csum span */ 2136 2137 if (sw_csum & CSUM_DELAY_DATA) { 2138 uint16_t ulpoff; 2139 2140 /* 2141 * offset is added to the lower 16-bit value of csum_data, 2142 * which is expected to contain the ULP offset; therefore 2143 * CSUM_PARTIAL offset adjustment must be undone. 
2144 */ 2145 if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL|CSUM_DATA_VALID)) == 2146 (CSUM_PARTIAL|CSUM_DATA_VALID)) { 2147 /* 2148 * Get back the original ULP offset (this will 2149 * undo the CSUM_PARTIAL logic in ip_output.) 2150 */ 2151 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff - 2152 m->m_pkthdr.csum_tx_start); 2153 } 2154 2155 ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */ 2156 offset = hoff + hlen; /* ULP header */ 2157 2158 if (mlen < (ulpoff + sizeof (csum))) { 2159 panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP " 2160 "cksum offset (%u) cksum flags 0x%x\n", __func__, 2161 m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags); 2162 /* NOTREACHED */ 2163 } 2164 2165 csum = inet_cksum(m, 0, offset, len); 2166 2167 /* Update stats */ 2168 ip_out_cksum_stats(ip->ip_p, len); 2169 2170 /* RFC1122 4.1.3.4 */ 2171 if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP)) 2172 csum = 0xffff; 2173 2174 /* Insert the checksum in the ULP csum field */ 2175 offset += ulpoff; 2176 if (offset + sizeof (csum) > m->m_len) { 2177 m_copyback(m, offset, sizeof (csum), &csum); 2178 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) { 2179 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum; 2180 } else { 2181 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); 2182 } 2183 m->m_pkthdr.csum_flags &= 2184 ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | CSUM_PARTIAL); 2185 } 2186 2187 if (sw_csum & CSUM_DELAY_IP) { 2188 /* IP header must be in the local buffer */ 2189 VERIFY(_hlen == sizeof (*ip)); 2190 if (_hlen != hlen) { 2191 VERIFY(hlen <= sizeof (buf)); 2192 m_copydata(m, hoff, hlen, (caddr_t)buf); 2193 ip = (struct ip *)(void *)buf; 2194 _hlen = hlen; 2195 } 2196 2197 /* 2198 * Compute the IP header checksum as if the IP length 2199 * is the length which we believe is "correct"; see 2200 * how ip_len gets calculated above. Note that this 2201 * is done on the local copy and not on the real one. 
2202 */ 2203 ip->ip_len = htons(ip_len); 2204 ip->ip_sum = 0; 2205 csum = in_cksum_hdr_opt(ip); 2206 2207 /* Update stats */ 2208 ipstat.ips_snd_swcsum++; 2209 ipstat.ips_snd_swcsum_bytes += hlen; 2210 2211 /* 2212 * Insert only the checksum in the existing IP header 2213 * csum field; all other fields are left unchanged. 2214 */ 2215 offset = hoff + offsetof(struct ip, ip_sum); 2216 if (offset + sizeof (csum) > m->m_len) { 2217 m_copyback(m, offset, sizeof (csum), &csum); 2218 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) { 2219 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum; 2220 } else { 2221 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); 2222 } 2223 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 2224 } 2225 2226done: 2227 return (sw_csum); 2228} 2229 2230/* 2231 * Insert IP options into preformed packet. 2232 * Adjust IP destination as required for IP source routing, 2233 * as indicated by a non-zero in_addr at the start of the options. 2234 * 2235 * XXX This routine assumes that the packet has no options in place. 
2236 */ 2237static struct mbuf * 2238ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 2239{ 2240 struct ipoption *p = mtod(opt, struct ipoption *); 2241 struct mbuf *n; 2242 struct ip *ip = mtod(m, struct ip *); 2243 unsigned optlen; 2244 2245 optlen = opt->m_len - sizeof (p->ipopt_dst); 2246 if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) 2247 return (m); /* XXX should fail */ 2248 if (p->ipopt_dst.s_addr) 2249 ip->ip_dst = p->ipopt_dst; 2250 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 2251 MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ 2252 if (n == NULL) 2253 return (m); 2254 n->m_pkthdr.rcvif = 0; 2255#if CONFIG_MACF_NET 2256 mac_mbuf_label_copy(m, n); 2257#endif /* CONFIG_MACF_NET */ 2258 n->m_pkthdr.len = m->m_pkthdr.len + optlen; 2259 m->m_len -= sizeof (struct ip); 2260 m->m_data += sizeof (struct ip); 2261 n->m_next = m; 2262 m = n; 2263 m->m_len = optlen + sizeof (struct ip); 2264 m->m_data += max_linkhdr; 2265 (void) memcpy(mtod(m, void *), ip, sizeof (struct ip)); 2266 } else { 2267 m->m_data -= optlen; 2268 m->m_len += optlen; 2269 m->m_pkthdr.len += optlen; 2270 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof (struct ip)); 2271 } 2272 ip = mtod(m, struct ip *); 2273 bcopy(p->ipopt_list, ip + 1, optlen); 2274 *phlen = sizeof (struct ip) + optlen; 2275 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); 2276 ip->ip_len += optlen; 2277 return (m); 2278} 2279 2280/* 2281 * Copy options from ip to jp, 2282 * omitting those not copied during fragmentation. 2283 */ 2284static int 2285ip_optcopy(struct ip *ip, struct ip *jp) 2286{ 2287 u_char *cp, *dp; 2288 int opt, optlen, cnt; 2289 2290 cp = (u_char *)(ip + 1); 2291 dp = (u_char *)(jp + 1); 2292 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); 2293 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2294 opt = cp[0]; 2295 if (opt == IPOPT_EOL) 2296 break; 2297 if (opt == IPOPT_NOP) { 2298 /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ 2299 *dp++ = IPOPT_NOP; 2300 optlen = 1; 2301 continue; 2302 } 2303#if DIAGNOSTIC 2304 if (cnt < IPOPT_OLEN + sizeof (*cp)) { 2305 panic("malformed IPv4 option passed to ip_optcopy"); 2306 /* NOTREACHED */ 2307 } 2308#endif 2309 optlen = cp[IPOPT_OLEN]; 2310#if DIAGNOSTIC 2311 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) { 2312 panic("malformed IPv4 option passed to ip_optcopy"); 2313 /* NOTREACHED */ 2314 } 2315#endif 2316 /* bogus lengths should have been caught by ip_dooptions */ 2317 if (optlen > cnt) 2318 optlen = cnt; 2319 if (IPOPT_COPIED(opt)) { 2320 bcopy(cp, dp, optlen); 2321 dp += optlen; 2322 } 2323 } 2324 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 2325 *dp++ = IPOPT_EOL; 2326 return (optlen); 2327} 2328 2329/* 2330 * IP socket option processing. 2331 */ 2332int 2333ip_ctloutput(struct socket *so, struct sockopt *sopt) 2334{ 2335 struct inpcb *inp = sotoinpcb(so); 2336 int error, optval; 2337 2338 error = optval = 0; 2339 if (sopt->sopt_level != IPPROTO_IP) 2340 return (EINVAL); 2341 2342 switch (sopt->sopt_dir) { 2343 case SOPT_SET: 2344 switch (sopt->sopt_name) { 2345#ifdef notyet 2346 case IP_RETOPTS: 2347#endif 2348 case IP_OPTIONS: { 2349 struct mbuf *m; 2350 2351 if (sopt->sopt_valsize > MLEN) { 2352 error = EMSGSIZE; 2353 break; 2354 } 2355 MGET(m, sopt->sopt_p != kernproc ? 
M_WAIT : M_DONTWAIT, 2356 MT_HEADER); 2357 if (m == NULL) { 2358 error = ENOBUFS; 2359 break; 2360 } 2361 m->m_len = sopt->sopt_valsize; 2362 error = sooptcopyin(sopt, mtod(m, char *), 2363 m->m_len, m->m_len); 2364 if (error) 2365 break; 2366 2367 return (ip_pcbopts(sopt->sopt_name, 2368 &inp->inp_options, m)); 2369 } 2370 2371 case IP_TOS: 2372 case IP_TTL: 2373 case IP_RECVOPTS: 2374 case IP_RECVRETOPTS: 2375 case IP_RECVDSTADDR: 2376 case IP_RECVIF: 2377 case IP_RECVTTL: 2378 case IP_RECVPKTINFO: 2379 error = sooptcopyin(sopt, &optval, sizeof (optval), 2380 sizeof (optval)); 2381 if (error) 2382 break; 2383 2384 switch (sopt->sopt_name) { 2385 case IP_TOS: 2386 inp->inp_ip_tos = optval; 2387 break; 2388 2389 case IP_TTL: 2390 inp->inp_ip_ttl = optval; 2391 break; 2392#define OPTSET(bit) \ 2393 if (optval) \ 2394 inp->inp_flags |= bit; \ 2395 else \ 2396 inp->inp_flags &= ~bit; 2397 2398 case IP_RECVOPTS: 2399 OPTSET(INP_RECVOPTS); 2400 break; 2401 2402 case IP_RECVRETOPTS: 2403 OPTSET(INP_RECVRETOPTS); 2404 break; 2405 2406 case IP_RECVDSTADDR: 2407 OPTSET(INP_RECVDSTADDR); 2408 break; 2409 2410 case IP_RECVIF: 2411 OPTSET(INP_RECVIF); 2412 break; 2413 2414 case IP_RECVTTL: 2415 OPTSET(INP_RECVTTL); 2416 break; 2417 2418 case IP_RECVPKTINFO: 2419 OPTSET(INP_PKTINFO); 2420 break; 2421 } 2422 break; 2423#undef OPTSET 2424 2425#if CONFIG_FORCE_OUT_IFP 2426 /* 2427 * Apple private interface, similar to IP_BOUND_IF, except 2428 * that the parameter is a NULL-terminated string containing 2429 * the name of the network interface; an emptry string means 2430 * unbind. Applications are encouraged to use IP_BOUND_IF 2431 * instead, as that is the current "official" API. 
2432 */ 2433 case IP_FORCE_OUT_IFP: { 2434 char ifname[IFNAMSIZ]; 2435 unsigned int ifscope; 2436 2437 /* This option is settable only for IPv4 */ 2438 if (!(inp->inp_vflag & INP_IPV4)) { 2439 error = EINVAL; 2440 break; 2441 } 2442 2443 /* Verify interface name parameter is sane */ 2444 if (sopt->sopt_valsize > sizeof (ifname)) { 2445 error = EINVAL; 2446 break; 2447 } 2448 2449 /* Copy the interface name */ 2450 if (sopt->sopt_valsize != 0) { 2451 error = sooptcopyin(sopt, ifname, 2452 sizeof (ifname), sopt->sopt_valsize); 2453 if (error) 2454 break; 2455 } 2456 2457 if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { 2458 /* Unbind this socket from any interface */ 2459 ifscope = IFSCOPE_NONE; 2460 } else { 2461 ifnet_t ifp; 2462 2463 /* Verify name is NULL terminated */ 2464 if (ifname[sopt->sopt_valsize - 1] != '\0') { 2465 error = EINVAL; 2466 break; 2467 } 2468 2469 /* Bail out if given bogus interface name */ 2470 if (ifnet_find_by_name(ifname, &ifp) != 0) { 2471 error = ENXIO; 2472 break; 2473 } 2474 2475 /* Bind this socket to this interface */ 2476 ifscope = ifp->if_index; 2477 2478 /* 2479 * Won't actually free; since we don't release 2480 * this later, we should do it now. 2481 */ 2482 ifnet_release(ifp); 2483 } 2484 error = inp_bindif(inp, ifscope, NULL); 2485 } 2486 break; 2487#endif /* CONFIG_FORCE_OUT_IFP */ 2488 /* 2489 * Multicast socket options are processed by the in_mcast 2490 * module. 
2491 */ 2492 case IP_MULTICAST_IF: 2493 case IP_MULTICAST_IFINDEX: 2494 case IP_MULTICAST_VIF: 2495 case IP_MULTICAST_TTL: 2496 case IP_MULTICAST_LOOP: 2497 case IP_ADD_MEMBERSHIP: 2498 case IP_DROP_MEMBERSHIP: 2499 case IP_ADD_SOURCE_MEMBERSHIP: 2500 case IP_DROP_SOURCE_MEMBERSHIP: 2501 case IP_BLOCK_SOURCE: 2502 case IP_UNBLOCK_SOURCE: 2503 case IP_MSFILTER: 2504 case MCAST_JOIN_GROUP: 2505 case MCAST_LEAVE_GROUP: 2506 case MCAST_JOIN_SOURCE_GROUP: 2507 case MCAST_LEAVE_SOURCE_GROUP: 2508 case MCAST_BLOCK_SOURCE: 2509 case MCAST_UNBLOCK_SOURCE: 2510 error = inp_setmoptions(inp, sopt); 2511 break; 2512 2513 case IP_PORTRANGE: 2514 error = sooptcopyin(sopt, &optval, sizeof (optval), 2515 sizeof (optval)); 2516 if (error) 2517 break; 2518 2519 switch (optval) { 2520 case IP_PORTRANGE_DEFAULT: 2521 inp->inp_flags &= ~(INP_LOWPORT); 2522 inp->inp_flags &= ~(INP_HIGHPORT); 2523 break; 2524 2525 case IP_PORTRANGE_HIGH: 2526 inp->inp_flags &= ~(INP_LOWPORT); 2527 inp->inp_flags |= INP_HIGHPORT; 2528 break; 2529 2530 case IP_PORTRANGE_LOW: 2531 inp->inp_flags &= ~(INP_HIGHPORT); 2532 inp->inp_flags |= INP_LOWPORT; 2533 break; 2534 2535 default: 2536 error = EINVAL; 2537 break; 2538 } 2539 break; 2540 2541#if IPSEC 2542 case IP_IPSEC_POLICY: { 2543 caddr_t req = NULL; 2544 size_t len = 0; 2545 int priv; 2546 struct mbuf *m; 2547 int optname; 2548 2549 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 2550 break; 2551 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 2552 break; 2553 priv = (proc_suser(sopt->sopt_p) == 0); 2554 if (m) { 2555 req = mtod(m, caddr_t); 2556 len = m->m_len; 2557 } 2558 optname = sopt->sopt_name; 2559 error = ipsec4_set_policy(inp, optname, req, len, priv); 2560 m_freem(m); 2561 break; 2562 } 2563#endif /* IPSEC */ 2564 2565#if TRAFFIC_MGT 2566 case IP_TRAFFIC_MGT_BACKGROUND: { 2567 unsigned background = 0; 2568 2569 error = sooptcopyin(sopt, &background, 2570 sizeof (background), sizeof (background)); 2571 if (error) 2572 break; 2573 2574 if 
(background) { 2575 socket_set_traffic_mgt_flags_locked(so, 2576 TRAFFIC_MGT_SO_BACKGROUND); 2577 } else { 2578 socket_clear_traffic_mgt_flags_locked(so, 2579 TRAFFIC_MGT_SO_BACKGROUND); 2580 } 2581 2582 break; 2583 } 2584#endif /* TRAFFIC_MGT */ 2585 2586 /* 2587 * On a multihomed system, scoped routing can be used to 2588 * restrict the source interface used for sending packets. 2589 * The socket option IP_BOUND_IF binds a particular AF_INET 2590 * socket to an interface such that data sent on the socket 2591 * is restricted to that interface. This is unlike the 2592 * SO_DONTROUTE option where the routing table is bypassed; 2593 * therefore it allows for a greater flexibility and control 2594 * over the system behavior, and does not place any restriction 2595 * on the destination address type (e.g. unicast, multicast, 2596 * or broadcast if applicable) or whether or not the host is 2597 * directly reachable. Note that in the multicast transmit 2598 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over 2599 * IP_BOUND_IF, since the former practically bypasses the 2600 * routing table; in this case, IP_BOUND_IF sets the default 2601 * interface used for sending multicast packets in the absence 2602 * of an explicit multicast transmit interface. 
2603 */ 2604 case IP_BOUND_IF: 2605 /* This option is settable only for IPv4 */ 2606 if (!(inp->inp_vflag & INP_IPV4)) { 2607 error = EINVAL; 2608 break; 2609 } 2610 2611 error = sooptcopyin(sopt, &optval, sizeof (optval), 2612 sizeof (optval)); 2613 2614 if (error) 2615 break; 2616 2617 error = inp_bindif(inp, optval, NULL); 2618 break; 2619 2620 case IP_NO_IFT_CELLULAR: 2621 /* This option is settable only for IPv4 */ 2622 if (!(inp->inp_vflag & INP_IPV4)) { 2623 error = EINVAL; 2624 break; 2625 } 2626 2627 error = sooptcopyin(sopt, &optval, sizeof (optval), 2628 sizeof (optval)); 2629 2630 if (error) 2631 break; 2632 2633 /* once set, it cannot be unset */ 2634 if (!optval && INP_NO_CELLULAR(inp)) { 2635 error = EINVAL; 2636 break; 2637 } 2638 2639 error = so_set_restrictions(so, 2640 SO_RESTRICT_DENY_CELLULAR); 2641 break; 2642 2643 case IP_OUT_IF: 2644 /* This option is not settable */ 2645 error = EINVAL; 2646 break; 2647 2648 default: 2649 error = ENOPROTOOPT; 2650 break; 2651 } 2652 break; 2653 2654 case SOPT_GET: 2655 switch (sopt->sopt_name) { 2656 case IP_OPTIONS: 2657 case IP_RETOPTS: 2658 if (inp->inp_options) { 2659 error = sooptcopyout(sopt, 2660 mtod(inp->inp_options, char *), 2661 inp->inp_options->m_len); 2662 } else { 2663 sopt->sopt_valsize = 0; 2664 } 2665 break; 2666 2667 case IP_TOS: 2668 case IP_TTL: 2669 case IP_RECVOPTS: 2670 case IP_RECVRETOPTS: 2671 case IP_RECVDSTADDR: 2672 case IP_RECVIF: 2673 case IP_RECVTTL: 2674 case IP_PORTRANGE: 2675 case IP_RECVPKTINFO: 2676 switch (sopt->sopt_name) { 2677 2678 case IP_TOS: 2679 optval = inp->inp_ip_tos; 2680 break; 2681 2682 case IP_TTL: 2683 optval = inp->inp_ip_ttl; 2684 break; 2685 2686#define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) 2687 2688 case IP_RECVOPTS: 2689 optval = OPTBIT(INP_RECVOPTS); 2690 break; 2691 2692 case IP_RECVRETOPTS: 2693 optval = OPTBIT(INP_RECVRETOPTS); 2694 break; 2695 2696 case IP_RECVDSTADDR: 2697 optval = OPTBIT(INP_RECVDSTADDR); 2698 break; 2699 2700 case IP_RECVIF: 2701 optval = OPTBIT(INP_RECVIF); 2702 break; 2703 2704 case IP_RECVTTL: 2705 optval = OPTBIT(INP_RECVTTL); 2706 break; 2707 2708 case IP_PORTRANGE: 2709 if (inp->inp_flags & INP_HIGHPORT) 2710 optval = IP_PORTRANGE_HIGH; 2711 else if (inp->inp_flags & INP_LOWPORT) 2712 optval = IP_PORTRANGE_LOW; 2713 else 2714 optval = 0; 2715 break; 2716 2717 case IP_RECVPKTINFO: 2718 optval = OPTBIT(INP_PKTINFO); 2719 break; 2720 } 2721 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2722 break; 2723 2724 case IP_MULTICAST_IF: 2725 case IP_MULTICAST_IFINDEX: 2726 case IP_MULTICAST_VIF: 2727 case IP_MULTICAST_TTL: 2728 case IP_MULTICAST_LOOP: 2729 case IP_MSFILTER: 2730 error = inp_getmoptions(inp, sopt); 2731 break; 2732 2733#if IPSEC 2734 case IP_IPSEC_POLICY: { 2735 error = 0; /* This option is no longer supported */ 2736 break; 2737 } 2738#endif /* IPSEC */ 2739 2740#if TRAFFIC_MGT 2741 case IP_TRAFFIC_MGT_BACKGROUND: { 2742 unsigned background = (so->so_traffic_mgt_flags & 2743 TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0; 2744 return (sooptcopyout(sopt, &background, 2745 sizeof (background))); 2746 break; 2747 } 2748#endif /* TRAFFIC_MGT */ 2749 2750 case IP_BOUND_IF: 2751 if (inp->inp_flags & INP_BOUND_IF) 2752 optval = inp->inp_boundifp->if_index; 2753 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2754 break; 2755 2756 case IP_NO_IFT_CELLULAR: 2757 optval = INP_NO_CELLULAR(inp) ? 1 : 0; 2758 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2759 break; 2760 2761 case IP_OUT_IF: 2762 optval = (inp->inp_last_outifp != NULL) ? 
2763 inp->inp_last_outifp->if_index : 0; 2764 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2765 break; 2766 2767 default: 2768 error = ENOPROTOOPT; 2769 break; 2770 } 2771 break; 2772 } 2773 return (error); 2774} 2775 2776/* 2777 * Set up IP options in pcb for insertion in output packets. 2778 * Store in mbuf with pointer in pcbopt, adding pseudo-option 2779 * with destination address if source routed. 2780 */ 2781static int 2782ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m) 2783{ 2784#pragma unused(optname) 2785 int cnt, optlen; 2786 u_char *cp; 2787 u_char opt; 2788 2789 /* turn off any old options */ 2790 if (*pcbopt) 2791 (void) m_free(*pcbopt); 2792 *pcbopt = 0; 2793 if (m == (struct mbuf *)0 || m->m_len == 0) { 2794 /* 2795 * Only turning off any previous options. 2796 */ 2797 if (m) 2798 (void) m_free(m); 2799 return (0); 2800 } 2801 2802 if (m->m_len % sizeof (int32_t)) 2803 goto bad; 2804 2805 /* 2806 * IP first-hop destination address will be stored before 2807 * actual options; move other options back 2808 * and clear it when none present. 
2809 */ 2810 if (m->m_data + m->m_len + sizeof (struct in_addr) >= &m->m_dat[MLEN]) 2811 goto bad; 2812 cnt = m->m_len; 2813 m->m_len += sizeof (struct in_addr); 2814 cp = mtod(m, u_char *) + sizeof (struct in_addr); 2815 ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); 2816 bzero(mtod(m, caddr_t), sizeof (struct in_addr)); 2817 2818 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2819 opt = cp[IPOPT_OPTVAL]; 2820 if (opt == IPOPT_EOL) 2821 break; 2822 if (opt == IPOPT_NOP) 2823 optlen = 1; 2824 else { 2825 if (cnt < IPOPT_OLEN + sizeof (*cp)) 2826 goto bad; 2827 optlen = cp[IPOPT_OLEN]; 2828 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) 2829 goto bad; 2830 } 2831 switch (opt) { 2832 2833 default: 2834 break; 2835 2836 case IPOPT_LSRR: 2837 case IPOPT_SSRR: 2838 /* 2839 * user process specifies route as: 2840 * ->A->B->C->D 2841 * D must be our final destination (but we can't 2842 * check that since we may not have connected yet). 2843 * A is first hop destination, which doesn't appear in 2844 * actual IP option, but is stored before the options. 2845 */ 2846 if (optlen < IPOPT_MINOFF - 1 + sizeof (struct in_addr)) 2847 goto bad; 2848 m->m_len -= sizeof (struct in_addr); 2849 cnt -= sizeof (struct in_addr); 2850 optlen -= sizeof (struct in_addr); 2851 cp[IPOPT_OLEN] = optlen; 2852 /* 2853 * Move first hop before start of options. 2854 */ 2855 bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), 2856 sizeof (struct in_addr)); 2857 /* 2858 * Then copy rest of options back 2859 * to close up the deleted entry. 
2860 */ 2861 ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + 2862 sizeof (struct in_addr)), 2863 (caddr_t)&cp[IPOPT_OFFSET+1], 2864 (unsigned)cnt + sizeof (struct in_addr)); 2865 break; 2866 } 2867 } 2868 if (m->m_len > MAX_IPOPTLEN + sizeof (struct in_addr)) 2869 goto bad; 2870 *pcbopt = m; 2871 return (0); 2872 2873bad: 2874 (void) m_free(m); 2875 return (EINVAL); 2876} 2877 2878void 2879ip_moptions_init(void) 2880{ 2881 PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug)); 2882 2883 imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) : 2884 sizeof (struct ip_moptions_dbg); 2885 2886 imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0, 2887 IMO_ZONE_NAME); 2888 if (imo_zone == NULL) { 2889 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME); 2890 /* NOTREACHED */ 2891 } 2892 zone_change(imo_zone, Z_EXPAND, TRUE); 2893} 2894 2895void 2896imo_addref(struct ip_moptions *imo, int locked) 2897{ 2898 if (!locked) 2899 IMO_LOCK(imo); 2900 else 2901 IMO_LOCK_ASSERT_HELD(imo); 2902 2903 if (++imo->imo_refcnt == 0) { 2904 panic("%s: imo %p wraparound refcnt\n", __func__, imo); 2905 /* NOTREACHED */ 2906 } else if (imo->imo_trace != NULL) { 2907 (*imo->imo_trace)(imo, TRUE); 2908 } 2909 2910 if (!locked) 2911 IMO_UNLOCK(imo); 2912} 2913 2914void 2915imo_remref(struct ip_moptions *imo) 2916{ 2917 int i; 2918 2919 IMO_LOCK(imo); 2920 if (imo->imo_refcnt == 0) { 2921 panic("%s: imo %p negative refcnt", __func__, imo); 2922 /* NOTREACHED */ 2923 } else if (imo->imo_trace != NULL) { 2924 (*imo->imo_trace)(imo, FALSE); 2925 } 2926 2927 --imo->imo_refcnt; 2928 if (imo->imo_refcnt > 0) { 2929 IMO_UNLOCK(imo); 2930 return; 2931 } 2932 2933 for (i = 0; i < imo->imo_num_memberships; ++i) { 2934 struct in_mfilter *imf; 2935 2936 imf = imo->imo_mfilters ? 
&imo->imo_mfilters[i] : NULL; 2937 if (imf != NULL) 2938 imf_leave(imf); 2939 2940 (void) in_leavegroup(imo->imo_membership[i], imf); 2941 2942 if (imf != NULL) 2943 imf_purge(imf); 2944 2945 INM_REMREF(imo->imo_membership[i]); 2946 imo->imo_membership[i] = NULL; 2947 } 2948 imo->imo_num_memberships = 0; 2949 if (imo->imo_mfilters != NULL) { 2950 FREE(imo->imo_mfilters, M_INMFILTER); 2951 imo->imo_mfilters = NULL; 2952 } 2953 if (imo->imo_membership != NULL) { 2954 FREE(imo->imo_membership, M_IPMOPTS); 2955 imo->imo_membership = NULL; 2956 } 2957 IMO_UNLOCK(imo); 2958 2959 lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp); 2960 2961 if (!(imo->imo_debug & IFD_ALLOC)) { 2962 panic("%s: imo %p cannot be freed", __func__, imo); 2963 /* NOTREACHED */ 2964 } 2965 zfree(imo_zone, imo); 2966} 2967 2968static void 2969imo_trace(struct ip_moptions *imo, int refhold) 2970{ 2971 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo; 2972 ctrace_t *tr; 2973 u_int32_t idx; 2974 u_int16_t *cnt; 2975 2976 if (!(imo->imo_debug & IFD_DEBUG)) { 2977 panic("%s: imo %p has no debug structure", __func__, imo); 2978 /* NOTREACHED */ 2979 } 2980 if (refhold) { 2981 cnt = &imo_dbg->imo_refhold_cnt; 2982 tr = imo_dbg->imo_refhold; 2983 } else { 2984 cnt = &imo_dbg->imo_refrele_cnt; 2985 tr = imo_dbg->imo_refrele; 2986 } 2987 2988 idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE; 2989 ctrace_record(&tr[idx]); 2990} 2991 2992struct ip_moptions * 2993ip_allocmoptions(int how) 2994{ 2995 struct ip_moptions *imo; 2996 2997 imo = (how == M_WAITOK) ? 
zalloc(imo_zone) : zalloc_noblock(imo_zone); 2998 if (imo != NULL) { 2999 bzero(imo, imo_size); 3000 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr); 3001 imo->imo_debug |= IFD_ALLOC; 3002 if (imo_debug != 0) { 3003 imo->imo_debug |= IFD_DEBUG; 3004 imo->imo_trace = imo_trace; 3005 } 3006 IMO_ADDREF(imo); 3007 } 3008 3009 return (imo); 3010} 3011 3012/* 3013 * Routine called from ip_output() to loop back a copy of an IP multicast 3014 * packet to the input queue of a specified interface. Note that this 3015 * calls the output routine of the loopback "driver", but with an interface 3016 * pointer that might NOT be a loopback interface -- evil, but easier than 3017 * replicating that code here. 3018 */ 3019static void 3020ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m, 3021 struct sockaddr_in *dst, int hlen) 3022{ 3023 struct mbuf *copym; 3024 struct ip *ip; 3025 3026 if (lo_ifp == NULL) 3027 return; 3028 3029 /* 3030 * Copy the packet header as it's needed for the checksum 3031 * Make sure to deep-copy IP header portion in case the data 3032 * is in an mbuf cluster, so that we can safely override the IP 3033 * header portion later. 3034 */ 3035 copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR); 3036 if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) 3037 copym = m_pullup(copym, hlen); 3038 3039 if (copym == NULL) 3040 return; 3041 3042 /* 3043 * We don't bother to fragment if the IP length is greater 3044 * than the interface's MTU. Can this possibly matter? 3045 */ 3046 ip = mtod(copym, struct ip *); 3047#if BYTE_ORDER != BIG_ENDIAN 3048 HTONS(ip->ip_len); 3049 HTONS(ip->ip_off); 3050#endif 3051 ip->ip_sum = 0; 3052 ip->ip_sum = ip_cksum_hdr_out(copym, hlen); 3053 3054 /* 3055 * Mark checksum as valid unless receive checksum offload is 3056 * disabled; if so, compute checksum in software. If the 3057 * interface itself is lo0, this will be overridden by if_loop. 
3058 */ 3059 if (hwcksum_rx) { 3060 copym->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; 3061 copym->m_pkthdr.csum_flags |= 3062 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 3063 copym->m_pkthdr.csum_data = 0xffff; 3064 } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 3065#if BYTE_ORDER != BIG_ENDIAN 3066 NTOHS(ip->ip_len); 3067#endif 3068 in_delayed_cksum(copym); 3069#if BYTE_ORDER != BIG_ENDIAN 3070 HTONS(ip->ip_len); 3071#endif 3072 } 3073 3074 /* 3075 * Stuff the 'real' ifp into the pkthdr, to be used in matching 3076 * in ip_input(); we need the loopback ifp/dl_tag passed as args 3077 * to make the loopback driver compliant with the data link 3078 * requirements. 3079 */ 3080 copym->m_pkthdr.rcvif = origifp; 3081 3082 /* 3083 * Also record the source interface (which owns the source address). 3084 * This is basically a stripped down version of ifa_foraddr(). 3085 */ 3086 if (srcifp == NULL) { 3087 struct in_ifaddr *ia; 3088 3089 lck_rw_lock_shared(in_ifaddr_rwlock); 3090 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) { 3091 IFA_LOCK_SPIN(&ia->ia_ifa); 3092 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) { 3093 srcifp = ia->ia_ifp; 3094 IFA_UNLOCK(&ia->ia_ifa); 3095 break; 3096 } 3097 IFA_UNLOCK(&ia->ia_ifa); 3098 } 3099 lck_rw_done(in_ifaddr_rwlock); 3100 } 3101 if (srcifp != NULL) 3102 ip_setsrcifaddr_info(copym, srcifp->if_index, NULL); 3103 ip_setdstifaddr_info(copym, origifp->if_index, NULL); 3104 3105 dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL); 3106} 3107 3108/* 3109 * Given a source IP address (and route, if available), determine the best 3110 * interface to send the packet from. Checking for (and updating) the 3111 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done 3112 * without any locks based on the assumption that ip_output() is single- 3113 * threaded per-pcb, i.e. for any given pcb there can only be one thread 3114 * performing output at the IP layer. 
3115 * 3116 * This routine is analogous to in6_selectroute() for IPv6. 3117 */ 3118static struct ifaddr * 3119in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) 3120{ 3121 struct ifaddr *ifa = NULL; 3122 struct in_addr src = ip->ip_src; 3123 struct in_addr dst = ip->ip_dst; 3124 struct ifnet *rt_ifp; 3125 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN]; 3126 3127 VERIFY(src.s_addr != INADDR_ANY); 3128 3129 if (ip_select_srcif_debug) { 3130 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src)); 3131 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst)); 3132 } 3133 3134 if (ro->ro_rt != NULL) 3135 RT_LOCK(ro->ro_rt); 3136 3137 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; 3138 3139 /* 3140 * Given the source IP address, find a suitable source interface 3141 * to use for transmission; if the caller has specified a scope, 3142 * optimize the search by looking at the addresses only for that 3143 * interface. This is still suboptimal, however, as we need to 3144 * traverse the per-interface list. 3145 */ 3146 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) { 3147 unsigned int scope = ifscope; 3148 3149 /* 3150 * If no scope is specified and the route is stale (pointing 3151 * to a defunct interface) use the current primary interface; 3152 * this happens when switching between interfaces configured 3153 * with the same IP address. Otherwise pick up the scope 3154 * information from the route; the ULP may have looked up a 3155 * correct route and we just need to verify it here and mark 3156 * it with the ROF_SRCIF_SELECTED flag below. 
3157 */ 3158 if (scope == IFSCOPE_NONE) { 3159 scope = rt_ifp->if_index; 3160 if (scope != get_primary_ifscope(AF_INET) && 3161 ROUTE_UNUSABLE(ro)) 3162 scope = get_primary_ifscope(AF_INET); 3163 } 3164 3165 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope); 3166 3167 if (ifa == NULL && ip->ip_p != IPPROTO_UDP && 3168 ip->ip_p != IPPROTO_TCP && ipforwarding) { 3169 /* 3170 * If forwarding is enabled, and if the packet isn't 3171 * TCP or UDP, check if the source address belongs 3172 * to one of our own interfaces; if so, demote the 3173 * interface scope and do a route lookup right below. 3174 */ 3175 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); 3176 if (ifa != NULL) { 3177 IFA_REMREF(ifa); 3178 ifa = NULL; 3179 ifscope = IFSCOPE_NONE; 3180 } 3181 } 3182 3183 if (ip_select_srcif_debug && ifa != NULL) { 3184 if (ro->ro_rt != NULL) { 3185 printf("%s->%s ifscope %d->%d ifa_if %s " 3186 "ro_if %s\n", s_src, s_dst, ifscope, 3187 scope, if_name(ifa->ifa_ifp), 3188 if_name(rt_ifp)); 3189 } else { 3190 printf("%s->%s ifscope %d->%d ifa_if %s\n", 3191 s_src, s_dst, ifscope, scope, 3192 if_name(ifa->ifa_ifp)); 3193 } 3194 } 3195 } 3196 3197 /* 3198 * Slow path; search for an interface having the corresponding source 3199 * IP address if the scope was not specified by the caller, and: 3200 * 3201 * 1) There currently isn't any route, or, 3202 * 2) The interface used by the route does not own that source 3203 * IP address; in this case, the route will get blown away 3204 * and we'll do a more specific scoped search using the newly 3205 * found interface. 3206 */ 3207 if (ifa == NULL && ifscope == IFSCOPE_NONE) { 3208 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); 3209 3210 /* 3211 * If we have the IP address, but not the route, we don't 3212 * really know whether or not it belongs to the correct 3213 * interface (it could be shared across multiple interfaces.) 3214 * The only way to find out is to do a route lookup. 
3215 */ 3216 if (ifa != NULL && ro->ro_rt == NULL) { 3217 struct rtentry *rt; 3218 struct sockaddr_in sin; 3219 struct ifaddr *oifa = NULL; 3220 3221 bzero(&sin, sizeof (sin)); 3222 sin.sin_family = AF_INET; 3223 sin.sin_len = sizeof (sin); 3224 sin.sin_addr = dst; 3225 3226 lck_mtx_lock(rnh_lock); 3227 if ((rt = rt_lookup(TRUE, SA(&sin), NULL, 3228 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) { 3229 RT_LOCK(rt); 3230 /* 3231 * If the route uses a different interface, 3232 * use that one instead. The IP address of 3233 * the ifaddr that we pick up here is not 3234 * relevant. 3235 */ 3236 if (ifa->ifa_ifp != rt->rt_ifp) { 3237 oifa = ifa; 3238 ifa = rt->rt_ifa; 3239 IFA_ADDREF(ifa); 3240 RT_UNLOCK(rt); 3241 } else { 3242 RT_UNLOCK(rt); 3243 } 3244 rtfree_locked(rt); 3245 } 3246 lck_mtx_unlock(rnh_lock); 3247 3248 if (oifa != NULL) { 3249 struct ifaddr *iifa; 3250 3251 /* 3252 * See if the interface pointed to by the 3253 * route is configured with the source IP 3254 * address of the packet. 3255 */ 3256 iifa = (struct ifaddr *)ifa_foraddr_scoped( 3257 src.s_addr, ifa->ifa_ifp->if_index); 3258 3259 if (iifa != NULL) { 3260 /* 3261 * Found it; drop the original one 3262 * as well as the route interface 3263 * address, and use this instead. 3264 */ 3265 IFA_REMREF(oifa); 3266 IFA_REMREF(ifa); 3267 ifa = iifa; 3268 } else if (!ipforwarding || 3269 (rt->rt_flags & RTF_GATEWAY)) { 3270 /* 3271 * This interface doesn't have that 3272 * source IP address; drop the route 3273 * interface address and just use the 3274 * original one, and let the caller 3275 * do a scoped route lookup. 
3276 */ 3277 IFA_REMREF(ifa); 3278 ifa = oifa; 3279 } else { 3280 /* 3281 * Forwarding is enabled and the source 3282 * address belongs to one of our own 3283 * interfaces which isn't the outgoing 3284 * interface, and we have a route, and 3285 * the destination is on a network that 3286 * is directly attached (onlink); drop 3287 * the original one and use the route 3288 * interface address instead. 3289 */ 3290 IFA_REMREF(oifa); 3291 } 3292 } 3293 } else if (ifa != NULL && ro->ro_rt != NULL && 3294 !(ro->ro_rt->rt_flags & RTF_GATEWAY) && 3295 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) { 3296 /* 3297 * Forwarding is enabled and the source address belongs 3298 * to one of our own interfaces which isn't the same 3299 * as the interface used by the known route; drop the 3300 * original one and use the route interface address. 3301 */ 3302 IFA_REMREF(ifa); 3303 ifa = ro->ro_rt->rt_ifa; 3304 IFA_ADDREF(ifa); 3305 } 3306 3307 if (ip_select_srcif_debug && ifa != NULL) { 3308 printf("%s->%s ifscope %d ifa_if %s\n", 3309 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); 3310 } 3311 } 3312 3313 if (ro->ro_rt != NULL) 3314 RT_LOCK_ASSERT_HELD(ro->ro_rt); 3315 /* 3316 * If there is a non-loopback route with the wrong interface, or if 3317 * there is no interface configured with such an address, blow it 3318 * away. Except for local/loopback, we look for one with a matching 3319 * interface scope/index. 
3320 */ 3321 if (ro->ro_rt != NULL && 3322 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) || 3323 !(ro->ro_rt->rt_flags & RTF_UP))) { 3324 if (ip_select_srcif_debug) { 3325 if (ifa != NULL) { 3326 printf("%s->%s ifscope %d ro_if %s != " 3327 "ifa_if %s (cached route cleared)\n", 3328 s_src, s_dst, ifscope, if_name(rt_ifp), 3329 if_name(ifa->ifa_ifp)); 3330 } else { 3331 printf("%s->%s ifscope %d ro_if %s " 3332 "(no ifa_if found)\n", 3333 s_src, s_dst, ifscope, if_name(rt_ifp)); 3334 } 3335 } 3336 3337 RT_UNLOCK(ro->ro_rt); 3338 ROUTE_RELEASE(ro); 3339 3340 /* 3341 * If the destination is IPv4 LLA and the route's interface 3342 * doesn't match the source interface, then the source IP 3343 * address is wrong; it most likely belongs to the primary 3344 * interface associated with the IPv4 LL subnet. Drop the 3345 * packet rather than letting it go out and return an error 3346 * to the ULP. This actually applies not only to IPv4 LL 3347 * but other shared subnets; for now we explicitly test only 3348 * for the former case and save the latter for future. 3349 */ 3350 if (IN_LINKLOCAL(ntohl(dst.s_addr)) && 3351 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) { 3352 IFA_REMREF(ifa); 3353 ifa = NULL; 3354 } 3355 } 3356 3357 if (ip_select_srcif_debug && ifa == NULL) { 3358 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n", 3359 s_src, s_dst, ifscope); 3360 } 3361 3362 /* 3363 * If there is a route, mark it accordingly. If there isn't one, 3364 * we'll get here again during the next transmit (possibly with a 3365 * route) and the flag will get set at that point. For IPv4 LLA 3366 * destination, mark it only if the route has been fully resolved; 3367 * otherwise we want to come back here again when the route points 3368 * to the interface over which the ARP reply arrives on. 
3369 */ 3370 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) || 3371 (ro->ro_rt->rt_gateway->sa_family == AF_LINK && 3372 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) { 3373 if (ifa != NULL) 3374 IFA_ADDREF(ifa); /* for route */ 3375 if (ro->ro_srcia != NULL) 3376 IFA_REMREF(ro->ro_srcia); 3377 ro->ro_srcia = ifa; 3378 ro->ro_flags |= ROF_SRCIF_SELECTED; 3379 RT_GENID_SYNC(ro->ro_rt); 3380 } 3381 3382 if (ro->ro_rt != NULL) 3383 RT_UNLOCK(ro->ro_rt); 3384 3385 return (ifa); 3386} 3387 3388void 3389ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len, 3390 uint32_t *sw_csum) 3391{ 3392 int tso = TSO_IPV4_OK(ifp, m); 3393 uint32_t hwcap = ifp->if_hwassist; 3394 3395 m->m_pkthdr.csum_flags |= CSUM_IP; 3396 3397 if (!hwcksum_tx) { 3398 /* do all in software; hardware checksum offload is disabled */ 3399 *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) & 3400 m->m_pkthdr.csum_flags; 3401 } else { 3402 /* do in software what the hardware cannot */ 3403 *sw_csum = m->m_pkthdr.csum_flags & 3404 ~IF_HWASSIST_CSUM_FLAGS(hwcap); 3405 } 3406 3407 if (hlen != sizeof (struct ip)) { 3408 *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) & 3409 m->m_pkthdr.csum_flags); 3410 } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) { 3411 /* 3412 * Partial checksum offload, if non-IP fragment, and TCP only 3413 * (no UDP support, as the hardware may not be able to convert 3414 * +0 to -0 (0xffff) per RFC1122 4.1.3.4.) 
3415 */ 3416 if (hwcksum_tx && !tso && 3417 (m->m_pkthdr.csum_flags & CSUM_TCP) && 3418 ip_len <= ifp->if_mtu) { 3419 uint16_t start = sizeof (struct ip); 3420 uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff; 3421 m->m_pkthdr.csum_flags |= 3422 (CSUM_DATA_VALID | CSUM_PARTIAL); 3423 m->m_pkthdr.csum_tx_stuff = (ulpoff + start); 3424 m->m_pkthdr.csum_tx_start = start; 3425 /* do IP hdr chksum in software */ 3426 *sw_csum = CSUM_DELAY_IP; 3427 } else { 3428 *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); 3429 } 3430 } 3431 3432 if (*sw_csum & CSUM_DELAY_DATA) { 3433 in_delayed_cksum(m); 3434 *sw_csum &= ~CSUM_DELAY_DATA; 3435 } 3436 3437 if (hwcksum_tx) { 3438 /* 3439 * Drop off bits that aren't supported by hardware; 3440 * also make sure to preserve non-checksum related bits. 3441 */ 3442 m->m_pkthdr.csum_flags = 3443 ((m->m_pkthdr.csum_flags & 3444 (IF_HWASSIST_CSUM_FLAGS(hwcap) | CSUM_DATA_VALID)) | 3445 (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK)); 3446 } else { 3447 /* drop all bits; hardware checksum offload is disabled */ 3448 m->m_pkthdr.csum_flags = 0; 3449 } 3450} 3451 3452/* 3453 * GRE protocol output for PPP/PPTP 3454 */ 3455int 3456ip_gre_output(struct mbuf *m) 3457{ 3458 struct route ro; 3459 int error; 3460 3461 bzero(&ro, sizeof (ro)); 3462 3463 error = ip_output(m, NULL, &ro, 0, NULL, NULL); 3464 3465 ROUTE_RELEASE(&ro); 3466 3467 return (error); 3468} 3469